Automating

Putting it all together and getting organized

|
`-- data-raw/
|   |
|   `-- Afghanistan-gdp-percapita.csv
|   `-- Albania-gdp-percapita.csv
|   `-- ....
|
`-- data-output/
|
`-- fig/
|
`-- R/
|   |
|    `-- figures.R
|    `-- data.R
|    `-- utils.R
|    `-- dependencies.R
|
`-- tests/
|
`-- manuscript.Rmd
`-- make.R

Tests

testthat has a convenient function called test_dir that will run tests included in files in a given directory. We can use it to run all the tests in our tests/ folder.

test_dir("tests/")

Let's turn it into a function, so we'll be able to add some additional functionalities to it a little later. We are also going to save it at the root of our working directory in the file called make.R:

## add this to make.R
make_tests <- function() {
    test_dir("tests/")
}

Figures

This is the code to generate the two figures in the manuscript:

## add this to R/figure.R
plot_summary_lifeExp_by_continent <- function(mean_lifeExp) {
    ggplot(mean_lifeExp, aes(x = year, y = mean_lifeExp, colour = continent)) +
      geom_line() + facet_wrap(~ continent) + theme(legend.position = "top")
}

plot_change_trend <- function(mean_lifeExp, year_break) {
    tmp_data <- get_coef_before_after(mean_lifeExp, year_break)
    ggplot(tmp_data, aes(x = period, y = trend, colour = continent, group = continent)) +
      geom_point() + geom_path()
}
## -----

Generate PDFs

This is the code to generate PDF files from figures, and the two figures as PDF files:

## add this to make.R
make_figures <- function(path = "fig", ...) {
    make_summary_by_continent(path = path, ...)
    make_change_trend(path = path, ...)
}

make_summary_by_continent <- function(path = "fig", ...) {
    mean_lifeExp <- get_mean_lifeExp(gather_gdp_data())
    p <- plot_summary_lifeExp_by_continent(mean_lifeExp)
    make_pdf(print(p), file = file.path(path, "summary_by_continent.pdf"), ...)
}

make_change_trend <- function(path = "fig", year = 1980, ...) {
    mean_lifeExp <- get_mean_lifeExp(gather_gdp_data())
    p <- plot_change_trend(mean_lifeExp, year = year)
    make_pdf(print(p), file = file.path(path, "change_trend.pdf"), ...)
}
## -----

Data

This is the code that generates the intermediate datasets:

## add this to R/data.R
gather_gdp_data <- function(path = "data-raw") {
    split_gdp_files <- list.files(path = path, pattern = "gdp-percapita\\.csv$", full.names = TRUE)

    split_gdp_list <- lapply(split_gdp_files, read.csv)
    gdp <- do.call("rbind", split_gdp_list)
    gdp
}

get_mean_lifeExp <- function(gdp) {
    mean_lifeExp_by_cont <- gdp %>% group_by(continent, year) %>%
      summarize(mean_lifeExp = mean(lifeExp)) %>% as.data.frame
    mean_lifeExp_by_cont
}

get_latest_lifeExp <- function(gdp) {
    latest_lifeExp <- gdp %>% filter(year == max(gdp$year)) %>%
      group_by(continent) %>%
      summarize(latest_lifeExp = mean(lifeExp)) %>%
      as.data.frame
    latest_lifeExp
}

get_coef_before_after <- function(mean_lifeExp, year_break) {
    coef_before_after <- lapply(unique(mean_lifeExp$continent), function(cont) {
                                    mdl_before <- lm(mean_lifeExp ~ year,
                                                     data = mean_lifeExp,
                                                     subset = (continent == cont & year <= year_break))
                                    mdl_after  <- lm(mean_lifeExp ~ year,
                                                     data = mean_lifeExp,
                                                     subset = (continent == cont & year > year_break))
                                    rbind(c(as.character(cont), "before", coef(mdl_before)[2]),
                                          c(as.character(cont), "after", coef(mdl_after)[2]))
                                }) %>%
      do.call("rbind", .) %>% as.data.frame %>%
      setNames(c("continent", "period", "trend"))
    coef_before_after$trend <- as.numeric(levels(coef_before_after$trend)[coef_before_after$trend])
    coef_before_after$period <- factor(coef_before_after$period, levels = c("before", "after"))
    coef_before_after
}
## -----

Generate CSVs

This is the code to generate the CSV files that contain the intermediate datasets that are needed to draw the figures. The function make_data generates both datasets at once.

## add this to make.R
make_data <- function(path = "data-output", verbose = TRUE) {
    make_gdp_data(path)
    make_mean_lifeExp_data()
}

make_gdp_data <- function(path = "data-output") {
    gdp <- gather_gdp_data()
    make_csv(gdp, file = file.path(path, "gdp.csv"))
}

make_mean_lifeExp_data <- function(path = "data-output") {
    gdp <- gather_gdp_data()
    make_csv(get_mean_lifeExp(gdp), file = file.path(path, "mean_lifeExp.csv"))
}
## -----

Cleaning

The only way to ensure that your analysis is reproducible is to delete all the intermediate and final products to make sure your functions can recreate everything from the raw data and your code.

Having the figures and the intermediate data files isolated in their own folders in your working directory will allow you to make sure you only delete these generated figures, and none of the original data.

## add this to make.R
clean_data <- function(path = "data-output") {
    to_rm <- list.files(path = path, pattern = "csv$", full.names = TRUE)
    res <- file.remove(to_rm)
    invisible(res)
}

clean_figures <- function(path = "fig") {
    to_rm <- list.files(path = path, pattern = "pdf$", full.names = TRUE)
    res <- file.remove(to_rm)
    invisible(res)
}
## -----

Make everything

These are wrapper functions to generate/delete everything:

## add this to make.R
make_ms <- function() {
    rmarkdown::render("manuscript.Rmd", "html_document")
    invisible(file.exists("manuscript.html"))
}

clean_ms <- function() {
    res <- file.remove("manuscript.html")
    invisible(res)
}

make_all <- function() {
    make_data()
    make_figures()
    make_tests()
    make_ms()
}

clean_all <- function() {
    clean_data()
    clean_figures()
    clean_ms()
}
## -----

More comprehensive

and before we continue, we are going to replace the make_tests function with something a little more comprehensive:

## add this to make.R
make_tests <- function() {
    if (require(testthat)) {
        p <- test_dir("tests/")
        if (!interactive() && any(p$failed)) {
            q("no", status = 1, FALSE)
        }
    } else {
        message("skipped the tests, testthat not available.")
        return(NULL)
    }
}
## -----