3rd June 2016
David Robinson summarized the goal on his laptop

see also what Karl Broman is recommanding for people who learnt R a while ago

source: David Robinson check out David's broom presentation
Hadley Wickham and Wes McKinney recently released feather, a new python / R project.
It rapidly stores dataframes as binary files and preserves column types.

Tutorial based on the great conference by Hadley Wickham
progress bar will be added

Using purrr and tidyr. Hadley is focusing on every part of R to clean it up.
purrr revisits the apply family in a consistent way. tidyr::nest nests list in tibble::data_frame to keep related things together.
compare (notice seq_along instead of 1:length(mtcars))
means <- vector("double", ncol(mtcars))
for (i in seq_along(mtcars)) {
means[i] <- mean(mtcars[[i]])
}
means
## [1] 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 ## [7] 17.848750 0.437500 0.406250 3.687500 2.812500
and
library("purrr")
map_dbl(mtcars, mean)
## mpg cyl disp hp drat wt ## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 ## qsec vs am gear carb ## 17.848750 0.437500 0.406250 3.687500 2.812500
library("purrr")
library("dplyr", warn.conflicts = FALSE)
funs <- list(mean = mean, median = median, sd = sd)
funs %>%
map(~ mtcars %>% map_dbl(.x))
## $mean ## mpg cyl disp hp drat wt ## 20.090625 6.187500 230.721875 146.687500 3.596563 3.217250 ## qsec vs am gear carb ## 17.848750 0.437500 0.406250 3.687500 2.812500 ## ## $median ## mpg cyl disp hp drat wt qsec vs am ## 19.200 6.000 196.300 123.000 3.695 3.325 17.710 0.000 0.000 ## gear carb ## 4.000 2.000 ## ## $sd ## mpg cyl disp hp drat wt ## 6.0269481 1.7859216 123.9386938 68.5628685 0.5346787 0.9784574 ## qsec vs am gear carb ## 1.7869432 0.5040161 0.4989909 0.7378041 1.6152000
Linear model per country
library("gapminder")
library("tidyr")
by_country_lm <- gapminder %>%
mutate(year1950 = year - 1950) %>%
group_by(continent, country) %>%
nest() %>%
mutate(model = map(data, ~ lm(lifeExp ~ year1950, data = .x)))
by_country_lm
## Source: local data frame [142 x 4] ## ## continent country data model ## (fctr) (fctr) (chr) (chr) ## 1 Asia Afghanistan <tbl_df [12,5]> <S3:lm> ## 2 Europe Albania <tbl_df [12,5]> <S3:lm> ## 3 Africa Algeria <tbl_df [12,5]> <S3:lm> ## 4 Africa Angola <tbl_df [12,5]> <S3:lm> ## 5 Americas Argentina <tbl_df [12,5]> <S3:lm> ## 6 Oceania Australia <tbl_df [12,5]> <S3:lm> ## 7 Europe Austria <tbl_df [12,5]> <S3:lm> ## 8 Asia Bahrain <tbl_df [12,5]> <S3:lm> ## 9 Asia Bangladesh <tbl_df [12,5]> <S3:lm> ## 10 Europe Belgium <tbl_df [12,5]> <S3:lm> ## .. ... ... ... ...

Use broom to extract, as neat data frames out of lm():
library("broom")
models <- by_country_lm %>%
mutate(glance = map(model, glance),
rsq = glance %>% map_dbl("r.squared"),
tidy = map(model, tidy),
augment = map(model, augment))
models
## Source: local data frame [142 x 8] ## ## continent country data model glance ## (fctr) (fctr) (chr) (chr) (chr) ## 1 Asia Afghanistan <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 2 Europe Albania <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 3 Africa Algeria <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 4 Africa Angola <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 5 Americas Argentina <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 6 Oceania Australia <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 7 Europe Austria <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 8 Asia Bahrain <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 9 Asia Bangladesh <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## 10 Europe Belgium <tbl_df [12,5]> <S3:lm> <data.frame [1,11]> ## .. ... ... ... ... ... ## Variables not shown: rsq (dbl), tidy (chr), augment (chr)
library("ggplot2")
theme_set(theme_bw(14))
models %>%
ggplot(aes(x = rsq, y = reorder(country, rsq)))+
geom_point(aes(colour = continent))+
theme(axis.text.y = element_text(size = 6))
models %>% filter(rsq < 0.55) %>% unnest(data) %>% ggplot(aes(x = year, y = lifeExp))+ geom_line(aes(colour = continent))+ facet_wrap(~ country)
library("shiny")
inputPanel(
selectInput("country", "Select Country", levels(models$country))
)
output$rsq <- renderPlot({
models %>%
filter(country == input$country) %>%
unnest(data) %>%
ggplot(aes(x = year, y = lifeExp))+
geom_line(aes(colour = continent))
})
renderUI({
plotOutput("rsq", height = "400", width = "600")
})
library("shiny")
inputPanel(
selectInput("country", "Select Country", levels(models$country))
)
output$country <- renderPlot({
models %>%
filter(country == input$country) %>%
unnest(data) %>%
ggplot(aes(x = year, y = lifeExp))+
geom_line(aes(colour = continent))
})
renderUI({
plotOutput("country", height = "400", width = "600")
})
library("shiny")
inputPanel(
sliderInput("rsq", "Select rsquared", min = 0, max = 1,
value = c(0, 0.2), dragRange = TRUE)
)
output$rsq <- renderPlot({
models %>%
filter(rsq >= input$rsq[1], rsq <= input$rsq[2]) %>%
unnest(data) %>%
ggplot(aes(x = year, y = lifeExp))+
geom_line(aes(colour = continent))+
facet_wrap(~ country)
})
renderUI({
plotOutput("rsq", height = "400", width = "600")
})
models %>%
unnest(tidy) %>%
select(continent, country, rsq, term, estimate) %>%
#filter(continent != "Africa") %>%
spread(term, estimate) %>%
ggplot(aes(x = `(Intercept)`, y = year1950))+
geom_point(aes(colour = continent, size = rsq))+
geom_smooth(se = FALSE)+
scale_size_area()+
labs(x = "Life expectancy (1950)",
y = "Yearly improvement")
purrr proposes safely() and possibly() to enable error-handling.
safely() is a type-stable version of try. It always returns a list of two elements, the result and the error, and one will always be NULL.
safely(log)(10)
## $result ## [1] 2.302585 ## ## $error ## NULL
safely(log)("a")
## $result ## NULL ## ## $error ## <simpleError in .f(...): non-numeric argument to mathematical function>
to be investigated