Jesse Cambon 30 November, 2019
Simple tidyverse code for common data science operations in R.
library(tidyverse)
library(ggplot2)
# Set default ggplot theme
theme_set(theme_bw()+
theme(legend.position = "top",
plot.subtitle= element_text(face="bold",hjust=0.5),
plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)))
Initial ‘mpg’ Dataset:
manufacturer | model | displ | year | cyl | trans | drv | cty | hwy | fl | class |
---|---|---|---|---|---|---|---|---|---|---|
audi | a4 | 1.8 | 1999 | 4 | auto(l5) | f | 18 | 29 | p | compact |
audi | a4 | 1.8 | 1999 | 4 | manual(m5) | f | 21 | 29 | p | compact |
audi | a4 | 2.0 | 2008 | 4 | manual(m6) | f | 20 | 31 | p | compact |
Use View(mpg)
to preview the dataset in R.
mpg_subset <- mpg %>%
filter(cyl==4 & year >= 2005 & manufacturer == "nissan") %>%
mutate(ratio=hwy/cty,
make_model=str_c(manufacturer,' ',model)) %>%
select(make_model,cyl,year,hwy,cty,ratio)
make_model | cyl | year | hwy | cty | ratio |
---|---|---|---|---|---|
nissan altima | 4 | 2008 | 31 | 23 | 1.347826 |
nissan altima | 4 | 2008 | 32 | 23 | 1.391304 |
count_cyl <- mpg %>%
count(cyl)
cyl | n |
---|---|
4 | 81 |
5 | 4 |
6 | 79 |
8 | 70 |
mpg_stats <- mpg %>% select(class,hwy) %>%
mutate(class_c=case_when(class %in% c("2seater","subcompact") ~ "subcompact",
TRUE ~ class)) %>%
group_by(class_c) %>%
summarize(count=n(),
max_hwy=max(hwy),
min_hwy=min(hwy),
median_hwy=median(hwy),
mean_hwy=mean(hwy)) %>%
ungroup() %>%
arrange(desc(count)) # sort dataset
Note that ‘2seater’ is reclassified as ‘subcompact’
class_c | count | max_hwy | min_hwy | median_hwy | mean_hwy |
---|---|---|---|---|---|
suv | 62 | 27 | 12 | 17.5 | 18.12903 |
compact | 47 | 44 | 23 | 27.0 | 28.29787 |
midsize | 41 | 32 | 23 | 27.0 | 27.29268 |
subcompact | 40 | 44 | 20 | 26.0 | 27.72500 |
pickup | 33 | 22 | 12 | 17.0 | 16.87879 |
minivan | 11 | 24 | 17 | 23.0 | 22.36364 |
Initial ‘mpg’ Dataset:
manufacturer | model | displ | year | cyl | trans | drv | cty | hwy | fl | class |
---|---|---|---|---|---|---|---|---|---|---|
audi | a4 | 1.8 | 1999 | 4 | auto(l5) | f | 18 | 29 | p | compact |
audi | a4 | 1.8 | 1999 | 4 | manual(m5) | f | 21 | 29 | p | compact |
audi | a4 | 2.0 | 2008 | 4 | manual(m6) | f | 20 | 31 | p | compact |
mpg1 <- mpg %>% slice(1:2) %>%
select(manufacturer,model,hwy,cty) %>%
mutate(dataset=1)
mpg2 <- mpg %>% slice(44:45) %>%
select(manufacturer,model,hwy,cty) %>%
mutate(dataset=2)
mpg3 <- mpg %>% slice(1:2,5:6) %>%
select(displ,year)
Stack vertically and horizontally
mpg_stack_vert <- mpg1 %>%
bind_rows(mpg2)
mpg_stack_horz <- mpg_stack_vert %>%
bind_cols(mpg3)
car_type <- mpg %>% select(manufacturer,model,class) %>%
distinct() # distinct rows only
joined <- mpg_stack_horz %>%
left_join(car_type,by=c('manufacturer','model')) %>%
select(-dataset,everything())
Initial Data:
GEOID | NAME | variable | estimate | moe |
---|---|---|---|---|
01 | Alabama | income | 24476 | 136 |
01 | Alabama | rent | 747 | 3 |
02 | Alaska | income | 32940 | 508 |
02 | Alaska | rent | 1200 | 13 |
- pivot_wider
- names_from: column containing values that we will use for our new column names
col_ratio <- us_rent_income %>%
select(-GEOID,-moe) %>%
pivot_wider(names_from = variable, values_from = estimate) %>%
drop_na() %>% # drop missing values
mutate(income_rent_ratio = income / (12*rent))
Income and Rent are now in separate columns:
NAME | income | rent | income_rent_ratio |
---|---|---|---|
Alabama | 24476 | 747 | 2.730478 |
Alaska | 32940 | 1200 | 2.287500 |
Arizona | 27517 | 972 | 2.359139 |
Arkansas | 23789 | 709 | 2.796074 |
Initial Data:
country | indicator | 2000 | 2001 | 2002 | 2003 | 2004 | 2005 | 2006 | 2007 | 2008 | 2009 | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ABW | SP.URB.TOTL | 42444.000000 | 43048.000000 | 43670.000000 | 44246.00000 | 4.466900e+04 | 4.488900e+04 | 4.48810e+04 | 4.468600e+04 | 4.437500e+04 | 4.405200e+04 | 4.377800e+04 | 4.382200e+04 | 4.406400e+04 | 4.43600e+04 | 4.467400e+04 | 4.497900e+04 | 4.52750e+04 | 4.557200e+04 |
ABW | SP.URB.GROW | 1.182632 | 1.413021 | 1.434559 | 1.31036 | 9.514777e-01 | 4.913027e-01 | -1.78233e-02 | -4.354289e-01 | -6.984006e-01 | -7.305493e-01 | -6.239346e-01 | 1.004566e-01 | 5.507148e-01 | 6.69504e-01 | 7.053514e-01 | 6.804037e-01 | 6.55929e-01 | 6.538489e-01 |
ABW | SP.POP.TOTL | 90853.000000 | 92898.000000 | 94992.000000 | 97017.00000 | 9.873700e+04 | 1.000310e+05 | 1.00832e+05 | 1.012200e+05 | 1.013530e+05 | 1.014530e+05 | 1.016690e+05 | 1.020530e+05 | 1.025770e+05 | 1.03187e+05 | 1.037950e+05 | 1.043410e+05 | 1.04822e+05 | 1.052640e+05 |
- pivot_longer
- cols (1st arg): what columns do we want to pivot? (ie. subtract ones we don’t want to)
- names_to : the name of new column holding the column names as values
- values_to : name of new column containing values
- seq(start, stop, increment) -> generates sequence
wb_pop <- world_bank_pop %>%
pivot_longer(c(-country,-indicator), names_to = "year", values_to = "value") %>%
mutate(year=as.numeric(year)) %>% # convert to numeric
filter(year %in% seq(2000,2016,2))
After:
country | indicator | year | value |
---|---|---|---|
ABW | SP.URB.TOTL | 2000 | 42444 |
ABW | SP.URB.TOTL | 2002 | 43670 |
ABW | SP.URB.TOTL | 2004 | 44669 |
- use fill argument in ggplot() to set bar color based on a variable
- reorder() orders the bars
# A simple bar chart - average heights of the species
# the reorder command orders our bars in order of descending height
ggplot(data=mpg_stats,
aes(x = reorder(class_c,-mean_hwy), y=mean_hwy)) +
geom_bar(stat='identity',position='dodge',color='black') +
scale_y_continuous(expand = expand_scale(mult = c(0, .1))) + # plot margins
geom_text(aes(label=round(mean_hwy)), vjust=-0.5) + # labelling
theme(legend.position="none", # no legend (in case we want to use fill)
panel.grid = element_blank()) + # turn off grid
labs(title='') +
xlab('') +
ylab('')
# Histogram with autobinning based on gender
ggplot(mpg,aes(hwy)) +
geom_histogram(aes(fill=cyl),binwidth=1) +
scale_y_continuous(expand = expand_scale(mult = c(0, .05))) +
xlab('Highway mpg') + ylab('Count')
We divide the value
field by 100 since to convert it to a decimal
percentage value.
SP.POP.GROW is the % population growth
ggplot(wb_pop %>% filter(country %in% c("USA","CAN","MEX") & indicator == "SP.POP.GROW"),
aes(x=year,y=value/100,color = country)) +
theme_classic() +
geom_line() + geom_point() + # lines and points
scale_x_continuous(expand = expand_scale(mult = c(.05, .05))) +
scale_y_continuous(labels=scales::percent) +
labs(title='',
caption='') +
theme(legend.title = element_blank(),
panel.grid.minor.x = element_blank(),
legend.text=element_text(size=10),
legend.position='right') +
xlab('Year') +
ylab('Population Growth') +
# make legend items bigger
guides(colour = guide_legend(override.aes = list(size=2)))
ggplot(data=col_ratio %>% arrange(desc(rent)) %>% head(15), aes(x=NAME, y=rent) ) +
geom_segment( aes(x=reorder(NAME,rent) ,xend=NAME, y=0, yend=rent), color="grey") +
geom_point(size=3) +
theme_minimal() +
theme(plot.subtitle= element_text(face="bold",hjust=0.5),
plot.title = element_text(lineheight=1, face="bold",hjust = 0.5),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank()
) +
coord_flip() +
scale_y_continuous(labels=scales::dollar,expand = expand_scale(mult = c(0, .1))) +
labs(title='States With Highest Rent',
caption='Source: 2017 American Community Survey (Census)') +
xlab('') + ylab('Median Monthly Rent')