Skip to content

Latest commit

 

History

History
286 lines (224 loc) · 9.64 KB

R-Quickstart.md

File metadata and controls

286 lines (224 loc) · 9.64 KB

R Quickstart

Jesse Cambon 30 November, 2019

Simple tidyverse code for common data science operations in R.

Setup

library(tidyverse)
library(ggplot2)

# Default ggplot theme: theme_bw() with the legend on top and bold,
# centered plot title and subtitle.
theme_set(
  theme_bw() +
    theme(
      plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5),
      plot.subtitle = element_text(face = "bold", hjust = 0.5),
      legend.position = "top"
    )
)

Data Manipulation

Warm Up

Initial ‘mpg’ Dataset:

manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact

Use View(mpg) to preview the dataset in R.

# 4-cylinder Nissans from 2005 onward, with the highway/city mpg ratio and a
# combined "manufacturer model" label.
mpg_subset <- mpg %>%
  filter(cyl == 4, year >= 2005, manufacturer == "nissan") %>%
  mutate(
    make_model = str_c(manufacturer, model, sep = " "),
    ratio = hwy / cty
  ) %>%
  select(make_model, cyl, year, hwy, cty, ratio)
make_model cyl year hwy cty ratio
nissan altima 4 2008 31 23 1.347826
nissan altima 4 2008 32 23 1.391304

Counting

# Number of rows in mpg for each cylinder count.
count_cyl <- count(mpg, cyl)
cyl n
4 81
5 4
6 79
8 70

Calculate Summary Stats

# Highway-mpg summary statistics per vehicle class. "2seater" is folded into
# "subcompact" before grouping.
mpg_stats <- mpg %>%
  select(class, hwy) %>%
  mutate(
    class_c = if_else(
      class %in% c("2seater", "subcompact"),
      "subcompact",
      class
    )
  ) %>%
  group_by(class_c) %>%
  summarise(
    count = n(),
    max_hwy = max(hwy),
    min_hwy = min(hwy),
    median_hwy = median(hwy),
    mean_hwy = mean(hwy)
  ) %>%
  ungroup() %>%
  arrange(desc(count)) # largest classes first

Note that ‘2seater’ is reclassified as ‘subcompact’

class_c count max_hwy min_hwy median_hwy mean_hwy
suv 62 27 12 17.5 18.12903
compact 47 44 23 27.0 28.29787
midsize 41 32 23 27.0 27.29268
subcompact 40 44 20 26.0 27.72500
pickup 33 22 12 17.0 16.87879
minivan 11 24 17 23.0 22.36364

Stacking Data

Initial ‘mpg’ Dataset:

manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
# Two 2-row samples sharing the same columns, each tagged with the dataset
# it came from.
mpg1 <- mpg %>%
  select(manufacturer, model, hwy, cty) %>%
  slice(1:2) %>%
  mutate(dataset = 1)

mpg2 <- mpg %>%
  select(manufacturer, model, hwy, cty) %>%
  slice(44:45) %>%
  mutate(dataset = 2)

# A 4-row sample (rows 1, 2, 5, 6) holding two different columns.
mpg3 <- mpg %>%
  select(displ, year) %>%
  slice(c(1:2, 5:6))

Stack vertically and horizontally

# Vertical stack: the rows of mpg2 are appended below mpg1.
mpg_stack_vert <- bind_rows(mpg1, mpg2)

# Horizontal stack: the columns of mpg3 are appended to the right.
mpg_stack_horz <- bind_cols(mpg_stack_vert, mpg3)

Joining

# Lookup table: one row per distinct manufacturer / model / class combination.
car_type <- distinct(mpg, manufacturer, model, class)

# Attach the class to each stacked row, matching on manufacturer and model.
joined <- left_join(
  mpg_stack_horz,
  car_type,
  by = c("manufacturer", "model")
) %>%
  # Dropping dataset first and then re-adding everything moves it to the
  # last column.
  select(-dataset, everything())

Long to Wide

Initial Data:

GEOID NAME variable estimate moe
01 Alabama income 24476 136
01 Alabama rent 747 3
02 Alaska income 32940 508
02 Alaska rent 1200 13
  • pivot_wider
    • names_from: column containing values that we will use for our new column names
    • values_from: column containing the values that will fill the new columns
# Spread income and rent into their own columns, then compute income
# relative to twelve months of rent.
col_ratio <- us_rent_income %>%
  select(NAME, variable, estimate) %>%
  pivot_wider(
    names_from = variable,
    values_from = estimate
  ) %>%
  drop_na() %>% # remove rows with missing values
  mutate(income_rent_ratio = income / (rent * 12))

Income and Rent are now in separate columns:

NAME income rent income_rent_ratio
Alabama 24476 747 2.730478
Alaska 32940 1200 2.287500
Arizona 27517 972 2.359139
Arkansas 23789 709 2.796074

Wide to Long

Initial Data:

country indicator 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
ABW SP.URB.TOTL 42444.000000 43048.000000 43670.000000 44246.00000 4.466900e+04 4.488900e+04 4.48810e+04 4.468600e+04 4.437500e+04 4.405200e+04 4.377800e+04 4.382200e+04 4.406400e+04 4.43600e+04 4.467400e+04 4.497900e+04 4.52750e+04 4.557200e+04
ABW SP.URB.GROW 1.182632 1.413021 1.434559 1.31036 9.514777e-01 4.913027e-01 -1.78233e-02 -4.354289e-01 -6.984006e-01 -7.305493e-01 -6.239346e-01 1.004566e-01 5.507148e-01 6.69504e-01 7.053514e-01 6.804037e-01 6.55929e-01 6.538489e-01
ABW SP.POP.TOTL 90853.000000 92898.000000 94992.000000 97017.00000 9.873700e+04 1.000310e+05 1.00832e+05 1.012200e+05 1.013530e+05 1.014530e+05 1.016690e+05 1.020530e+05 1.025770e+05 1.03187e+05 1.037950e+05 1.043410e+05 1.04822e+05 1.052640e+05
  • pivot_longer
    • cols (1st arg): which columns do we want to pivot? (i.e. exclude the ones we don’t want to pivot)
    • names_to : the name of new column holding the column names as values
    • values_to : name of new column containing values
  • seq(start, stop, increment) -> generates sequence
# Pivot every column except country and indicator into year/value pairs and
# keep every second year from 2000 through 2016.
wb_pop <- world_bank_pop %>%
  pivot_longer(
    cols = -c(country, indicator),
    names_to = "year",
    values_to = "value"
  ) %>%
  mutate(year = as.numeric(year)) %>% # column names arrive as text
  filter(year %in% seq(2000, 2016, by = 2))

After:

country indicator year value
ABW SP.URB.TOTL 2000 42444
ABW SP.URB.TOTL 2002 43670
ABW SP.URB.TOTL 2004 44669

Visualizations

Bar Chart

  • use the fill aesthetic inside aes() to set bar color based on a variable
  • reorder() orders the bars
# A simple bar chart - mean highway mpg for each vehicle class.
# reorder() sorts the bars by descending mean_hwy.
ggplot(
  data = mpg_stats,
  aes(x = reorder(class_c, -mean_hwy), y = mean_hwy)
) +
  # geom_col() draws bars whose heights are the y values themselves
  geom_col(position = "dodge", color = "black") +
  # expansion() replaces the deprecated expand_scale(); no padding below the
  # bars, 10% above so the labels fit
  scale_y_continuous(expand = expansion(mult = c(0, .1))) +
  geom_text(aes(label = round(mean_hwy)), vjust = -0.5) + # value labels
  theme(
    legend.position = "none", # no legend (in case we want to use fill)
    panel.grid = element_blank() # turn off grid
  ) +
  labs(title = "") +
  xlab("") +
  ylab("")

# Histogram of highway mpg in 1-mpg bins, filled by number of cylinders.
# cyl is numeric, so it is converted to a factor: a discrete fill gives each
# cylinder count its own group and color.
ggplot(mpg, aes(hwy)) +
  geom_histogram(aes(fill = factor(cyl)), binwidth = 1) +
  # expansion() replaces the deprecated expand_scale()
  scale_y_continuous(expand = expansion(mult = c(0, .05))) +
  labs(fill = "Cylinders") +
  xlab("Highway mpg") +
  ylab("Count")

Line

We divide the value field by 100 to convert it from a percentage to a decimal value.

SP.POP.GROW is the % population growth

# Population growth over time for the USA, Canada, and Mexico.
# value is a percentage, so it is divided by 100 before scales::percent
# formats the axis labels.
ggplot(
  wb_pop %>%
    filter(country %in% c("USA", "CAN", "MEX") & indicator == "SP.POP.GROW"),
  aes(x = year, y = value / 100, color = country)
) +
  theme_classic() +
  geom_line() +
  geom_point() + # lines and points
  # expansion() replaces the deprecated expand_scale()
  scale_x_continuous(expand = expansion(mult = c(.05, .05))) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "",
    caption = ""
  ) +
  theme(
    legend.title = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.text = element_text(size = 10),
    legend.position = "right"
  ) +
  xlab("Year") +
  ylab("Population Growth") +
  # make legend items bigger
  guides(colour = guide_legend(override.aes = list(size = 2)))

Lollipop

  # Lollipop chart of the 15 rows of col_ratio with the highest rent.
  # The rent-ordered x mapping is declared once in ggplot() so the segment
  # and point layers share the same ordering.
  ggplot(
    data = col_ratio %>% arrange(desc(rent)) %>% head(15),
    aes(x = reorder(NAME, rent), y = rent)
  ) +
    # stem of each lollipop, from 0 up to the rent value
    geom_segment(
      aes(xend = reorder(NAME, rent), y = 0, yend = rent),
      color = "grey"
    ) +
    geom_point(size = 3) +
    theme_minimal() +
    theme(
      plot.subtitle = element_text(face = "bold", hjust = 0.5),
      plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5),
      panel.grid.minor.y = element_blank(),
      panel.grid.major.y = element_blank(),
      panel.grid.minor.x = element_blank()
    ) +
    coord_flip() +
    # expansion() replaces the deprecated expand_scale()
    scale_y_continuous(
      labels = scales::dollar,
      expand = expansion(mult = c(0, .1))
    ) +
    labs(
      title = "States With Highest Rent",
      caption = "Source: 2017 American Community Survey (Census)"
    ) +
    xlab("") +
    ylab("Median Monthly Rent")