Load in data again, keeping only complete cases
fhs <- read.csv(url("https://raw.githubusercontent.com/molepi/Molecular-Data-Science/master/RIntro_practical/data.csv"))
fhs <- fhs[complete.cases(fhs), ]
Load tidyverse
library(tidyverse)
A bar plot of MI, coloured by smoking status
fhs %>%
ggplot() +
geom_bar(aes(MI, fill=CURSMOKE)) +
ggtitle("Bar chart of MI coloured by smoking status") +
xlab("MI status")
A histogram of total cholesterol in overweight individuals
fhs %>%
filter(BMI >= 30) %>%
ggplot() +
geom_histogram(aes(TOTCHOL), binwidth = 5, colour="cadetblue3") +
ggtitle("Histogram of total cholesterol in overweight individuals") +
xlab("Total cholesterol (mg/dL)") +
geom_vline(aes(xintercept=240), colour='red', size=1.5)
Four density plots of age for each education level
fhs %>%
ggplot() +
geom_density(aes(AGE, fill=EDUC), show.legend = F) +
facet_wrap(~EDUC, labeller=label_both) +
ggtitle("Density plots of age for each education level") +
xlab("Age (years)")
Two bar plots of sex in current and non-smokers for those who experienced MI
fhs %>%
filter(MI == "Yes") %>%
ggplot() +
geom_bar(aes(SEX, fill=SEX)) +
facet_wrap(~ CURSMOKE) +
ggtitle("Bar chart of sex in current and non-smokers", subtitle="for those who experienced MI") +
xlab("Sex")
A scatter plot of glucose levels against BMI, coloured by diabetes status
fhs %>%
ggplot() +
geom_point(aes(GLUCOSE, BMI, colour=DIABETES)) +
ggtitle("Scatter plot of glucose levels against BMI", subtitle="coloured by diabetes status") +
xlab("Glucose level (mg/dL)") +
ylab("BMI (kg/m^2)")
A horizontal box plot of systolic BP by sex for people over the age of 60
fhs %>%
filter(AGE >= 60) %>%
ggplot() +
geom_boxplot(aes(SEX, SYSBP, fill=SEX), show.legend=F) +
coord_flip() +
ggtitle("Horizontal box plot of systolic BP by sex", subtitle="for people over the age of 60")
A scatter plot of total cholesterol against glucose for categories of BMI
fhs %>%
ggplot() +
geom_point(aes(TOTCHOL, GLUCOSE), colour='grey30') +
facet_wrap(~cut(BMI, c(10,20,25,30,100))) +
ggtitle("Scatter plot of total cholesterol against glucose", subtitle="for categories of BMI") +
xlab("Total cholesterol (mg/dL)") +
ylab("Glucose levels (mg/dL)")
Define table of smoking against MI
SMOKE_MI <- xtabs(~CURSMOKE+MI, fhs)
SMOKE_MI
## MI
## CURSMOKE No Yes
## No 1671 291
## Yes 1537 352
Chi-square test
chisq <- chisq.test(SMOKE_MI)
chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: SMOKE_MI
## X-squared = 9.7325, df = 1, p-value = 0.00181
This shows we have strong evidence (p=0.001) that smoking status and MI are associated.
Looking at the Pearson residuals, we see that people who had MI contribute most to this association.
round(chisq$residuals, 3)
## MI
## CURSMOKE No Yes
## No 0.905 -2.022
## Yes -0.923 2.061
Pearson’s correlation coefficient
cor <- cor.test(~ SYSBP + DIABP, fhs)
cor
##
## Pearson's product-moment correlation
##
## data: SYSBP and DIABP
## t = 78.845, df = 3849, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7734942 0.7976652
## sample estimates:
## cor
## 0.7858797
There is very strong evidence (p<0.001) that systolic and diastolic blood pressure are strongly correlated (r2=0.786).
T-test for age difference between MI
tt <- t.test(AGE ~ MI, fhs)
tt
##
## Welch Two Sample t-test
##
## data: AGE by MI
## t = -9.0315, df = 918.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.072404 -2.618479
## sample estimates:
## mean in group No mean in group Yes
## 49.34352 52.68896
There is very strong (p<0.001) evidence that people who have had MI are older than those who didn’t.
Logistic regression
glmfit <- glm(MI ~ BMI + AGE + SEX, fhs, family = "binomial")
summary(glmfit)
##
## Call:
## glm(formula = MI ~ BMI + AGE + SEX, family = "binomial", data = fhs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2636 -0.6548 -0.4721 -0.3237 2.4734
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.14563 0.39969 -15.376 < 2e-16 ***
## BMI 0.05965 0.01089 5.476 4.36e-08 ***
## AGE 0.04537 0.00522 8.691 < 2e-16 ***
## SEXMale 1.19593 0.09413 12.705 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3474 on 3850 degrees of freedom
## Residual deviance: 3191 on 3847 degrees of freedom
## AIC: 3199
##
## Number of Fisher Scoring iterations: 5
Odds ratios can also be accessed.
exp(coef(glmfit))
## (Intercept) BMI AGE SEXMale
## 0.002142822 1.061466118 1.046414460 3.306636063
Is age correlated with total cholesterol? Yes, there is strong evidence (p<0.001) that age is mildly correlated (r=0.252) with total cholesterol
cor <- cor.test(~ AGE + TOTCHOL, fhs)
cor
##
## Pearson's product-moment correlation
##
## data: AGE and TOTCHOL
## t = 16.187, df = 3849, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2226523 0.2818005
## sample estimates:
## cor
## 0.2524622
Do diabetes have significantly different glucose levels when compared to non-diabetics? Yes, there is strong evidence (t=-10.99, p<0.001) that diabetics have higher glucose levels than non-diabetics
tt <- t.test(GLUCOSE ~ DIABETES, fhs)
tt
##
## Welch Two Sample t-test
##
## data: GLUCOSE by DIABETES
## t = -10.989, df = 108.15, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -106.3437 -73.8425
## sample estimates:
## mean in group No mean in group Yes
## 79.55826 169.65138
Is MI associated with total cholesterol in individuals over the age of 50? Yes, there is strong evidence (p=0.003) that total cholesterol is associated with an increased risk of MI. Increasing total cholesterol by 1 unit confers a 0.3% increased odds of MI
fhs50 <- filter(fhs, AGE > 50)
glmfit <- glm(MI ~ TOTCHOL, fhs50, family = "binomial")
summary(glmfit)
##
## Call:
## glm(formula = MI ~ TOTCHOL, family = "binomial", data = fhs50)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8979 -0.7050 -0.6660 -0.6099 1.9374
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.287175 0.334193 -6.844 7.71e-12 ***
## TOTCHOL 0.003870 0.001305 2.965 0.00303 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1817.4 on 1763 degrees of freedom
## Residual deviance: 1808.7 on 1762 degrees of freedom
## AIC: 1812.7
##
## Number of Fisher Scoring iterations: 4
exp(coef(glmfit))
## (Intercept) TOTCHOL
## 0.1015529 1.0038778
Write a loop that returns the square of the first 3 elements in a vector
x <- c(1,2,3,4)
for (i in 1:3) {
out <- paste("The square of element", i, "of the vector is", x[i]^2)
print(out)
}
## [1] "The square of element 1 of the vector is 1"
## [1] "The square of element 2 of the vector is 4"
## [1] "The square of element 3 of the vector is 9"
Write a loop that prints each column name in the FHS dataset, alongside the number of characters in it.
for (i in colnames(fhs)) {
out <- paste(i, "-", nchar(i))
print(out)
}
## [1] "SEX - 3"
## [1] "TOTCHOL - 7"
## [1] "AGE - 3"
## [1] "SYSBP - 5"
## [1] "DIABP - 5"
## [1] "CURSMOKE - 8"
## [1] "BMI - 3"
## [1] "DIABETES - 8"
## [1] "BPMEDS - 6"
## [1] "GLUCOSE - 7"
## [1] "EDUC - 4"
## [1] "MI - 2"
Write a loop that tells you at what integer the product of all previous positive integers is over 5 million
x <- 1
product <- 1
while (product < 5000000) {
x <- x + 1
product <- product * x
}
print(x)
## [1] 11
Write a function that takes x, y, and z as arguments and returns their product
product <- function(x, y, z) {
p <- x * y * z
return(p)
}
product(1, 2, 3)
## [1] 6
Write a function that returns the correlation of all variables in the FHS data with BMI
correlation <- function(x) {
results <- vector()
for (i in colnames(x)) {
cor <- cor.test(~ as.numeric(get(i)) + BMI, x)
results <- c(results, cor$estimate)
}
results <- data.frame(corr = results, row.names = colnames(x))
results
}
correlation(fhs)
## corr
## SEX 0.06410327
## TOTCHOL 0.12684902
## AGE 0.13456282
## SYSBP 0.33017719
## DIABP 0.38256340
## CURSMOKE -0.16090414
## BMI 1.00000000
## DIABETES 0.09208700
## BPMEDS 0.10339616
## GLUCOSE 0.08743118
## EDUC -0.14131073
## MI 0.10653525
A function that associates all variables in the FHS data with age
linearAllAge <- function(x) {
results <- data.frame()
fhsCov <- select(x, -AGE)
for (i in colnames(fhsCov)) {
fit <- lm(AGE ~ get(i), x)
results <- rbind(results, summary(fit)$coefficients[2, , drop=F])
}
results <- cbind(Covariate = colnames(fhsCov), results)
results %>%
arrange(`Pr(>|t|)`) %>%
print
}
linearAllAge(fhs)
## Covariate Estimate Std. Error t value Pr(>|t|)
## 1 SYSBP 0.15212358 0.005704571 26.6669623 6.502856e-144
## 2 TOTCHOL 0.04942907 0.003053592 16.1871905 4.575363e-57
## 3 DIABP 0.14970225 0.011252509 13.3039004 1.618282e-39
## 4 CURSMOKE -3.57134119 0.273466274 -13.0595307 3.590806e-38
## 5 EDUC -1.37785630 0.134173502 -10.2692133 1.998360e-24
## 6 BPMEDS 7.05546048 0.762248279 9.2561186 3.428656e-20
## 7 MI 3.34544180 0.370692989 9.0248316 2.781334e-19
## 8 BMI 0.28580468 0.033923632 8.4249434 5.032135e-17
## 9 GLUCOSE 0.04329230 0.005690712 7.6075365 3.490933e-14
## 10 DIABETES 5.86953452 0.837071643 7.0119858 2.762753e-12
## 11 SEX -0.19218729 0.280655552 -0.6847799 4.935240e-01