Serology_QCOVID_March2022.Rmd

---
title: "Serology Data Exploration March 2022"
output: html_notebook
---
## Datasets


Setup dependancies
```{r}
library(dplyr)
library(tibble)
library("tidyr")
library(ggplot2)
require(scales) 
library(remotes)
#remotes::install_github("Public-Health-Scotland/phsstyles",
##  upgrade = "never"
#)git
```

```{r}
library(phsstyles)
```


### Vaccinations

Vaccinations that include the time, for those who have had a serology sample taken

```{r}
df_vacc <- readRDS("/conf/EAVE/GPanalysis/data/temp/sero_vacc_time.rds") %>% as_tibble()
nrow(df_vacc)
```


### Serology Primary Care 

Load serology Primary Care and print the number of rows of data there are. Also, extract the IgG quantative result as a number.
```{r}

df_serology_pc <- readRDS("/conf/EAVE/GPanalysis/data/serology_primcare_march22.rds") %>% 
                  as_tibble() %>%
                  dplyr::mutate(IgG = readr::parse_number(test_result_quant))

nrow(df_serology_pc)
```

See the column names...
```{r}
colnames(df_serology_pc)
```

```

```

Make a quick histogram plot of the number of samples collected over time
```{r}
qplot(df_serology_pc$Sampledate_iso, geom="histogram", fill=I("red"), xlab='Sample Date', ylab='Number of Collected Samples', bins=30) 
```


Count how many times each person has had a sample taken, and then count how many occurrences there are of people with `nsamples`
```{r}
df_serology_pc %>% dplyr::group_by(EAVE_LINKNO) %>% dplyr::summarise(nsamples = dplyr::n()) %>% 
                   dplyr::group_by(nsamples) %>% dplyr::summarise(noccurrences = dplyr::n())
```


Filter the dataframe to get the number of rows where there has been a positive test result
```{r}

df_serology_pc_pos <- df_serology_pc %>% dplyr::filter(test_result_qual == "Positive")
nrow(df_serology_pc_pos)
```


### Serology SNBTS (blood donors)


```{r}
df_serology_bd <- readRDS("/conf/EAVE/GPanalysis/data/serology_snbts_march22.rds") %>% 
                  as_tibble() %>%
                  dplyr::mutate(IgG = readr::parse_number(test_result_quant))

nrow(df_serology_bd)

```

```{r}
library(phsstyles)
```


Plot the number of samples collected over time for the blood donors
```{r}
qplot(df_serology_bd$Sampledate_iso, 
      geom="histogram", 
      xlab='Sample Date', ylab='Number of Collected Samples', bins=40) + scale_colour_discrete_phs(palette = "main")
```


Again, filter the dataframe to get the number of rows where there has been a positive test result:
```{r}

df_serology_bd_pos <- df_serology_bd %>% dplyr::filter(test_result_qual == "Positive")
nrow(df_serology_bd_pos)
```

Check the number of repeat measurements of a person.. 
```{r}
df_serology_bd %>% dplyr::group_by(EAVE_LINKNO) %>% dplyr::summarise(nsamples = dplyr::n()) %>% 
                   dplyr::group_by(nsamples) %>% dplyr::summarise(noccurrences = dplyr::n())
```


### QCOVID (Feb22 Update)

Here we load the QCOVID dataframe and filter it on those EAVE studies that are present in the serology datasets.

```{r}
df_qcovid <- readRDS("/conf/EAVE/GPanalysis/data/cleaned_data/QCOVID_feb22.rds") %>% 
             as_tibble() 

df_qcovid_pc <- df_qcovid %>% dplyr::filter(EAVE_LINKNO %in% df_serology_pc$EAVE_LINKNO)
nrow(df_qcovid_pc)

df_qcovid_bd <- df_qcovid %>% dplyr::filter(EAVE_LINKNO %in% df_serology_bd$EAVE_LINKNO)
nrow(df_qcovid_bd)

```


```{r}
df_ana_pc <- df_serology_pc %>% dplyr::left_join(df_qcovid_pc) %>% 
          dplyr::select(c("EAVE_LINKNO","Sampledate_iso","test_result_qual","IgG","n_risk_gps")) %>%
          dplyr::mutate(n_risk_gps = ifelse(is.na(n_risk_gps), "0", levels(n_risk_gps)[n_risk_gps]))
          

df_ana_pc_pos <- df_ana_pc %>% dplyr::filter(test_result_qual == "Positive")
nrow(df_ana_pc_pos)
```

```{r}
df_ana_pc %>% group_by(n_risk_gps) %>% summarise(counts=n())
```


Plot a histogram showing the linking of the serology data (primary care) with QCOVID


```{r}
colnames(df_ana_pc)
```


```{r}
colors <- c("Negative" = phs_colours(c("phs-blue")),"Equivocal" = phs_colours(c("phs-magenta")),"Positive" = phs_colours(c("phs-purple")))
colors
```

```{r}

n <- nrow(df_ana_pc)
colors <- c("Negative" = phs_colours(c("phs-blue")),"Equivocal" = phs_colours(c("phs-magenta")),"Positive" = phs_colours(c("phs-purple")))


p <- ggplot(df_ana_pc, aes(x=IgG)) +
     geom_histogram(position = "stack", bins=30, data=subset(df_ana_pc,test_result_qual == 'Negative'), aes(fill='Negative')) +
     geom_histogram(position = "stack", bins=30, data=subset(df_ana_pc,test_result_qual == 'Positive'), aes(fill='Positive')) +
     geom_histogram(position = "stack", bins=30, data=subset(df_ana_pc,test_result_qual == 'Equivocal'), aes(fill='Equivocal')) +
     labs(title='Primary Care',x="Antibody Levels [IgG]", y="Number of Samples", fill="Result") +
     scale_fill_manual(values = colors) +
     theme(aspect.ratio = 0.5)  +
     scale_y_log10() +
     scale_x_log10()

p
```


```{r}

n <- nrow(df_ana_pc)
p <- ggplot(df_ana_pc, aes(x=IgG, fill=test_result_qual)) +
     geom_histogram(position = "stack", bins=30) +
     labs(title='Primary Care',x="Antibody Levels [IgG]", y="Number of Samples") +
     theme(aspect.ratio = 0.5)  +
     scale_y_log10() +
     scale_x_log10()
p
```


```{r}
bw <- 50
n <- nrow(df_ana_pc)
p <- ggplot(df_ana_pc, aes(IgG)) +
     geom_histogram(bins=20) +
     labs(x="Primary Care IgG", y="Number of Samples", fill="Number of Risks (QCOVID)") +
     scale_fill_discrete_phs(palette = "main") + 
     theme(aspect.ratio = 0.5)  
p
```


```{r}
df_ana <- df_ana_pc %>% left_join(df_vacc %>% filter(serology_source=='primary_care_serology'))
```


Measurements taken after 1st Vac, before 2nd Vac
```{r}
df_ana_v1 <- df_ana %>% filter(Sampledate_iso > d1_datetime & Sampledate_iso < d2_datetime )
```


```{r}
library(lubridate)
```

```{r}

# create breaks
breaks <- hour(hm("00:00", "6:00", "12:00", "18:00", "23:59"))
# labels for the breaks
labels <- c("Night", "Morning", "Afternoon", "Evening")

df_ana_v1$Time_of_day <- cut(x=hour(df_ana_v1$d1_datetime), breaks = breaks, labels = labels, include.lowest=TRUE)
```

Calculate the number of days the measurement was taken after the first vaccine 

```{r}
df_ana_v1 <- df_ana_v1 %>% mutate(days_passed = as.numeric(Sampledate_iso - d1_datetime,units='days'))
```


```{r}
ggplot(df_ana_v1, aes(x=days_passed)) + 
  geom_histogram()
```

```{r}
ggplot(df_ana_v1, aes(x=IgG)) + 
  geom_histogram()
```


```{r}
ggplot(df_ana_v1, aes(x=days_passed, y=mean(IgG))) + 
  geom_boxplot(width=500)

```


```{r}
bin_size <- 20

df_ana_v1 %>% 
  mutate(bin_dist = factor(days_passed%/%bin_size)) %>% 
  ggplot(aes(x = bin_dist, y = IgG)) +
  geom_boxplot()
```

```{r}
df_ana_v1 %>% select(c("Sampledate_iso","IgG","n_risk_gps","d1_product","d1_datetime","Time_of_day","days_passed")) %>% write.csv('test.csv')
```


Plot a histogram showing the linking of the serology data with QCOVID
```{r}
df_ana_bd <- df_serology_bd %>% dplyr::left_join(df_qcovid_bd) %>% 
          dplyr::select(c("EAVE_LINKNO","Sampledate_iso","test_result_qual","IgG","n_risk_gps")) %>%
          dplyr::mutate(n_risk_gps = ifelse(is.na(n_risk_gps), "0", levels(n_risk_gps)[n_risk_gps]))
          

df_ana_bd_pos <- df_ana_bd %>% dplyr::filter(test_result_qual == "Positive")
```
```{r}
min(df_ana_bd_pos$IgG) 
max(df_ana_bd_pos$IgG) 
```


Number of risk groups in the dataset

```{r}
df_ana_bd %>% group_by(n_risk_gps) %>% summarise(n=n())
```


```{r}

n <- nrow(df_ana_bd)
colors <- c("Negative" = phs_colours(c("phs-blue")),"Equivocal" = phs_colours(c("phs-magenta")),"Positive" = phs_colours(c("phs-purple")))


p <- ggplot(df_ana_bd, aes(x=IgG)) +
     geom_histogram(position = "stack", bins=40, data=subset(df_ana_bd,test_result_qual == 'Negative'), aes(fill='Negative')) +
     geom_histogram(position = "stack", bins=40, data=subset(df_ana_bd,test_result_qual == 'Positive'), aes(fill='Positive')) +
     geom_histogram(position = "stack", bins=40, data=subset(df_ana_bd,test_result_qual == 'Equivocal'), aes(fill='Equivocal')) +
     labs(title='Blood Donors',x="Antibody Levels [IgG]", y="Number of Samples", fill="Result") +
     scale_fill_manual(values = colors) +
     theme(aspect.ratio = 0.5)  +
     scale_y_log10() 
p
```


All data

```{r}
bw <- 0.5
n <- nrow(df_ana_bd_pos)
p <- ggplot(df_ana_bd_pos, aes(x=IgG, color=n_risk_gps)) +
     stat_bin(geom="step",bins=10,position='identity', size=2, alpha=0.8) +
     xlim(0,10) +
     labs(title='Blood Donors',x="IgG", y="Number of Samples", color="Number of Risks (QCOVID)") +
     scale_colour_discrete_phs(palette = "all") + 
     scale_y_continuous(trans = "log10") +
     theme(aspect.ratio = 0.5) 
p

```

Positive samples only...

```{r}
df_ana_bd_pos %>% group_by(n_risk_gps) %>% summarise(n=n())
```

```{r}
bw <- 100
n <- nrow(df_ana_pc_pos)
p <- ggplot(df_ana_pc_pos, aes(x=IgG, color=n_risk_gps)) +
     stat_bin(geom="step",bins=10,position='identity', size=2, alpha=0.8) +
     xlim(0,2000) +
     labs(title='Primary Care',x="IgG", y="Number of Samples", color="Number of Risks (QCOVID)") +
     scale_colour_discrete_phs(palette = "all") + 
     scale_y_continuous(trans = "log10") +
     theme(aspect.ratio = 0.5) 
p
```


```{r}
bw <- 0.5
n <- nrow(df_ana_bd_pos)
p <- ggplot(df_ana_bd_pos, aes(x=IgG, fill=n_risk_gps)) +
     geom_histogram(binwidth = bw) + xlim(0,11+bw) +
     labs(x="Blood Donors IgG", y="Number of Samples", fill="Number of Risks (QCOVID)") +
     scale_fill_discrete_phs(palette = "main") + 
     scale_y_log10() +
     theme(aspect.ratio = 0.5) 
p
```

```{r}
df_ana_bd
```

```{r}

df <- readRDS("/conf/EAVE/GPanalysis/data/serology_primcare_march22.rds") %>% dplyr::left_join(df_qcovid_pc) %>% mutate(age_group = cut(age,right=FALSE, breaks = c(0,10,20,30,40,50,60,70,80,90,1000),
                               label=c("0-9", 
                                       "10-19", 
                                       "20-29",
                                       "30-39", 
                                       "40-49",
                                       "50-59",
                                       "60-69",
                                       "70-79",
                                       "80-89",
                                       "90+"))) 
df
```


```{r}
max_prop <- 30     # choose the highest proportion you want to show 
step <- 5           # choose the space you want beween labels 

## this part defines vector using the above numbers with axis breaks
breaks <- c(
    seq(max_prop/100 * -1, 0 - step/100, step/100), 
    0, 
    seq(0 + step / 100, max_prop/100, step/100)
    )

## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)

## this part defines vector using the above numbers with axis labels
labels <-  c(
      seq(max_prop, step, -step), 
      0, 
      seq(step, max_prop, step)
    )


p1 <- df %>% 
  ## make sure the variables are factors
  mutate(age_group = factor(n_risk_gps), 
         sex = factor(sex)) %>%
  age_pyramid(
    age_group = "age_group",
    split_by = "sex", 
    proportion = TRUE) +
  ## only show the x axis label (otherwise repeated in all three plots)
  labs(title = "Primary Care", 
       x = "N Risk Groups", 
       y = "% of Sample")  + 
  ## make the x axis the same for all plots 
  scale_y_continuous(breaks = breaks, 
    limits = limits, 
    labels = labels)

p1
```

```{r}

df <- readRDS("/conf/EAVE/GPanalysis/data/serology_snbts_march22.rds") %>% dplyr::left_join(df_qcovid_bd)
df
```
```{r}
max_prop <- 40     # choose the highest proportion you want to show 
step <- 5           # choose the space you want beween labels 

## this part defines vector using the above numbers with axis breaks
breaks <- c(
    seq(max_prop/100 * -1, 0 - step/100, step/100), 
    0, 
    seq(0 + step / 100, max_prop/100, step/100)
    )

## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)

## this part defines vector using the above numbers with axis labels
labels <-  c(
      seq(max_prop, step, -step), 
      0, 
      seq(step, max_prop, step)
    )


p2 <- df %>% 
  ## make sure the variables are factors
  mutate(age_group = factor(n_risk_gps), 
         sex = factor(sex)) %>%
  age_pyramid(
    age_group = "age_group",
    split_by = "sex", 
    proportion = TRUE) +
  ## only show the x axis label (otherwise repeated in all three plots)
  labs(title = "Blood Donors", 
       x = "N Risk Groups", 
       y = "% of Sample")  + 
  ## make the x axis the same for all plots 
  scale_y_continuous(breaks = breaks, 
    limits = limits, 
    labels = labels)

p2
```


```{r}
df <- readRDS("/conf/EAVE/GPanalysis/data/EAVE_demographics_SK.rds") %>% left_join(df_qcovid)
df
```

```{r}

max_prop <- 30     # choose the highest proportion you want to show 
step <- 5           # choose the space you want beween labels 

## this part defines vector using the above numbers with axis breaks
breaks <- c(
    seq(max_prop/100 * -1, 0 - step/100, step/100), 
    0, 
    seq(0 + step / 100, max_prop/100, step/100)
    )

## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)

## this part defines vector using the above numbers with axis labels
labels <-  c(
      seq(max_prop, step, -step), 
      0, 
      seq(step, max_prop, step)
    )


p3 <- df %>% 
  ## make sure the variables are factors
  mutate(age_group = factor(n_risk_gps), 
         sex = factor(Sex)) %>%
  age_pyramid(
    age_group = "age_group",
    split_by = "sex", 
    proportion = TRUE) +
  ## only show the x axis label (otherwise repeated in all three plots)
  labs(title = "EAVE-II", 
       x = "N Risk Groups", 
       y = "% of Sample")  + 
  ## make the x axis the same for all plots 
  scale_y_continuous(breaks = breaks, 
    limits = limits, 
    labels = labels)

p3
```


```{r,fig.height=2.5,fig.width=12}
grid.arrange(p3,p1,p2,ncol=3,nrow=1)
```


```{r,fig.height=2.5,fig.width=12}

max_prop <- 40     # choose the highest proportion you want to show 
step <- 5           # choose the space you want beween labels 

## this part defines vector using the above numbers with axis breaks
breaks <- c(
    seq(max_prop/100 * -1, 0 - step/100, step/100), 
    0, 
    seq(0 + step / 100, max_prop/100, step/100)
    )

## this part defines vector using the above numbers with axis limits
limits <- c(max_prop/100 * -1, max_prop/100)

## this part defines vector using the above numbers with axis labels
labels <-  c(
      seq(max_prop, step, -step), 
      0, 
      seq(step, max_prop, step)
    )


p2 <- df %>% 
  ## make sure the variables are factors
  mutate(age_group = factor(age_group), 
         sex = factor(Sex)) %>%
  age_pyramid(
    age_group = "age_group",
    split_by = "sex", 
    proportion = TRUE) +
  ## only show the x axis label (otherwise repeated in all three plots)
  labs(title = "Blood Donors", 
       x = "N Risk Groups", 
       y = "% of Sample")  + 
  ## make the x axis the same for all plots 
  scale_y_continuous(breaks = breaks, 
    limits = limits, 
    labels = labels)

grid.arrange(p1,p2,p2,ncol=3,nrow=1)
```