-
Notifications
You must be signed in to change notification settings - Fork 0
/
Aim1_Analysis.Rmd
1296 lines (1024 loc) · 55.9 KB
/
Aim1_Analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Aim1_diss_dat"
author: "Emily Wissel"
date: "11/9/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
#install.packages("pacman")
pacman::p_load("devtools","phyloseq","tidyverse","qiime2R", "ggplot2",
"reshape2", "GUniFrac", "vegan", "LDM")
med_dat$Preg_BV <- as.factor(med_dat$Preg_BV) ## because R thinks it is a numerical values, but "0" designates no diagnosis and "1" designates a diagnosis
med_dat %>% ggplot(aes(x=Preg_BV, fill = Preg_BV)) +
geom_bar(width = 0.5) +
theme_light() + ## change theme and background colors
scale_fill_manual(values = c("lightcyan2", "palevioletred1"), ## use color names to change plot colors
name = "BV Diagnosis", labels = c("No", "Yes")) + ## fix legend labels so it's not "0" and "1"
labs(title = "BV Diagnosis during Pregnany", x = "BV Diagnosis") + ## add information to the plot
theme(axis.ticks.x=element_blank(), axis.text.x=element_blank()) ## remove the "1" and "0" codes on the x axis
med_dat$Preg_UTI <- as.factor(med_dat$Preg_UTI)
med_dat %>% ggplot(aes(x=Preg_UTI, fill = Preg_UTI)) +
geom_bar(width = 0.5) +
theme_light() + ## change theme and background colors
scale_fill_manual(values = c("lightcyan2", "palevioletred1"), ## use color names to change plot colors
name = "UTI Diagnosis", labels = c("No", "Yes")) + ## fix legend labels so it's not "0" and "1"
labs(title = "UTI Diagnosis during Pregnany", x = "UTI Diagnosis") + ## add information to the plot
theme(axis.ticks.x=element_blank(), axis.text.x=element_blank())
med_dat$Preg_Chlam <- as.factor(med_dat$Preg_Chlam)
med_dat %>% ggplot(aes(x=Preg_Chlam, fill = Preg_Chlam)) +
geom_bar(width = 0.5) +
theme_light() + ## change theme and background colors
scale_fill_manual(values = c("lightcyan2", "palevioletred1"), ## use color names to change plot colors
name = "Chlamydia Diagnosis", labels = c("No", "Yes")) + ## fix legend labels so it's not "0" and "1"
labs(title = "Chlamydia Diagnosis during Pregnany", x = "Chlamydia Diagnosis") + ## add information to the plot
theme(axis.ticks.x=element_blank(), axis.text.x=element_blank())
```
## Aim 1 Analysis, Species Level
This will be the LDM (linear decomp model) analysis.
```{r read in the data }
dat_otu <- read.csv("../biob_outputs/metadat_wPlates_passedQC.csv")
dat_otu <- dat_otu %>% select(-X ) %>%
mutate(bodysite = ifelse(str_detect(sample, 'Vag'), "vaginal",
ifelse(str_detect(sample, "Rec"), "rectal",
"control"))) %>%
relocate(bodysite, .after = "timepoint") %>%
mutate(timepoint = ifelse(timepoint=="1"|timepoint == "2" | timepoint == "PP" | timepoint == "3", timepoint, "control"))
dat_otu$timepoint[dat_otu$timepoint == '3'] <- 'PP'
dat_otu <- dat_otu %>% filter(timepoint != "PP")
# head(dat_otu) ## makes the knit too long
```
# Aim 1: Examine how microbes maintain population structures in the gut and vaginal microbiome.
### Hypothesis 1: The microbiome will be less diverse at T2 compred to T1.
For a version of this that shoulds more scientific, refer to my dissertation proposal. This is straightforward as it's the only consistent trend in pregnancy microbiome literature. Maria Gloria-Dominguez seems to think the microbiome gets more divers ~30 weeks, but this is based on vibes and not data.
here is a (link for the alpha diversity tutorial)[https://scienceparkstudygroup.github.io/microbiome-lesson/04-alpha-diversity/index.html]. Alpha-diversity is calculated on the raw data, here data_otu or data_phylo if you are using phyloseq.
It is important to not use filtered data because many richness estimates are modeled on singletons and doubletons in the occurrence table. So, you need to leave them in the dataset if you want a meaningful estimate.
Moreover, we usually not using normalized data because we want to assess the diversity on the raw data and we are not comparing samples to each other but only assessing diversity within each sample.
So let's test this out!
```{r calculate alpha abundances }
motu <- dat_otu %>% select(-Plate, -timepoint, -subjectID, -bodysite)
motu <- motu[order(motu$sample),]
motu <- motu %>% select(-sample)
motu <- data.matrix(motu)
rownames(motu) <- dat_otu$sample
#motu <- as.integer(motu)
#motu#motu ## taxa are columns, samples are row
mindf <- dat_otu %>% select(Plate, subjectID, timepoint, bodysite, sample)
#dat_otu %>%
# pivot_longer(cols = Gardnerella_vaginalis:last_col(), names_to = genus)
#data_richness <- motu%>% estimateR() # calculate richness and Chao1 using vegan package
## this one required count data, which we don't have
data_evenness <- diversity(motu) / log(specnumber(motu)) # calculate evenness index using vegan package
data_shannon <- diversity(motu, index = "shannon") # calculate Shannon index using vegan package
data_alphadiv <- cbind(mindf, data_shannon, data_evenness) # combine all indices in one data table
## remove dat for simplicity and space in r
rm(data_evenness, data_shannon, mindf) # remove the unnecessary data/vector
head(data_alphadiv)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
library(psych) ## here because it masks some packages in phyloseq
describe(data_alphadiv$data_evenness)
describe(data_alphadiv$data_shannon)
data_alphadiv %>%
filter(timepoint == 1 | timepoint == 2) %>%
ggplot(aes(x = timepoint, y = data_shannon)) +
geom_boxplot() +
geom_jitter() +
facet_wrap(.~bodysite) +
theme_minimal() +
labs(title = "Shannon Diversity")
```
Pielou’s evenness provides information about the equity in species abundance in each sample, in other words are some species dominating others or do all species have quite the same abundances.Richness represents the number of species observed in each sample. Shannon index provides information about both richness and evenness.
Now let's do some basic plots and stats to see if there is a difference in diversity between timepoints.
```{r alpha end game stats}
cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#999999")
data_alphadiv %>%
filter(bodysite != 'control' & timepoint != 'control') %>%
ggplot(aes(x = timepoint, y = data_shannon, fill = bodysite)) +
geom_boxplot(outlier.size=-1) +
geom_jitter(height = 0, width = 0.3, alpha= 0.6 ) +
facet_wrap(.~bodysite) +
theme_minimal() +
scale_fill_manual(values=cbPalette) +
labs(title = "Shannon Diversity")
data_alphadiv %>%
filter(bodysite != 'control' & timepoint != 'control') %>%
ggplot(aes(x = timepoint, y = data_evenness, fill = bodysite)) +
geom_boxplot(outlier.size=-1) +
geom_jitter(height = 0, width = 0.3, alpha= 0.6) +
facet_wrap(.~bodysite) +
theme_minimal() +
scale_fill_manual(values=cbPalette) +
labs(title = "Evenness (alpha diversity)")
summary(aov(data_shannon ~ timepoint, data = data_alphadiv)) # not a significant p value
summary(aov(data_evenness ~ timepoint, data = data_alphadiv)) # not a significant p value
```
The P-value of `0.803` indicates that there is not a significant difference in shannon alpha diversity between timepoints. Similarly, a p value of `0.603` for evenness indicates there is not a significant difference in evenness between timepoints.
## normalization and beta diversity
Since beta diversity is comparing diversity across sample, we DO need normalization and batch correction. I will be following [this tutorial](https://scienceparkstudygroup.github.io/microbiome-lesson/05-data-filtering-and-normalisation/index.html).
We have already filtered out species detected less than 1% so I'm going to skip this step of the tutorial. It's also focused on 16S whereas I do metagenomics.
First we need to see if there is a difference in the counts per sample. Unforunately this is going to be difficult to get and integrate as these numbers were not made accessible or as part of tiny data. But let's see what we can do.
* as explained McKnight and collaborators (DOI: 10.1111/2041-210X.13115) DESeq2 or edgeR‐TMM are recommended based on studies that focused on differential abundance testing and variance standardization, rather than community-level comparisons (i.e. beta-diversity)
*
```{r get number of reads}
readdat <- read.csv("../quality_control/calc_reads/general_stats_table.tsv", sep = "\t")
readdat2 <- read.csv("../quality_control/calc_reads/fastqc_sequence_counts_plot_reupload_files.tsv", sep = "\t")
readdat2$Total.Sequences..millions <- (readdat2$Unique.Reads + readdat2$Duplicate.Reads) / 1000000
#readdat$Total.Sequences..millions <- (readdat$Total.Sequences..millions*1000000)
## remove currently useless columns
readdat <- readdat %>% select(-Average...GC.Content, - Percentage.of.modules.failed.in.FastQC.report..includes.those.not.plotted.here., - Average.Sequence.Length..bp.)
colnames(readdat) <- c("Sample", "Duplicate.Reads", "Total.Sequences")
colnames(readdat2) <- c("Sample", "Unique.Reads", "Duplicate.Reads", "Total.Sequences")
## now merge the two
reads <- bind_rows(readdat, readdat2)
reads <- reads %>%
mutate(read = ifelse(str_detect(Sample, "_1"), "forward", "reverse")) %>%
mutate(bodysite = ifelse(str_detect(Sample, "Rec"), "rectal",
ifelse(str_detect(Sample, "Vag"), "vaginal", "control"))) %>%
mutate(timepoint = ifelse(str_detect(Sample, "-1-"), "1",
ifelse(str_detect(Sample, "-2-"), "2", "idk")))
head(reads)
reads %>%
filter(timepoint != "idk") %>%
ggplot(aes( y = Total.Sequences, x = timepoint, fill = bodysite)) +
geom_boxplot(outlier.size=-1) +
geom_jitter(height = 0, width = 0.3, alpha = 0.5) +
## don't plot outliers since they are reflected in geom_jitter
theme_minimal() +
scale_fill_manual(values=cbPalette) +
labs(title = "Total Reads in All Samples", y = "Total Sequences, Millions", x = "Timepoint") +
facet_wrap(.~bodysite)
vreads <- reads %>% filter(bodysite == "vaginal")
rreads <- reads %>% filter(bodysite == "rectal")
describe(rreads$Total.Sequences)
describe(vreads$Total.Sequences)
```
Rareify the data, in a separate code block since its chunky.
```{r normalize data maybe}
## first make data a phyloseq object
#library(phyloseq)
#data_phylo_filt <- otu_table(object = motu, taxa_are_rows = FALSE)
set.seed(11062015) # set seed for analysis reproducibility, and obviously this is the date the first minion movie came out so it is the ideal seed
## "Please note that the authors of phyloseq do not advocate using this as a normalization procedure, despite its recent popularity. "
## ok so we will use DEseq2
## jk I can't do this because the processing we used doesn't provide count data :sadface:
```
jk I can't do this because the processing we used doesn't provide count data :sadface:
We can use this for justification too https://journals.plos.org/ploscompbiol/article/file?id=10.1371/journal.pcbi.1003531&type=printable
## get some info on the cohort
```{r vag infection rates}
#read in medical data
med_dat <- read_csv("../medical_data/MPTB_Sociodemographic Outcome Health Behaviors update4, 16S, WGS indicator.csv")
med_dat <- med_dat %>% filter(Sequencing_WGS == "1" )
# cofounder variables: age, income, parity, TobaccoUse_MRm AlcoholUse_MR, MarijuanaUse_MR,
# covariate variables: birthoutcome, labor, Preg_Chlam, Preg_UTI
# merge med_dat and small
med_dat$subjectID <- med_dat$subjectid
small <- dat_otu %>% select(sample, subjectID, Plate, timepoint, bodysite)
small <- small [order(small$sample),]
med_data <- merge(small, med_dat, all = TRUE, by.x = "subjectID", by.y = "subjectid" )
#med_data # 937 rows
med_data[order(med_data$sample),] ## motu and med_dat should have same order of samples in rows
#write.csv(med_dat, "med_data_for_mj.csv")
med_dat_vag <- med_data %>% filter(bodysite == "vaginal")
med_dat_rectal <- med_data %>% filter(bodysite == "rectal")
table(med_data$Preg_BV)
table(med_data$Preg_UTI)
table(med_data$Preg_Chlam)
#med_dat_vag #%>% unique(subjectID)
med_data %>%
#filter(timepoint == "1" & bodysite == "vaginal") %>%
filter(Preg_UTI == "1" ) #& Preg_Chlam == "1" & Preg_BV == "1")
```
plot it.
## Set up for LDM
To read a benchmarking paper comparing differential modelling methods in microbiome data, (look here)[https://www.biorxiv.org/content/10.1101/2022.07.22.501190v1.full]. To read the OG manuscript introducing LDM, (look here)[https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-021-01034-9].
*What is LDM?*
"The LDM models microbial abundance in the form of counts transformed into relative abundances as an outcome of interest given experimental covariates of interest. LDM provides users with both global and local hypothesis tests of differential abundance given covariates of interest and microbial count data. LDM decomposes the model sum of squares into parts explained by each variable in the model. From these sub-models we can see the amount of variability that each variable is contributing to the overall variability explained by the model’s covariates of interest... LDM can handle covariates both categorical and continuous and can control for confounders."
- (source)[https://rpubs.com/jrandall7/EICC16s]
"Next, to remove any statistical noise we may still have to detect relationships between the covariates and microbial composition, we decide to keep only those taxa which appear in at least 2 samples. This is a parameter that will vary by project."
* Do we want to do this with this data? I am inclined to say no, but here is the code in case we want it later: `otu_pres = which(colSums(asvt>0)>=1)` then `asvt = asvt[,otu_pres]` for dat_otu in its current form instead of asvt.
* "The OTU table should have rows corresponding to samples and columns corresponding to
OTUs (ldm will transpose the OTU table if the number of rows is not equal to the length of
the covariates in the metadata but this consistency check will fail in the unlikely case that the
number of OTUs and samples are equal)"
```{r set up ldm models and dat }
(seed=sample.int(11062015, size=1))
small_otu <- dat_otu %>% select(-sample, -subjectID)
rownames(small_otu) <- dat_otu$sample
# Recode factor levels by name
#small_otu$Plate <- recode_factor(small_otu$Plate, plate10 = "10", plate11= "11",
# plate12 = "12", `plate3-4` = "34", plate5 = "5",
# plate6 = "6", plate7 = "7", `plate8-9` = "89", prelim = "0")
#small_otu$bodysite <- recode_factor(small_otu$bodysite, rectal = "0", vaginal = "2", control = "3")
small_otu$timepoint <- recode_factor(small_otu$timepoint, "1" = "1", "2" = "2", "idk" = "0")
#small_otu$Plate <- as.numeric(small_otu$Plate)
#small_otu$bodysite <- as.numeric(small_otu$bodysite)
small_otu$timepoint <- as.numeric(small_otu$timepoint)
#head(dat_otu)
small <- dat_otu %>% select(sample, subjectID, Plate, timepoint, bodysite)
small <- small [order(small$sample),]
##
####################
motu_rectal <- dat_otu %>% filter(bodysite == "rectal" & timepoint != "idk" & timepoint != "0") %>% select(-Plate, -timepoint, -subjectID, -bodysite)
motu_rectal <- motu_rectal [order(motu_rectal $sample),]
motu_rectal <- motu_rectal %>% select(-sample)
motu_rectal <- data.matrix(motu_rectal )
motu_vaginal <- dat_otu %>% filter(bodysite == "vaginal" & timepoint != "idk" & timepoint != "0") %>% select(-Plate, -timepoint, -subjectID, -bodysite)
motu_vaginal <- motu_vaginal[order(motu_vaginal$sample),]
motu_vaginal <- motu_vaginal %>% select(-sample)
motu_vaginal <- data.matrix(motu_vaginal)
#rownames(motu) <- dat_otu$sample
```
now that LDM is set up, lets run it
```{r actually running ldm rectal }
##### running the ldm function
# confounders: Parity, BMI, Age, Tobacco Use, Alcohol Use, Marijuana Use, Socioeconomic status (income, % fed poverty level), Cohabitation, EOD scale
# co variates:Birth outcome,BV, UTI, chlamydia, timepoint
## matched LDM for variables that change between timepoints (like timepoint)
form_matched_rectal <- motu_rectal | (Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR + Preg_Chlam + Preg_UTI + Preg_BV) ~ timepoint
res.ldm.matched.rectal.spec.meta <-ldm(formula= form_matched_rectal,
data= med_dat_rectal,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.matched.rectal.spec.meta$n.perm.completed # number of permutations
res.ldm.matched.rectal.spec.meta$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.matched.rectal.spec.meta$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.matched.rectal.spec.meta$p.global.omni # global test p value
res.ldm.matched.rectal.spec.meta$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.matched.rectal.spec.meta=as.data.frame(signif(res.ldm.matched.rectal.spec.meta$p.otu.omni,3))
raw.pvalue.matched.rectal.spec.meta <- cbind(covariate = c("timepoint"), raw.pvalue.matched.rectal.spec.meta)
raw.pvalue.matched.rectal.spec.meta <- raw.pvalue.matched.rectal.spec.meta %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "raw_p_value")
########################
adj.pvalue.matched.rectal.meta.spec=as.data.frame(signif(res.ldm.matched.rectal.spec.meta$q.otu.omni,3))
adj.pvalue.matched.rectal.meta.spec <- cbind(covariate = c("timepoint"), adj.pvalue.matched.rectal.meta.spec)
adj.pvalue.matched.rectal.meta.spec <- adj.pvalue.matched.rectal.meta.spec %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "adj_p_value")
## merge
sig_otu_rec_mta_spec <- full_join(raw.pvalue.matched.rectal.spec.meta, adj.pvalue.matched.rectal.meta.spec, by = c("species", "covariate"))
options(scipen = 50)
only_sig_otu <- sig_otu_rec_mta_spec %>%
filter(adj_p_value < 0.05)
only_sig_otu
tidy_otu_rec <- dat_otu %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = 'species',
values_to = "rel_abun") %>%
filter(rel_abun > 0)
tidy_otu_rec$species <- as.factor(tidy_otu_rec$species)
#write.csv(tidy_otu,"tidy_otu_rel_abun_after_QC.csv")
spec_counts <- tidy_otu_rec %>%
count(species)
spec_counts$number_times_occur <- spec_counts$n
spec_counts <- spec_counts %>% select(-n)
## merge
only_sig_otu_matched <- left_join(only_sig_otu, spec_counts, by = genus)
only_sig_otu_matched
sig_otu_rec_mta_spec
```
Now do the same for vaginal
```{r actually running ldm vaginal}
##### running the ldm function
# confounders: Parity, BMI, Age, Tobacco Use, Alcohol Use, Marijuana Use, Socioeconomic status (income, % fed poverty level), Cohabitation, EOD scale
# co variates:Birth outcome,BV, UTI, chlamydia, timepoint
## matched LDM for variables that change between timepoints (like timepoint)
form_matched_vag <- motu_vaginal | (Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR + Preg_Chlam + Preg_UTI + Preg_BV) ~ timepoint
res.ldm.matched.vag.spec.meta <-ldm(formula= form_matched_vag,
data= med_dat_vag,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.matched.vag.spec.meta$n.perm.completed # number of permutations
res.ldm.matched.vag.spec.meta$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.matched.vag.spec.meta$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.matched.vag.spec.meta$p.global.omni # global test p value
res.ldm.matched.vag.spec.meta$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.matched.vag.spec.meta=as.data.frame(signif(res.ldm.matched.vag.spec.meta$p.otu.omni,3))
raw.pvalue.matched.vag.spec.meta <- cbind(covariate = c("timepoint"), raw.pvalue.matched.vag.spec.meta)
raw.pvalue.matched.vag.spec.meta <- raw.pvalue.matched.vag.spec.meta %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "raw_p_value")
########################
adj.pvalue.matched.vag.meta.spec=as.data.frame(signif(res.ldm.matched.vag.spec.meta$q.otu.omni,3))
adj.pvalue.matched.vag.meta.spec <- cbind(covariate = c("timepoint"), adj.pvalue.matched.vag.meta.spec)
adj.pvalue.matched.vag.meta.spec <- adj.pvalue.matched.vag.meta.spec %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "adj_p_value")
adj.pvalue.matched.vag.meta.spec
## merge
sig_otu_vag_mta_spec <- full_join(raw.pvalue.matched.vag.spec.meta, adj.pvalue.matched.vag.meta.spec, by = c("species"))
options(scipen = 50)
only_sig_otu_vag <- sig_otu_vag_mta_spec %>%
filter(adj_p_value < 0.05)
only_sig_otu_vag
tidy_otu_vag <- dat_otu %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = genus,
values_to = "rel_abun") %>%
filter(rel_abun > 0)
tidy_otu_vag$species <- as.factor(tidy_otu_vag$species)
#write.csv(tidy_otu,"tidy_otu_rel_abun_after_QC.csv")
spec_counts_vag <- tidy_otu_vag %>%
count(species)
spec_counts_vag$number_times_occur <- spec_counts_vag$n
spec_counts_vag <- spec_counts_vag %>% select(-n)
## merge
only_sig_otu_matched_vag <- left_join(only_sig_otu_vag, spec_counts_vag, by = genus)
only_sig_otu_matched_vag
sig_otu_vag_mta_spec
```
Now we want to do things with the output to make sense of what we just processed in LDM.
```{r}
### cluster LDM for variables that remain the same between time points
form_cluster_rectal <- motu_rectal | (timepoint + Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR ) ~ Preg_Chlam + Preg_UTI + Preg_BV
res.ldm.cluster <-ldm(formula= form_cluster_rectal,
data= med_dat_rectal,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.cluster$n.perm.completed # number of permutations 25,000
res.ldm.cluster$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.cluster$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.cluster$p.global.omni # global test p value
res.ldm.cluster$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue=as.data.frame(signif(res.ldm.cluster$p.otu.omni,3))
raw.pvalue <- cbind(covariate = c( "Preg_Chlam", "Preg_UTI", "Preg_BV"), raw.pvalue)
raw.pvalue <- raw.pvalue %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "raw_p_value")
#raw.pvalue
########################
adj.pvalue=as.data.frame(signif(res.ldm.cluster$q.otu.omni,3))
adj.pvalue <- cbind(covariate = c( "Preg_Chlam", "Preg_UTI", "Preg_BV"), adj.pvalue)
adj.pvalue <- adj.pvalue %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "adj_p_value")
## merge
sig_otu_clust <- full_join(raw.pvalue, adj.pvalue, by = c("species", "covariate"))
options(scipen = 50)
only_sig_otu <- sig_otu_clust %>%
filter(adj_p_value < 0.05)
#head(dat_otu)
tidy_otu <- dat_otu %>%
filter(bodysite == "rectal") %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "rel_abun") %>%
filter(rel_abun > 0)
tidy_otu$species <- as.factor(tidy_otu$species)
#write.csv(tidy_otu,"tidy_o-
spec_counts <- tidy_otu %>%
count(species)
spec_counts$number_times_occur <- spec_counts$n
spec_counts <- spec_counts %>% select(-n)
## merge
only_sig_otu <- left_join(only_sig_otu, spec_counts, by = 'species')
only_sig_otu
#spec_counts
write.csv(only_sig_otu, file = "significant_otus_cluster_ldm_rectal.csv")
```
now for vaginal
```{r vag cluster}
### cluster LDM for variables that remain the same between time points
form_cluster_vag <- motu_vaginal | (timepoint + Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR ) ~ Preg_Chlam + Preg_UTI + Preg_BV
res.ldm.cluster.vag <-ldm(formula= form_cluster_vag,
data= med_dat_vag,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.cluster.vag$n.perm.completed # number of permutations 25,000
res.ldm.cluster.vag$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.cluster.vag$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.cluster.vag$p.global.omni # global test p value
res.ldm.cluster.vag$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.vag.cluster=as.data.frame(signif(res.ldm.cluster.vag$p.otu.omni,3))
raw.pvalue.vag.cluster <- cbind(covariate = c( "Preg_Chlam", "Preg_UTI", "Preg_BV"), raw.pvalue.vag.cluster)
raw.pvalue.vag.cluster <- raw.pvalue.vag.cluster %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "raw_p_value")
#raw.pvalue
########################
adj.pvalue.vag.cluster=as.data.frame(signif(res.ldm.cluster.vag$q.otu.omni,3))
adj.pvalue.vag.cluster <- cbind(covariate = c("Preg_Chlam", "Preg_UTI", "Preg_BV"), adj.pvalue.vag.cluster)
adj.pvalue.vag.cluster <- adj.pvalue.vag.cluster %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "adj_p_value")
## merge
sig_otu_clust_vag <- full_join(raw.pvalue.vag.cluster, adj.pvalue.vag.cluster, by = c("species", "covariate"))
options(scipen = 50)
only_sig_otu_vag <- sig_otu_clust_vag %>%
filter(adj_p_value < 0.05)
#head(dat_otu)
tidy_otu_vag_clust <- dat_otu %>%
filter(bodysite == "vaginal") %>%
pivot_longer(cols = Gardnerella_vaginalis:last_col(),
names_to = "species",
values_to = "rel_abun") %>%
filter(rel_abun > 0)
tidy_otu_vag_clust$species <- as.factor(tidy_otu_vag_clust$species)
#write.csv(tidy_otu,"tidy_o-
spec_counts_vag_clust <- tidy_otu_vag_clust %>%
count(species)
spec_counts_vag_clust$number_times_occur <- spec_counts_vag_clust$n
spec_counts_vag_clust <- spec_counts_vag_clust %>% select(-n)
## merge
only_sig_otu_vag <- left_join(only_sig_otu_vag, spec_counts_vag_clust, by = 'species')
only_sig_otu_vag
#spec_counts
#spec_counts_vag_clust # Gammapapillomavirus_6
write.csv(only_sig_otu_vag, file = "significant_otus_cluster_ldm_vaginal_species.csv")
```
### test at genus level
```{r read in genus dat}
gendat <- read.csv("../biob_outputs/merged_abundance_table.txt", sep = "\t")
gendat <- gendat %>%
filter(ID != "#SampleID") %>%
separate(col = ID,
into = c("kindgom", "phylum", "class", "order", "family", "genus", "species", "strain"),
sep = "[|].__",
remove = FALSE,
fill = "right")
gendat_piv1 <- gendat %>%
pivot_longer(cols = Blank.W34P.USPD16084006.A57_profile:last_col(),
names_to = "sample",
values_to = "rel_abun") %>%
filter(!is.na(genus) & is.na(species)) %>%
group_by(sample, genus) %>%
mutate(rel_abun = sum(as.numeric(rel_abun))) %>%
ungroup()
options(scipen = 50)
gendat_piv <- gendat_piv1 %>%
select(sample, genus, rel_abun) %>%
na.omit() %>%
filter(rel_abun >= 1) %>%
separate(col = sample, remove = FALSE, into = c("sample", "timepoint", "trash"),sep = "[.]") %>%
filter(timepoint == "1" | timepoint == "2") %>%
mutate(bodysite = ifelse(grepl("Rec", trash), "rectal",
ifelse(grepl("Vag", trash), "vaginal",
"other"))) %>%
filter(bodysite != "other") %>%
mutate(sampleID = paste0(sample, ".", timepoint,".", trash))
head(gendat_piv)
gendat_piv <- gendat_piv[order(gendat_piv$sample),]
gendat_otu <- gendat_piv %>%
filter(timepoint != "PP") %>%
select(-timepoint, -trash, - bodysite, -sample)%>%
pivot_wider(id_cols = sampleID,
names_from = genus,
values_from = rel_abun,
values_fill = 0)
gendat_otu_rectal <- gendat_piv %>%
filter(bodysite == "rectal" & timepoint != "PP") %>%
select(-timepoint, -trash, - bodysite, -sample)%>%
pivot_wider(id_cols = sampleID,
names_from = genus,
values_from = rel_abun,
values_fill = 0)
gendat_otu_vag <- gendat_piv %>%
filter(bodysite == "vaginal") %>%
select(-timepoint, -trash, - bodysite, -sample)%>%
pivot_wider(id_cols = sampleID,
names_from = genus,
values_from = rel_abun,
values_fill = 0)
```
now set up for LDM. we need the motu and to confirm the genuses are in the same order. sample x genus matrix.
```{r set up for genus LDM}
motu_gen <- gendat_otu
motu_gen <- motu_gen[order(motu_gen$sampleID),]
motu_gen <- motu_gen %>% select(-sampleID)
motu_gen <- data.matrix(motu_gen)
rownames(motu_gen) <- gendat_otu$sampleID
#motu <- as.integer(motu)
## clean it first
med_dat_rectal <- med_dat_rectal %>% filter(timepoint != "PP" )
med_dat_rectal <- med_dat_rectal %>% mutate(sampleID_int = str_replace(sample, "-", ".")) %>% mutate(sampleID = str_replace(sampleID_int, "-", ".")) %>%
select(sampleID, everything())
## med dat sample IDs in gen dat
med_dat_rectal <- med_dat_rectal %>% filter(sampleID %in% gendat_otu_rectal$sampleID)
gendat_otu_rectal <- gendat_otu_rectal %>% filter(sampleID %in% med_dat_rectal$sampleID)
# add to motu
motu_gen_rectal <- gendat_otu_rectal
motu_gen_rectal<- motu_gen_rectal[order(motu_gen_rectal$sampleID),]
motu_gen_rectal <- motu_gen_rectal %>% select(-sampleID)
motu_gen_rectal <- data.matrix(motu_gen_rectal)
#rownames(motu_gen) <- gendat_otu$sampleID
## repeat for vag
## clean it first
med_dat_vag <- med_dat_vag %>% filter(timepoint != "PP" & timepoint != "0" & timepoint != "idk")
med_dat_vag <- med_dat_vag%>% mutate(sampleID_int = str_replace(sample, "-", ".")) %>% mutate(sampleID = str_replace(sampleID_int, "-", ".")) %>%
select(sampleID, everything())
## med dat sample IDs in gen dat
med_dat_vag <- med_dat_vag %>% filter(sampleID %in% gendat_otu_vag$sampleID)
gendat_otu_vag <- gendat_otu_vag %>% filter(sampleID %in% med_dat_vag$sampleID)
motu_gen_vag <- gendat_otu_vag
motu_gen_vag<- motu_gen_vag[order(motu_gen_vag$sampleID),]
motu_gen_vag <- motu_gen_vag %>% select(-sampleID)
motu_gen_vag <- data.matrix(motu_gen_vag)
```
Now the cluster LDM
```{r gen ldm cluster}
form_cluster_gen_rectal <- motu_gen_rectal | (timepoint + Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR ) ~ Preg_Chlam + Preg_UTI + Preg_BV
res.ldm.cluster.gen.rectal <-ldm(formula= form_cluster_gen_rectal,
data= med_dat_rectal,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.cluster.gen.rectal$n.perm.completed # number of permutations 25,000
res.ldm.cluster.gen.rectal$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.cluster.gen.rectal$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.cluster.gen.rectal$p.global.omni # global test p value
res.ldm.cluster.gen.rectal$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.gen.rectal=as.data.frame(signif(res.ldm.cluster.gen.rectal$p.otu.omni,3))
raw.pvalue.gen.rectal <- cbind(covariate = c("Preg_Chlam", "Preg_UTI", "Preg_BV"), raw.pvalue.gen.rectal)
raw.pvalue.gen.rectal <- raw.pvalue.gen.rectal %>%
pivot_longer(cols = Bifidobacterium:last_col(),
names_to = "genus",
values_to = "raw_p_value")
raw.pvalue.gen.rectal
########################
adj.pvalue.gen.rectal=as.data.frame(signif(res.ldm.cluster.gen.rectal$q.otu.omni,3))
adj.pvalue.gen.rectal <- cbind(covariate = c("Preg_Chlam", "Preg_UTI", "Preg_BV"), adj.pvalue.gen.rectal)
adj.pvalue.gen.rectal <- adj.pvalue.gen.rectal %>%
pivot_longer(cols = Bifidobacterium:last_col(),
names_to = "genus",
values_to = "adj_p_value")
## merge
sig_otu_clust_gen_rec <- full_join(raw.pvalue.gen.rectal, adj.pvalue.gen.rectal, by = c("genus", "covariate"))
options(scipen = 50)
only_sig_otu_gen_rec <- sig_otu_clust_gen_rec %>%
filter(adj_p_value < 0.05)
tidy_otu_gen_rec <- gendat_otu_rectal %>%
pivot_longer(cols = Bifidobacterium:last_col(),
names_to = "genus",
values_to = "rel_abun") %>%
filter(rel_abun > 0)
tidy_otu_gen_rec$genus <- as.factor(tidy_otu_gen_rec$genus)
#write.csv(tidy_otu,"tidy_o-
gen_counts_rec <- tidy_otu_gen_rec %>%
count(genus)
gen_counts_rec$number_times_occur <- gen_counts_rec$n
gen_counts_rec <- gen_counts_rec %>% select(-n)
## merge
only_sig_otu_gen_rec <- left_join(only_sig_otu_gen_rec, gen_counts_rec, by = "genus")
only_sig_otu_gen_rec
write.csv(only_sig_otu_gen_rec, file = "significant_otus_cluster_ldm_genus_rectal.csv")
```
copy for vaginal body site.
```{r gen ldm cluster vaginal}
form_cluster_gen_vag <- motu_gen_vag | (timepoint + Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR ) ~ Preg_Chlam + Preg_UTI + Preg_BV
res.ldm.cluster.gen.vag <-ldm(formula= form_cluster_gen_vag,
data= med_dat_vag,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.cluster.gen.vag$n.perm.completed # number of permutations 25,000
res.ldm.cluster.gen.vag$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.cluster.gen.vag$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.cluster.gen.vag$p.global.omni # global test p value
res.ldm.cluster.gen.vag$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.gen.vag=as.data.frame(signif(res.ldm.cluster.gen.vag$p.otu.omni,3))
raw.pvalue.gen.vag <- cbind(covariate = c("Preg_Chlam", "Preg_UTI", "Preg_BV"), raw.pvalue.gen.vag)
raw.pvalue.gen.vag <- raw.pvalue.gen.vag %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "raw_p_value")
########################
adj.pvalue.gen.vag=as.data.frame(signif(res.ldm.cluster.gen.vag$q.otu.omni,3))
adj.pvalue.gen.vag <- cbind(covariate = c("Preg_Chlam", "Preg_UTI", "Preg_BV"), adj.pvalue.gen.vag)
adj.pvalue.gen.vag <- adj.pvalue.gen.vag %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "adj_p_value")
## merge
sig_otu_clust_gen_vag <- full_join(raw.pvalue.gen.vag, adj.pvalue.gen.vag, by = c("genus", "covariate"))
options(scipen = 50)
only_sig_otu_gen_vag <- sig_otu_clust_gen_vag %>%
filter(adj_p_value < 0.05)
tidy_otu_gen_vag <- gendat_otu_vag %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "rel_abun") %>%
filter(rel_abun > 0)
tidy_otu_gen_vag$genus <- as.factor(tidy_otu_gen_vag$genus)
#write.csv(tidy_otu,"tidy_o-
gen_counts_vag <- tidy_otu_gen_vag %>%
count(genus)
gen_counts_vag$number_times_occur <- gen_counts_vag$n
gen_counts_vag <- gen_counts_vag %>% select(-n)
## merge
only_sig_otu_gen_vag <- left_join(only_sig_otu_gen_vag, gen_counts_vag, by = "genus")
only_sig_otu_gen_vag
write.csv(only_sig_otu_gen_vag, file = "significant_otus_cluster_ldm_genus_vaginal.csv")
```
Now do this for the matched LMD
```{r genus matched LDM rectal}
## matched LDM for variables that change between timepoints (like timepoint)
form_matched_gen_rectal <- motu_gen_rectal | (Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR + Preg_Chlam + Preg_UTI + Preg_BV) ~ timepoint
res.ldm.matched.gen.rectal <-ldm(formula= form_matched_gen_rectal,
data= med_dat_rectal,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.matched.gen.rectal$n.perm.completed # number of permutations
res.ldm.matched.gen.rectal$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.matched.gen.rectal$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.matched.gen.rectal$p.global.omni # global test p value
res.ldm.matched.gen.rectal$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.matched.gen=as.data.frame(signif(res.ldm.matched.gen.rectal$p.otu.omni,3))
raw.pvalue.matched.gen <- cbind(covariate = c("timepoint"), raw.pvalue.matched.gen)
raw.pvalue.matched.gen <- raw.pvalue.matched.gen %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "raw_p_value")
########################
adj.pvalue.matched.gen=as.data.frame(signif(res.ldm.matched.gen.rectal$q.otu.omni,3))
adj.pvalue.matched.gen <- cbind(covariate = c("timepoint"), adj.pvalue.matched.gen)
adj.pvalue.matched.gen <- adj.pvalue.matched.gen %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "adj_p_value")
## merge
sig_otu_GEN_MATCHED <- full_join(raw.pvalue.matched.gen, adj.pvalue.matched.gen, by = c("genus", "covariate"))
options(scipen = 50)
only_sig_otu_matched_gen <- sig_otu_GEN_MATCHED %>%
filter(adj_p_value < 0.05)
only_sig_otu_matched_gen
## merge
only_sig_otu_matched_gen <- left_join(only_sig_otu_matched_gen, gen_counts, by = "genus")
only_sig_otu_matched_gen
adj.pvalue.matched.gen %>% filter(adj_p_value < 0.05)
```
And copy the same exact thing for vaginal matched LDM at genus level
```{r genus matched LDM vag}
## matched LDM for variables that change between timepoints (like timepoint)
form_matched_gen_vag <- motu_gen_vag | (Plate + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR + Preg_Chlam + Preg_UTI + Preg_BV) ~ timepoint
res.ldm.matched.gen.vag <-ldm(formula= form_matched_gen_vag,
data= med_dat_vag,
cluster.id = "subjectID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints
res.ldm.matched.gen.vag$n.perm.completed # number of permutations
res.ldm.matched.gen.vag$global.tests.stopped # did the global tests neet the stopping criteria?
res.ldm.matched.gen.vag$otu.tests.stopped # did the otu-specific tests neet the stopping criteria?
res.ldm.matched.gen.vag$p.global.omni # global test p value
res.ldm.matched.gen.vag$detected.otu.omni # signiticant OTUs detected
### look at significant OTUs
raw.pvalue.matched.gen.vag=as.data.frame(signif(res.ldm.matched.gen.vag$p.otu.omni,3))
raw.pvalue.matched.gen.vag <- cbind(covariate = c("timepoint"), raw.pvalue.matched.gen.vag)
raw.pvalue.matched.gen.vag <- raw.pvalue.matched.gen.vag %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "raw_p_value")
########################
adj.pvalue.matched.gen.vag =as.data.frame(signif(res.ldm.matched.gen.vag$q.otu.omni,3))
adj.pvalue.matched.gen.vag <- cbind(covariate = c("timepoint"), adj.pvalue.matched.gen.vag )
adj.pvalue.matched.gen.vag <- adj.pvalue.matched.gen.vag %>%
pivot_longer(cols = Gardnerella:last_col(),
names_to = "genus",
values_to = "adj_p_value")
## merge
sig_otu_GEN_MATCHED_vag <- full_join(raw.pvalue.matched.gen.vag, adj.pvalue.matched.gen.vag , by = c("genus", "covariate"))
options(scipen = 50)
only_sig_otu_matched_gen_vag <- sig_otu_GEN_MATCHED_vag %>%
filter(adj_p_value < 0.05)
## merge
only_sig_otu_matched_gen_vag <- left_join(only_sig_otu_matched_gen_vag, gen_counts, by = "genus")
only_sig_otu_matched_gen_vag
adj.pvalue.matched.gen.vag %>% filter(adj_p_value < 0.05)
```
## 16s vs metagenomic comparison
now we need to read in the 16S data and do this
```{r read in 16s data}
sixteens <- read.csv("../biob_outputs/16S/silva_taxonomy_raw_species.csv")
rownames(sixteens) <- sixteens$index
indx <- grepl('.s__', colnames(sixteens))
spec16 <- sixteens[indx]
spec16$subjectID <- sixteens$Subject_ID
spec16$sampleID <- sixteens$index
spec16$pcrplate <- sixteens$pcrplateID
spec16 <- spec16 %>%
mutate(bodysite = ifelse(grepl("Rec", sampleID), "rectal",
ifelse(grepl("Vag", sampleID), "vaginal", "other"))) %>%
mutate(timepoint = ifelse(grepl(".1.", sampleID), "1",
ifelse(grepl(".2.", sampleID), "2",
ifelse(grepl(".V3.", sampleID), "3", "other")))) %>%
relocate(subjectID, sampleID, pcrplate, bodysite, timepoint)
table(spec16$timepoint)
table(spec16$bodysite) # remove controls, which make up all the others
spec16 <- spec16 %>%
filter(!bodysite == "other")
## now filter so that it is only the subect IDs that are in the metagenome data.
# make list of metagenome data subject ids
id_list <- med_dat$subjectid
spec16 <- subset(spec16, subjectID %in% id_list)
```
Now we have a dataframe, `spec16`, that contains the 16S results from the same samples. Let's run the LDM on these results. Consider as well that I ONLY have the rectal samples, not vaginal, here.
```{r set up for sixteen2 LDM}
## # Transform data to proportions to match compositional aspect of metagenome data
#ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
motu_16s <- spec16 %>%
filter(!grepl('NA', sampleID)) %>%
filter(timepoint =="1" | timepoint == "2") %>%
select( -pcrplate, -bodysite, -timepoint)
incl_list <- motu_16s$subjectID
incl <- as.data.frame(motu_16s$subjectID)
motu_16s <- motu_16s[order(motu_16s$subjectID),]
rownames(motu_16s) <- motu_16s$sampleID
motu_16s <- motu_16s %>% select(-subjectID, -sampleID)
motu_16s <- round((motu_16s/rowSums(motu_16s))*100, 3)
motu_16s[is.na(motu_16s)] = 0
motu_16s <- motu_16s[rowSums(motu_16s[])>0,]
motu_16s <- motu_16s[as.logical(rowSums(motu_16s != 0)), ]
incl_list2 <- rownames(motu_16s)
#motu_16s <- gendat_otu
#motu_16s <- motu_16s %>% select(-sampleID)
motu_16s <- data.matrix(motu_16s)
#motu <- as.integer(motu)
## now make sure meddate is correct
med_dat_16_b4 <- med_data %>% filter(bodysite != "vaginal" ) %>% arrange(subjectID)
#med_data
min <- sixteens %>%
select(index, pcrplateID , Subject_ID, SubjectGroup, PrenatalVisit) %>%
arrange(Subject_ID)
med_dat_16 <- merge(min, med_dat_16_b4, by.x = "Subject_ID", by.y = "subjectID", all.x = FALSE, all.y = TRUE)
incl <- incl %>%
separate(col = `motu_16s$subjectID` ,into = c("trash", "subjectID", "timepoint","bodycode"), sep = "[.]")
med_dat_16 <- med_dat_16 %>%
#filter(subjectID.y %in% incl$trash) %>%
filter(index %in% incl_list2)
med_dat_16
#%>%
# filter(Sequencing_16S == "1" & Sequencing_WGS == "1")%>%
# filter(PrenatalVisit != 3)
#med_dat_16 # subjectIid
med_dat_16 <- med_dat_16[!duplicated(med_dat_16[,c('index')]),] #'Subject_ID', 'sample'
motu_16s_subset <- motu_16s[rownames(motu_16s) %in% med_dat_16$index, ]
str(motu_16s) # 440
str(med_dat_16) # 432
```
Now the cluster LDM to test the effect of timepoint on the mb
```{r sixteensldm cluster}
form_cluster_16s <- motu_16s_subset| (PrenatalVisit + pcrplateID + age + income + parity+ TobaccoUse_MR + AlcoholUse_MR + MarijuanaUse_MR ) ~ Preg_Chlam + Preg_UTI + Preg_BV
res.ldm.cluster.16s <-ldm(formula= form_cluster_16s,
data= med_dat_16,
cluster.id = "Subject_ID",
seed=11062015,
perm.within.type="free", perm.between.type="none") # matched sets for diff between timepoints