-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PREDICT extra records and higher taxonomy added
- Loading branch information
Colin J. Carlson
committed
Jul 19, 2021
1 parent
42fabe5
commit affc13b
Showing
21 changed files
with
37,461 additions
and
23,204 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
|
||
if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')} | ||
|
||
library(tidyverse) | ||
library(magrittr) | ||
library(lubridate) | ||
library(naniar) | ||
|
||
# Load the two PREDICT exports: the main data release and the PCR test table.
predict.1 <- read_csv("~/Github/ept/PredictData (2).csv")
predict.2 <- read_csv("~/Github/ept/PREDICT_PCR_Tests.csv")

# Normalise host and virus names in the main release so they can serve as
# join keys further down: drop the first " *" and " cf." markers from the
# host name, drop the first "strain of " prefix from the virus name, and
# lower-case both columns.
# NOTE(review): " cf." is a regular expression, so the dot matches any
# character; left unchanged so the keys match those built for predict.2.
predict.1 <- predict.1 %>%
  rename(Host = "Species Scientific Name Based on Field Morphology") %>%
  distinct() %>%
  mutate(
    Host = str_to_lower(str_replace(str_replace(Host, " \\*", ""), " cf.", "")),
    Virus = str_to_lower(str_replace(Virus, "strain of ", ""))
  )
|
||
# Clean the PCR test table: keep rows with a virus name, flag uncertain host
# identifications, strip the " cf." marker, de-duplicate, and lower-case the
# join keys.
# HostFlagID has to be computed BEFORE " cf." is removed from Host; computing
# it afterwards (as the previous version did) leaves nothing to detect.
# NOTE(review): both patterns are regular expressions, so the dot matches any
# character; they are kept as-is so the keys stay aligned with predict.1.
predict.2 <- predict.2 %>%
  rename(Host = ScientificName) %>%
  filter(!is.na(Virus)) %>%
  mutate(HostFlagID = str_detect(Host, "cf."),
         Host = str_replace(Host, " cf.", "")) %>%
  distinct() %>%
  mutate(Host = str_to_lower(Host), Virus = str_to_lower(Virus))

# Only grab the 50 or so records the original data are missing
predict.raw <- anti_join(predict.2, predict.1, by = c("Host", "Virus"))

# A couple sanity checks
# table(predict.raw$TestResult)
# table(predict.raw$TestType)

predict.raw <- predict.raw %>%
  select(Host,
         Virus,
         PREDICT_SampleID,
         GenbankAccessionNumber,
         HostFlagID) %>%

  # Rename the columns
  rename(NCBIAccession = "GenbankAccessionNumber") %>%

  # Collapse the Genbank info: one row per host/virus/sample, with all
  # accession numbers joined into a single comma-separated string
  group_by(Host, Virus, PREDICT_SampleID, HostFlagID) %>%
  summarize(NCBIAccession = str_c(NCBIAccession, collapse = ", ")) %>%
  # summarize() only peels off the last grouping level; drop the rest so the
  # steps below (and later scripts) work on an ungrouped table
  ungroup() %>%
  distinct() %>%

  # Clean up the host info: remove any " cf." marker still left in the name,
  # and keep only the part of the virus name before the first "("
  mutate(Host = str_replace(Host, " cf.", ""),
         Virus = word(Virus, 1, sep = "\\(")) %>%

  # Keep the column order the previous version produced
  select(Host, Virus, PREDICT_SampleID, NCBIAccession, HostFlagID)
|
||
# Let's do some higher classifications to this | ||
|
||
# Reduce a name to its first two space-separated words.
#
# x: a single character string.
# Returns `x` unchanged when its first word is NA; otherwise the non-missing
# words among the first two, joined by a single space.
twowords <- function(x) {
  first_two <- word(x, 1:2, sep = " ")
  if (is.na(first_two[1])) {
    return(x)
  }
  present <- first_two[!is.na(first_two)]
  str_c(present, collapse = " ")
}
# Resolve host taxonomy for every distinct host name (trimmed to its first
# two words), then attach the result to the records. The raw name is kept
# under HostOriginal.
# sapply() is kept on purpose: it returns a character vector named by the
# input strings, which is exactly what hdict() received before.
host.names <- sapply(unique(predict.raw$Host), twowords)
host.tax <- hdict(host.names)
# No `by` is given, so the join uses whatever columns the two tables share.
predict.raw <- left_join(rename(predict.raw, HostOriginal = "Host"), host.tax)
|
||
# Now the viruses | ||
|
||
# First some cleaning | ||
|
||
# Harmonise two virus names before the taxonomy lookup.
predict.raw <- predict.raw %>%
  mutate(Virus = recode(Virus,
                        "influenza a" = "influenza a virus",
                        "alpha coronavirus nl63" = "coronavirus nl63"))

# Sorted vector of the distinct virus names to resolve.
ncbi.names <- sort(unique(pull(predict.raw, Virus)))

# Taxonomy table for those names, built by the project helper vdict().
ncbi.tax <- vdict(ncbi.names)
|
||
# Manual taxonomy corrections for names the lookup cannot place.
# Every row selection goes through which(), so NA values in the matched
# column are ignored rather than reaching the subscripted assignment, and a
# patch with no matching row is skipped.

# PREDICT placeholder names: family and order follow from the name itself.
family_patches <- list(
  list(pattern = "predict_cov", family = "coronaviridae", order = "nidovirales"),
  list(pattern = "predict_pmv", family = "paramyxoviridae", order = "mononegavirales")
)
for (patch in family_patches) {
  rows <- which(str_detect(ncbi.tax$Virus, patch$pattern))
  if (length(rows) > 0) {
    ncbi.tax[rows, "VirusFamily"] <- patch$family
    ncbi.tax[rows, "VirusOrder"] <- patch$order
  }
}

# Individual coronavirus isolates: genus is set by hand; all of them belong
# to coronaviridae / nidovirales.
genus_patches <- list(
  list(virus = "philippines/diliman1525g2/2008", genus = "betacoronavirus"),
  list(virus = "kenya bat coronavirus/btky66/65/63/60", genus = "alphacoronavirus"),
  list(virus = "kenya bat coronavirus/btky83/59/58", genus = "alphacoronavirus")
)
for (patch in genus_patches) {
  rows <- which(ncbi.tax$Virus == patch$virus)
  if (length(rows) > 0) {
    ncbi.tax[rows, "VirusGenus"] <- patch$genus
    ncbi.tax[rows, "VirusFamily"] <- "coronaviridae"
    ncbi.tax[rows, "VirusOrder"] <- "nidovirales"
  }
}

# "predict_pmv-95" can be resolved to an unrelated organism; when that
# happens, overwrite the whole row with an unresolved paramyxovirus entry.
# The replacement is a list rather than c(...): c() would coerce NA and FALSE
# to character and change the types of the columns they are written into.
# NOTE(review): the eight values are positional and assume ncbi.tax has
# exactly eight columns in the order vdict() returns them - confirm.
# NOTE(review): the match on "Atractiella rhizophila" is case-sensitive while
# the other names compared in this section are lower-case - confirm.
pmv95 <- which(ncbi.tax$VirusOriginal == "predict_pmv-95" &
                 ncbi.tax$Virus == "Atractiella rhizophila")
if (length(pmv95) > 0) {
  ncbi.tax[pmv95, ] <- list("predict_pmv-95", NA, FALSE, "predict_pmv-95", NA,
                            "paramyxoviridae", "mononegavirales", "monjiviricetes")
}
|
||
# Attach the virus taxonomy; the raw virus name is kept as VirusOriginal and
# is also the join key.
predict.raw <- left_join(rename(predict.raw, VirusOriginal = "Virus"),
                         ncbi.tax,
                         by = "VirusOriginal")

# Finally, grab the date info
# Read the sampled-animals table, keep only the ID and sample date, and split
# the date into year / month / day columns.
meta <- read_csv("~/Github/ept/PREDICT_Animals_Sampled.csv") %>%
  rename(PREDICT_SampleID = PREDICT_IndividualID) %>%
  select(PREDICT_SampleID, SampleDate) %>%
  mutate(CollectionYear = year(SampleDate),
         CollectionMonth = month(SampleDate),
         CollectionDay = day(SampleDate)) %>%
  select(-SampleDate)

# Join the collection dates onto the records (no `by` is given, so the join
# uses the columns the two tables share), then drop the sample ID.
predict.raw <- predict.raw %>%
  left_join(meta) %>%
  select(-PREDICT_SampleID)

write_csv(predict.raw, "Intermediate/Unformatted/PREDICTPCRUnformatted.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
|
||
library(tidyverse) | ||
|
||
# NOTE(review): this script reads the "Supplement" unformatted file but
# writes "PREDICTPCRFormatted.csv" at the end - confirm the input path.
predict <- read_csv("Intermediate/Unformatted/PREDICTSupplementUnformatted.csv")

# Zero-row template listing every output column with its type. Binding the
# data onto it fixes the column order and adds any column the data lacks.
temp <- data.frame(
  # Resolved names and NCBI identifiers
  Host = character(), Virus = character(),
  HostTaxID = numeric(), VirusTaxID = numeric(),
  HostNCBIResolved = logical(), VirusNCBIResolved = logical(),
  # Host taxonomy
  HostGenus = character(), HostFamily = character(),
  HostOrder = character(), HostClass = character(),
  HostOriginal = character(), HostSynonyms = character(),
  # Virus taxonomy
  VirusGenus = character(), VirusFamily = character(),
  VirusOrder = character(), VirusClass = character(),
  VirusOriginal = character(),
  # Quality flags
  HostFlagID = logical(), VirusFlagContaminant = logical(),
  # Detection and provenance
  DetectionMethod = character(), DetectionOriginal = character(),
  Database = character(), DatabaseVersion = character(),
  PublicationYear = numeric(), ReferenceText = character(),
  PMID = numeric(), NCBIAccession = character(),
  # Release and collection dates
  ReleaseYear = numeric(), ReleaseMonth = numeric(), ReleaseDay = numeric(),
  CollectionYear = numeric(), CollectionMonth = numeric(),
  CollectionDay = numeric(),
  stringsAsFactors = FALSE
)
|
||
# Grab the VirusClass values | ||
|
||
# Look up the NCBI class for each distinct, non-missing viral order found in
# `predict`, then join the result back onto the records.
virus_orders <- as.character(unique(na.omit(predict$VirusOrder)))

# Return the name of the rank-"class" entry in the NCBI lineage of one order
# name, or NA_character_ when there is no usable lineage or no "class" rank.
# taxize functions are called with an explicit namespace because this script
# never attaches taxize.
lookup_virus_class <- function(order_name) {
  lineage <- taxize::classification(taxize::get_uid(order_name), db = "ncbi")[[1]]
  # presumably a failed lookup yields a non-data-frame placeholder here -
  # TODO confirm against the taxize documentation
  if (!is.data.frame(lineage)) {
    return(NA_character_)
  }
  class_name <- lineage$name[which(lineage$rank == "class")]
  if (length(class_name) == 0) {
    return(NA_character_)
  }
  class_name[[1]]
}

# vapply() over the order names also handles the case of no orders at all,
# where a 1:nrow() loop would run with indices 1 and 0.
classer <- data.frame(
  VirusOrder = virus_orders,
  VirusClass = vapply(virus_orders, lookup_virus_class, character(1),
                      USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)

# No `by` is given, so the join uses the columns the two tables share.
predict <- left_join(predict, classer)
|
||
### Format | ||
|
||
# Taxonomic name columns that are stored in lower case.
name_cols <- c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass",
               "Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass")

# Bind the data onto the empty template, lower-case the name columns, and
# fill in the constant provenance fields for this source.
# NOTE(review): DatabaseVersion says "June28..." while ReleaseMonth is 8 -
# confirm which one is intended.
predict <- bind_rows(temp, predict) %>%
  mutate_at(name_cols, tolower) %>%
  mutate(DetectionMethod = "PCR/Sequencing",
         DetectionOriginal = "PREDICT",
         Database = "PREDICT",
         DatabaseVersion = "June282021PCRTests",
         ReleaseYear = 2021,
         ReleaseMonth = 8,
         ReleaseDay = 28)

write_csv(predict, "Intermediate/Formatted/PREDICTPCRFormatted.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
|
||
# Stack the two formatted PREDICT tables.
p1 <- read_csv("Intermediate/Formatted/PREDICTMainFormatted.csv")
p2 <- read_csv("Intermediate/Formatted/PREDICTPCRFormatted.csv")
predict <- bind_rows(p1, p2)

# Name prefixes that precede "PREDICT" in the spillover rankings and have to
# be stripped, in the order they are applied.
# "Mammastrovirus" covers one typo in the rankings file.
# NOTE(review): "Polyomaovirus" may itself be a typo for "Polyomavirus" -
# confirm against the rankings file.
predict_prefixes <- c("Adeno-Associated Virus", "Adenovirus", "Arenavirus",
                      "Coronavirus", "Lentivirus", "Mamastrovirus",
                      "Mammastrovirus", "Paramyxovirus", "Picobirnavirus",
                      "Polyomaovirus", "Posavirus", "Poxvirus")

# Replace the first "<prefix> PREDICT" in each name with "PREDICT".
strip_prefix <- function(species, prefix) {
  str_replace(species, str_c(prefix, " PREDICT"), "PREDICT")
}

# Read the rankings, turn the first "PREDICT " into "PREDICT_", strip the
# prefixes one after another, and keep species/genus pairs whose genus is
# not "Unassigned".
spill <- read_csv("~/Github/ept/SpilloverRankings.csv")
spill <- spill %>%
  mutate(`Virus Species` = str_replace(`Virus Species`, "PREDICT ", "PREDICT_")) %>%
  mutate(`Virus Species` = Reduce(strip_prefix, predict_prefixes, `Virus Species`)) %>%
  select(`Virus Species`, `Virus Genus`) %>%
  filter(!(`Virus Genus`=="Unassigned"))
|
||
# Copy the genus from the spillover rankings onto every PREDICT record whose
# virus name (without "strain of ", lower-cased) equals the ranking's species
# name (lower-cased). Rows are processed in order, so a later ranking row
# overwrites an earlier one for the same name.

# The lowered key depends only on `predict`, so build it once.
virus_key <- str_to_lower(str_replace(predict$VirusOriginal, "strain of ", ""))

# seq_len() makes the loop a no-op when `spill` has no rows.
for (i in seq_len(nrow(spill))) {
  # which() drops NA comparisons, so missing names never reach the assignment
  rows <- which(virus_key == str_to_lower(spill$`Virus Species`[i]))
  if (length(rows) > 0) {
    # Take the genus straight from row i as a plain string; subsetting the
    # tibble by species name returns a one-column table rather than a vector
    predict$VirusGenus[rows] <- str_to_lower(spill$`Virus Genus`[i])
  }
}
|
||
# Two strains get their genus assigned by hand.
# The Hipposideros one can be reconstructed from the predict.2 object (the
# PCR Tests) but NOT the HealthMap copy.
beta_strains <- c("strain of Eidolon bat coronavirus",
                  "strain of Bat coronavirus Hipposideros")
predict$VirusGenus[predict$VirusOriginal %in% beta_strains] <- "betacoronavirus"

write_csv(predict, "Intermediate/Formatted/PREDICTAllFormatted.csv")
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.