Skip to content

Commit

Permalink
PREDICT extra records and higher taxonomy added
Browse files Browse the repository at this point in the history
  • Loading branch information
Colin J. Carlson committed Jul 19, 2021
1 parent 42fabe5 commit affc13b
Show file tree
Hide file tree
Showing 21 changed files with 37,461 additions and 23,204 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ predict %>% filter(is.na(Host)) %>% pull(HostOriginal) %>% unique()

predict %<>% select(-VirusIntermediate)

write_csv(predict, "Intermediate/Unformatted/PREDICTUnformatted.csv")
write_csv(predict, "Intermediate/Unformatted/PREDICTMainUnformatted.csv")

# ##### Double check the NCBItaxonomy on the viruses I guess
# # test %>% filter(is.na(HostGenus))
5 changes: 3 additions & 2 deletions Code/02_2b_Format PREDICT.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,11 @@ predict %<>% left_join(classer)

### Format

predict <- bind_rows(temp, predict)
predict <- bind_rows(temp, predict %>% mutate(HostTaxID = as.double(HostTaxID),
VirusTaxID = as.double(VirusTaxID)))

predict %<>% mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass",
"Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass"),
tolower)

write_csv(predict, "Intermediate/Formatted/PREDICTFormatted.csv")
write_csv(predict, "Intermediate/Formatted/PREDICTMainFormatted.csv")
115 changes: 115 additions & 0 deletions Code/02_2c_Digest PREDICT PCR.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@

if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}

library(tidyverse)
library(magrittr)
library(lubridate)
library(naniar)

# Two PREDICT exports are compared here: predict.1 is the original release and
# predict.2 is the PCR test log. Records present in predict.2 but absent from
# predict.1 (matched on cleaned host and virus names) end up in predict.raw.
predict.1 <- read_csv("~/Github/ept/PredictData (2).csv")
predict.2 <- read_csv("~/Github/ept/PREDICT_PCR_Tests.csv")

# Clean the original release: drop the " *" and " cf." markers from host names
# and the "strain of " prefix from virus names, then lower-case both.
predict.1 %<>%
  rename(Host = "Species Scientific Name Based on Field Morphology") %>%
  distinct() %>%
  mutate(Host = str_replace(Host, " \\*", "")) %>%
  # fixed() matches a literal " cf."; as a regex the "." matched any character
  mutate(Host = str_replace(Host, fixed(" cf."), "")) %>%
  mutate(Virus = str_replace(Virus, "strain of ", "")) %>%
  mutate(Host = str_to_lower(Host), Virus = str_to_lower(Virus))

# Clean the PCR log the same way. HostFlagID has to be recorded BEFORE the
# " cf." marker is stripped; flagging afterwards can never find it.
predict.2 %<>%
  rename(Host = ScientificName) %>%
  filter(!is.na(Virus)) %>%
  mutate(HostFlagID = str_detect(Host, fixed(" cf.")),
         Host = str_replace(Host, fixed(" cf."), "")) %>%
  distinct() %>%
  mutate(Host = str_to_lower(Host), Virus = str_to_lower(Virus))

# Only grab the 50 or so records the original data are missing
predict.raw <- anti_join(predict.2, predict.1, by = c("Host", "Virus"))

# A couple sanity checks
# table(predict.raw$TestResult)
# table(predict.raw$TestType)

predict.raw %<>%
  select(Host,
         Virus,
         HostFlagID,
         PREDICT_SampleID,
         GenbankAccessionNumber) %>%

  # Rename the columns
  rename(NCBIAccession = "GenbankAccessionNumber") %>%

  # Collapse the Genbank info: one row per combination of the other columns,
  # with its accession numbers joined into a single string
  group_by_at(vars(-NCBIAccession)) %>%
  summarize(NCBIAccession = str_c(NCBIAccession, collapse = ", ")) %>%
  # Drop the leftover grouping so later select()/join steps act on a plain table
  ungroup() %>%
  distinct() %>%

  # Keep only the part of the virus name before the first "("
  mutate(Virus = word(Virus, 1, sep = "\\(")) %>%

  # Same column order as before: the flag goes last
  select(Host, Virus, PREDICT_SampleID, NCBIAccession, HostFlagID)

# Let's do some higher classifications to this

# Reduce a name to (at most) its first two space-separated words.
# If the first word comes back NA, the input is returned unchanged.
twowords <- function(x) {
  first.two <- word(x, 1:2, sep = " ")
  if (!is.na(first.two[1])) {
    # na.omit() drops a missing second word, so one-word names stay one word
    return(str_c(na.omit(first.two), collapse = " "))
  }
  x
}
# Look up host taxonomy for the unique host names, each truncated by twowords().
# NOTE(review): hdict() is presumably defined in Code/001_TaxizeFunctions.R,
# which is sourced at the top of this script - confirm.
host.tax <- hdict(sapply(unique(predict.raw$Host), twowords))

# Keep the name as recorded in HostOriginal and attach the taxonomy
# (joined on whatever columns the two tables share).
predict.raw <- left_join(rename(predict.raw, HostOriginal = "Host"), host.tax)

# Now the viruses

# First some cleaning

# Rewrite two virus names before the taxonomy lookup.
predict.raw %<>% mutate(Virus = recode(Virus, !!!c("influenza a" = "influenza a virus",
                                                  "alpha coronavirus nl63" = "coronavirus nl63")))

# Unique virus names, sorted (sort() also drops NA), passed to vdict().
ncbi.names <- predict.raw %>% pull(Virus) %>% unique() %>% sort()

ncbi.tax <- vdict(ncbi.names)

# Manual taxonomy patches. Row indices go through which() so that an NA in the
# compared column is skipped instead of aborting the sub-assignment, and a
# pattern with no matching rows is simply a no-op.

# PREDICT coronaviruses
predict.cov.rows <- which(str_detect(ncbi.tax$Virus, "predict_cov"))
ncbi.tax[predict.cov.rows, "VirusFamily"] <- "coronaviridae"
ncbi.tax[predict.cov.rows, "VirusOrder"] <- "nidovirales"

# PREDICT paramyxoviruses
predict.pmv.rows <- which(str_detect(ncbi.tax$Virus, "predict_pmv"))
ncbi.tax[predict.pmv.rows, "VirusFamily"] <- "paramyxoviridae"
ncbi.tax[predict.pmv.rows, "VirusOrder"] <- "mononegavirales"

# Individually named coronaviruses: exact name -> genus to assign
named.cov.genus <- c("philippines/diliman1525g2/2008" = "betacoronavirus",
                     "kenya bat coronavirus/btky66/65/63/60" = "alphacoronavirus",
                     "kenya bat coronavirus/btky83/59/58" = "alphacoronavirus")

for (cov.name in names(named.cov.genus)) {
  named.cov.rows <- which(ncbi.tax$Virus == cov.name)
  ncbi.tax[named.cov.rows, "VirusGenus"] <- named.cov.genus[[cov.name]]
  ncbi.tax[named.cov.rows, "VirusFamily"] <- "coronaviridae"
  ncbi.tax[named.cov.rows, "VirusOrder"] <- "nidovirales"
}

# "predict_pmv-95" can come back from the lookup as "Atractiella rhizophila";
# when it does, overwrite the whole row. Selecting the rows first means a
# missing entry no longer produces a zero-length if() condition, and list()
# (rather than c()) keeps NA/FALSE from being coerced to character strings.
pmv95.rows <- which(ncbi.tax$VirusOriginal == "predict_pmv-95" &
                      ncbi.tax$Virus == "Atractiella rhizophila")
if (length(pmv95.rows) > 0) {
  ncbi.tax[pmv95.rows, ] <- list("predict_pmv-95", NA, FALSE, "predict_pmv-95", NA,
                                 "paramyxoviridae", "mononegavirales", "monjiviricetes")
}

# Keep the recorded name as VirusOriginal and attach the taxonomy.
predict.raw %<>% rename(VirusOriginal = "Virus") %>%
  left_join(ncbi.tax, by = "VirusOriginal")

# Finally, grab the date info

# Sampling metadata: one SampleDate per PREDICT_IndividualID, split into
# year / month / day columns.
meta <- read_csv("~/Github/ept/PREDICT_Animals_Sampled.csv")

meta %<>%
  # NOTE(review): the individual ID is used as the sample ID key here -
  # confirm the two identifiers really are interchangeable.
  rename(PREDICT_SampleID = PREDICT_IndividualID) %>%
  select(PREDICT_SampleID, SampleDate) %>%
  mutate(CollectionYear = year(SampleDate),
         CollectionMonth = month(SampleDate),
         CollectionDay = day(SampleDate)) %>%
  select(-SampleDate) %>%
  # Exact duplicate metadata rows would otherwise duplicate records in the join
  distinct()

# Attach the dates on an explicit key, then drop the ID column.
predict.raw %<>%
  left_join(meta, by = "PREDICT_SampleID") %>%
  select(-PREDICT_SampleID)

write_csv(predict.raw, "Intermediate/Unformatted/PREDICTPCRUnformatted.csv")
70 changes: 70 additions & 0 deletions Code/02_2d_Format PREDICT PCR.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@

# Formats the PREDICT PCR records: adds VirusClass, lower-cases the taxonomy
# columns, fills in the database metadata and writes the formatted CSV.

library(tidyverse)
library(magrittr) # provides %<>%, which library(tidyverse) does not attach

# Read the file written by "02_2c_Digest PREDICT PCR.R"
predict <- read_csv("Intermediate/Unformatted/PREDICTPCRUnformatted.csv")

# Empty template with the target columns and their types; binding the data to
# it gives the output every column, in this order.
temp <- data.frame(Host = character(),
                   Virus = character(),
                   HostTaxID = double(),
                   VirusTaxID = double(),
                   HostNCBIResolved = logical(),
                   VirusNCBIResolved = logical(),
                   HostGenus = character(),
                   HostFamily = character(),
                   HostOrder = character(),
                   HostClass = character(),
                   HostOriginal = character(),
                   HostSynonyms = character(),
                   VirusGenus = character(),
                   VirusFamily = character(),
                   VirusOrder = character(),
                   VirusClass = character(),
                   VirusOriginal = character(),
                   HostFlagID = logical(),
                   VirusFlagContaminant = logical(),
                   DetectionMethod = character(),
                   DetectionOriginal = character(),
                   Database = character(),
                   DatabaseVersion = character(),
                   PublicationYear = double(),
                   ReferenceText = character(),
                   PMID = double(),
                   NCBIAccession = character(),
                   ReleaseYear = double(),
                   ReleaseMonth = double(),
                   ReleaseDay = double(),
                   CollectionYear = double(),
                   CollectionMonth = double(),
                   CollectionDay = double(),
                   stringsAsFactors = FALSE)

# Grab the VirusClass values: one NCBI lookup per distinct virus order

classer <- data.frame(VirusOrder = unique(na.omit(predict$VirusOrder)),
                      VirusClass = NA_character_,
                      stringsAsFactors = FALSE)

# seq_len() so that a table with no virus orders skips the loop entirely
for (i in seq_len(nrow(classer))) {
  ncbi.high <- taxize::classification(taxize::get_uid(classer$VirusOrder[i]), db = "ncbi")
  lineage <- ncbi.high[[1]]
  # The lookup may return no lineage, or a lineage without a "class" rank;
  # VirusClass stays NA in both cases instead of stopping the script
  if (is.data.frame(lineage)) {
    class.name <- lineage$name[which(lineage$rank == "class")]
    if (length(class.name) > 0) {
      classer$VirusClass[i] <- class.name[1]
    }
  }
}

predict %<>% left_join(classer)

### Format

predict <- bind_rows(temp, predict)

predict %<>% mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass",
                         "Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass"),
                       tolower)

# ReleaseMonth is 6 so that the release date agrees with the
# "June282021" DatabaseVersion label
predict %<>% mutate(DetectionMethod = "PCR/Sequencing",
                    DetectionOriginal = "PREDICT",
                    Database = "PREDICT",
                    DatabaseVersion = "June282021PCRTests",
                    ReleaseYear = 2021,
                    ReleaseMonth = 6,
                    ReleaseDay = 28)


write_csv(predict, "Intermediate/Formatted/PREDICTPCRFormatted.csv")
33 changes: 33 additions & 0 deletions Code/02_2e_Merge PREDICT and add genera.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@

# Merges the two formatted PREDICT tables and fills in virus genera from the
# SpilloverRankings table, then writes the combined file.

# Attached here so the script also runs on its own
library(tidyverse)
library(magrittr)

p1 <- read_csv("Intermediate/Formatted/PREDICTMainFormatted.csv")
p2 <- read_csv("Intermediate/Formatted/PREDICTPCRFormatted.csv")

predict <- bind_rows(p1, p2)

spill <- read_csv("~/Github/ept/SpilloverRankings.csv")

# Rewrite names such as "Coronavirus PREDICT CoV-1" to "PREDICT_CoV-1":
# first "PREDICT " -> "PREDICT_", then strip the virus-group word(s) in front.
spill %<>% mutate(`Virus Species` = str_replace(`Virus Species`, "PREDICT ", "PREDICT_"))

predict.prefixes <- c("Adeno-Associated Virus",
                      "Adenovirus",
                      "Arenavirus",
                      "Coronavirus",
                      "Lentivirus",
                      "Mamastrovirus",
                      "Mammastrovirus", # There's one typo :)
                      "Paramyxovirus",
                      "Picobirnavirus",
                      # NOTE(review): "Polyomaovirus" - confirm this spelling
                      # matches the data and is not a typo in the pattern
                      "Polyomaovirus",
                      "Posavirus",
                      "Poxvirus")

for (predict.prefix in predict.prefixes) {
  spill$`Virus Species` <- str_replace(spill$`Virus Species`,
                                       str_c(predict.prefix, " PREDICT"),
                                       "PREDICT")
}

spill %<>%
  select(`Virus Species`, `Virus Genus`) %>%
  filter(!(`Virus Genus` == "Unassigned")) %>%
  # A missing species name can never be a valid lookup key
  filter(!is.na(`Virus Species`))

# Copy the genus onto every record whose VirusOriginal (minus any "strain of "
# prefix, case-insensitive) equals a species in the rankings table.
# match() replaces the row-by-row loop: it copes with an empty table, never
# matches a missing VirusOriginal, and yields one genus per record.
virus.key <- str_to_lower(str_replace(predict$VirusOriginal, "strain of ", ""))
spill.hit <- match(virus.key, str_to_lower(spill$`Virus Species`))
has.hit <- !is.na(spill.hit)
predict$VirusGenus[has.hit] <- str_to_lower(spill$`Virus Genus`[spill.hit[has.hit]])

predict$VirusGenus[predict$VirusOriginal %in% "strain of Eidolon bat coronavirus"] <- "betacoronavirus"
predict$VirusGenus[predict$VirusOriginal %in% "strain of Bat coronavirus Hipposideros"] <- "betacoronavirus" # this can be reconstructed from the predict.2 object (the PCR Tests) but NOT the HealthMap copy

write_csv(predict, "Intermediate/Formatted/PREDICTAllFormatted.csv")
File renamed without changes.
2 changes: 1 addition & 1 deletion Code/03_Merge clean files.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ library(tidyverse); library(magrittr); library(vroom)
gb <- vroom("Intermediate/Formatted/GenbankFormatted.csv.gz")
clo <- read_csv("Intermediate/Formatted/CloverFormatted.csv")
#sra <- read_csv("Intermediate/Formatted/SRAFormatted.csv")
pred <- read_csv("Intermediate/Formatted/PredictFormatted.csv")
pred <- read_csv("Intermediate/Formatted/PREDICTAllFormatted.csv")
globi <- read_csv("Intermediate/Formatted/GLOBIFormatted.csv")

if(class(clo$NCBIAccession)=='numeric') {clo %<>% mutate(NCBIAccession = as.character(NCBIAccession))}
Expand Down
File renamed without changes.
Loading

0 comments on commit affc13b

Please sign in to comment.