Massive cleanup and project-wide update

New GenBank download and six months' worth of updates to that data; pipeline-wide scrub of the SRA architecture and any other vestigial code or datasets; updated PREDICT data architecture, including the two-file cross-validation; consistent HostFlagID system and a better cf. handler in hdict()
viralemergence · Aug 1, 2021 · 88d3692 · 88d3692
1 parent 52d2398
commit 88d3692
Show file tree

Hide file tree

Showing 41 changed files with 192,048 additions and 193,850 deletions.
diff --git a/.gitignore b/.gitignore
@@ -28,3 +28,4 @@ Intermediate/Formatted/VIRIONUnprocessed.csv
 Intermediate/Formatted/GenbankFormatted.csv
 wc5/alt.bil
 wc5/alt.hdr
+Source/AllNuclMetadata.csv
diff --git a/Code/001_TaxizeFunctions.R b/Code/001_TaxizeFunctions.R
@@ -15,6 +15,7 @@ mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {
 
 hdict <- function(names) { 
   names.orig <- names
+  names <- str_replace(names, " cf\\.","")
   names <- str_replace(names, " sp\\.","")
   names <- str_replace(names, " gen\\.","")
   u <- get_uid(names, rank_filter = c("subspecies", "species", "genus", "family", "order", "class"), 

diff --git a/Code/002_TaxiseCleaner.R b/Code/002_TaxiseCleaner.R
diff --git a/Code/00b_TaxizeVirusesGenBank.R b/Code/00b_TaxizeVirusesGenBank.R
diff --git a/Code/01_Generate Template.R b/Code/01_Generate Template.R
@@ -20,7 +20,6 @@ temp <- data.frame(Host = character(),
                    VirusClass = character(),
                    VirusOriginal = character(),
                    HostFlagID = logical(),
-                   VirusFlagContaminant = logical(),
                    DetectionMethod = character(),
                    DetectionOriginal = character(),
                    Database = character(),
@@ -37,4 +36,4 @@ temp <- data.frame(Host = character(),
                    CollectionDay = double(),
                    stringsAsFactors = FALSE)
 
-write_csv(df, "Intermediate/Template.csv")
+write_csv(temp, "Intermediate/Template.csv")
diff --git a/Code/02_0_Format CLOVER.R b/Code/02_0_Format CLOVER.R
@@ -23,7 +23,6 @@ temp <- data.frame(Host = character(),
                  VirusClass = character(),
                  VirusOriginal = character(),
                  HostFlagID = logical(),
-                 VirusFlagContaminant = logical(),
                  DetectionMethod = character(),
                  DetectionOriginal = character(),
                  Database = character(),

diff --git a/Code/02_1a_Download GenBank.R b/Code/02_1a_Download GenBank.R
@@ -0,0 +1,30 @@
+
+# Download NCBI's AllNuclMetadata.csv into ./Source and write a trimmed copy
+# (accession, release date, species, host, collection date) to
+# ./Source/sequences.csv.
+
+library(RCurl)
+library(readr)
+library(tidyverse)
+library(vroom)
+
+url <- "https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/AllNuclMetadata.csv"
+
+# download.file() is bound by getOption("timeout"), 60 seconds by default,
+# which R's documentation calls insufficient for large files. Raise it for
+# this transfer only and restore the previous value afterwards, even on error.
+old_opts <- options(timeout = max(3600, getOption("timeout")))
+tryCatch(
+  # mode = "wb" stores the bytes exactly as served, on Windows too.
+  download.file(url, destfile = "./Source/AllNuclMetadata.csv", mode = "wb"),
+  finally = options(old_opts)
+)
+
+# Read only the five columns that are kept; "#Accession" is renamed below.
+seq <- data.table::fread("./Source/AllNuclMetadata.csv",
+                         select = c("#Accession", "Release_Date", "Species", "Host", "Collection_Date"))
+
+# NOTE(review): vroom_write() defaults to a tab delimiter, so sequences.csv is
+# tab-separated despite its extension -- confirm the readers expect that.
+seq %>%
+  rename(Accession = "#Accession") %>%
+  vroom_write("./Source/sequences.csv")
diff --git a/Code/02_1a_Digest GenBank.R → Code/02_1b_Digest GenBank.R b/Code/02_1a_Digest GenBank.R → Code/02_1b_Digest GenBank.R
@@ -30,7 +30,8 @@ gb %<>% rename(HostOriginal = "Host") %>%
                           "Lepidosauria",
                           "Mammalia",
                           "Myxini",
-                          "Reptilia") | HostOrder == "Testudines")
+                          "Reptilia") | HostOrder %in% c("Testudines", "Crocodylia"))
+ # Reptilia is a defunct class, but it is left in the filter in case GLOBI has records under it or the class is reinstated
 
 gb %>% pull(Species) %>% unique() %>% sort() -> virus.list
 virus.table <- vdict(virus.list)

diff --git a/Code/02_1b_Format GenBank.R → Code/02_1c_Format GenBank.R b/Code/02_1b_Format GenBank.R → Code/02_1c_Format GenBank.R
@@ -19,7 +19,6 @@ temp <- data.frame(Host = character(),
                    VirusClass = character(),
                    VirusOriginal = character(),
                    HostFlagID = logical(),
-                   VirusFlagContaminant = logical(),
                    DetectionMethod = character(),
                    DetectionOriginal = character(),
                    Database = character(),
@@ -55,8 +54,7 @@ gb %<>%
   separate(Release_Date, sep = "-", into = paste0("Release", c("Year", "Month", "Day"))) %>% 
   mutate_at(vars(matches("Year|Month|Day")), as.numeric)
 
-gb %<>% mutate(HostFlagID = FALSE,
-               VirusFlagContaminant = FALSE,
+gb %<>% mutate(HostFlagID = str_detect(HostOriginal, "cf."),
                Database = "GenBank",
                DatabaseVersion = "Jan2021FlatFile",
                DetectionMethod = "PCR/Sequencing", # Choice to call Nucleotide all sequence and not isolation is potentially problematic - revisit 

diff --git a/Code/02_2a_Digest PREDICT.R b/Code/02_2a_Digest PREDICT.R
@@ -1,6 +1,5 @@
 
 if(!exists('jncbi')) {source('Code/001_Julia functions.R')}
-if(!exists('findSyns3')) {source('Code/002_TaxiseCleaner.R')}
 if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}
 
 library(tidyverse)
@@ -307,6 +306,3 @@ predict %>% filter(is.na(Host)) %>% pull(HostOriginal) %>% unique()
 predict %<>% select(-VirusIntermediate)
 
 write_csv(predict, "Intermediate/Unformatted/PREDICTMainUnformatted.csv")
-
-# ##### Double check the NCBItaxonomy on the viruses I guess
-# # test %>% filter(is.na(HostGenus))
diff --git a/Code/02_2b_Format PREDICT.R b/Code/02_2b_Format PREDICT.R
@@ -1,7 +1,7 @@
 
 library(tidyverse)
 
-predict <- read_csv("Intermediate/Unformatted/PREDICTUnformatted.csv")
+predict <- read_csv("Intermediate/Unformatted/PREDICTMainUnformatted.csv")
 
 temp <- data.frame(Host = character(),
                    Virus = character(),
@@ -21,7 +21,6 @@ temp <- data.frame(Host = character(),
                    VirusClass = character(),
                    VirusOriginal = character(),
                    HostFlagID = logical(),
-                   VirusFlagContaminant = logical(),
                    DetectionMethod = character(),
                    DetectionOriginal = character(),
                    Database = character(),

diff --git a/Code/02_2c_Digest PREDICT PCR.R b/Code/02_2c_Digest PREDICT PCR.R
@@ -5,9 +5,10 @@ library(tidyverse)
 library(magrittr)
 library(lubridate)
 library(naniar)
+library(vroom)
 
 predict.1 <- read_csv("~/Github/ept/PredictData (2).csv")
-predict.2 <- read_csv("~/Github/ept/PREDICT_PCR_Tests.csv")
+predict.2 <- vroom("~/Github/ept/PREDICT_PCR_Tests.csv.gz")
 
 predict.1 %<>% # select(`Species Scientific Name Based on Field Morphology`, 
                       #Virus) %>%