Skip to content

Commit

Permalink
Massive cleanup and project-wide update
Browse files Browse the repository at this point in the history
New GenBank download and six months' worth of updates to that data; pipeline-wide scrub of the SRA architecture and any other vestigial code or datasets; updated PREDICT data architecture including the two-file cross-validation; consistent HostFlagID system and a better cf. handler in hdict()
  • Loading branch information
Colin J. Carlson committed Aug 1, 2021
1 parent 52d2398 commit 88d3692
Show file tree
Hide file tree
Showing 41 changed files with 192,048 additions and 193,850 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ Intermediate/Formatted/VIRIONUnprocessed.csv
Intermediate/Formatted/GenbankFormatted.csv
wc5/alt.bil
wc5/alt.hdr
Source/AllNuclMetadata.csv
1 change: 1 addition & 0 deletions Code/001_TaxizeFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ mutate_cond <- function(.data, condition, ..., envir = parent.frame()) {

hdict <- function(names) {
names.orig <- names
names <- str_replace(names, " cf\\.","")
names <- str_replace(names, " sp\\.","")
names <- str_replace(names, " gen\\.","")
u <- get_uid(names, rank_filter = c("subspecies", "species", "genus", "family", "order", "class"),
Expand Down
232 changes: 0 additions & 232 deletions Code/002_TaxiseCleaner.R

This file was deleted.

25 changes: 0 additions & 25 deletions Code/00b_TaxizeVirusesGenBank.R

This file was deleted.

3 changes: 1 addition & 2 deletions Code/01_Generate Template.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ temp <- data.frame(Host = character(),
VirusClass = character(),
VirusOriginal = character(),
HostFlagID = logical(),
VirusFlagContaminant = logical(),
DetectionMethod = character(),
DetectionOriginal = character(),
Database = character(),
Expand All @@ -37,4 +36,4 @@ temp <- data.frame(Host = character(),
CollectionDay = double(),
stringsAsFactors = FALSE)

write_csv(df, "Intermediate/Template.csv")
write_csv(temp, "Intermediate/Template.csv")
1 change: 0 additions & 1 deletion Code/02_0_Format CLOVER.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ temp <- data.frame(Host = character(),
VirusClass = character(),
VirusOriginal = character(),
HostFlagID = logical(),
VirusFlagContaminant = logical(),
DetectionMethod = character(),
DetectionOriginal = character(),
Database = character(),
Expand Down
12 changes: 12 additions & 0 deletions Code/02_1a_Download GenBank.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

library(RCurl)
library(readr)
library(tidyverse)
library(vroom)

# Download AllNuclMetadata.csv from the NCBI FTP site, then write a reduced
# copy that keeps only the five columns selected below, with "#Accession"
# renamed to "Accession".

genbank_url <- "https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNuclMetadata/AllNuclMetadata.csv"
raw_path <- "./Source/AllNuclMetadata.csv"
slim_path <- "./Source/sequences.csv"

# download.file() does not create missing directories for `destfile`.
dir.create(dirname(raw_path), showWarnings = FALSE, recursive = TRUE)

# R's default download timeout is 60 seconds, which download.file()'s own
# documentation says must be raised for large files. Raise it for this call
# only and restore the previous setting even if the download errors.
# NOTE(review): assumes the metadata file is large enough to need this - confirm.
old_opts <- options(timeout = max(3600, getOption("timeout")))
status <- tryCatch(
  download.file(genbank_url, destfile = raw_path, mode = "wb"),
  finally = options(old_opts)
)

# download.file() returns a non-zero status when the transfer failed; stop
# here rather than parsing a partial file.
if (status != 0) {
  stop("Download of ", genbank_url, " failed with status ", status,
       call. = FALSE)
}

genbank_meta <- data.table::fread(
  raw_path,
  select = c("#Accession", "Release_Date", "Species", "Host", "Collection_Date")
)

# vroom_write() defaults to a tab delimiter; it is spelled out here because
# the output file carries a .csv extension.
# NOTE(review): confirm downstream readers auto-detect the delimiter.
genbank_meta %>%
  rename(Accession = "#Accession") %>%
  vroom_write(slim_path, delim = "\t")
3 changes: 2 additions & 1 deletion Code/02_1a_Digest GenBank.R → Code/02_1b_Digest GenBank.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ gb %<>% rename(HostOriginal = "Host") %>%
"Lepidosauria",
"Mammalia",
"Myxini",
"Reptilia") | HostOrder == "Testudines")
"Reptilia") | HostOrder %in% c("Testudines", "Crocodylia"))
# Reptilia is defunct but left in case GLOBI has something on it or it's reinstituted or something weird

gb %>% pull(Species) %>% unique() %>% sort() -> virus.list
virus.table <- vdict(virus.list)
Expand Down
4 changes: 1 addition & 3 deletions Code/02_1b_Format GenBank.R → Code/02_1c_Format GenBank.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ temp <- data.frame(Host = character(),
VirusClass = character(),
VirusOriginal = character(),
HostFlagID = logical(),
VirusFlagContaminant = logical(),
DetectionMethod = character(),
DetectionOriginal = character(),
Database = character(),
Expand Down Expand Up @@ -55,8 +54,7 @@ gb %<>%
separate(Release_Date, sep = "-", into = paste0("Release", c("Year", "Month", "Day"))) %>%
mutate_at(vars(matches("Year|Month|Day")), as.numeric)

gb %<>% mutate(HostFlagID = FALSE,
VirusFlagContaminant = FALSE,
gb %<>% mutate(HostFlagID = str_detect(HostOriginal, "cf."),
Database = "GenBank",
DatabaseVersion = "Jan2021FlatFile",
DetectionMethod = "PCR/Sequencing", # Choice to call Nucleotide all sequence and not isolation is potentially problematic - revisit
Expand Down
4 changes: 0 additions & 4 deletions Code/02_2a_Digest PREDICT.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

if(!exists('jncbi')) {source('Code/001_Julia functions.R')}
if(!exists('findSyns3')) {source('Code/002_TaxiseCleaner.R')}
if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}

library(tidyverse)
Expand Down Expand Up @@ -307,6 +306,3 @@ predict %>% filter(is.na(Host)) %>% pull(HostOriginal) %>% unique()
predict %<>% select(-VirusIntermediate)

write_csv(predict, "Intermediate/Unformatted/PREDICTMainUnformatted.csv")

# ##### Double check the NCBItaxonomy on the viruses I guess
# # test %>% filter(is.na(HostGenus))
3 changes: 1 addition & 2 deletions Code/02_2b_Format PREDICT.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

library(tidyverse)

predict <- read_csv("Intermediate/Unformatted/PREDICTUnformatted.csv")
predict <- read_csv("Intermediate/Unformatted/PREDICTMainUnformatted.csv")

temp <- data.frame(Host = character(),
Virus = character(),
Expand All @@ -21,7 +21,6 @@ temp <- data.frame(Host = character(),
VirusClass = character(),
VirusOriginal = character(),
HostFlagID = logical(),
VirusFlagContaminant = logical(),
DetectionMethod = character(),
DetectionOriginal = character(),
Database = character(),
Expand Down
3 changes: 2 additions & 1 deletion Code/02_2c_Digest PREDICT PCR.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ library(tidyverse)
library(magrittr)
library(lubridate)
library(naniar)
library(vroom)

predict.1 <- read_csv("~/Github/ept/PredictData (2).csv")
predict.2 <- read_csv("~/Github/ept/PREDICT_PCR_Tests.csv")
predict.2 <- vroom("~/Github/ept/PREDICT_PCR_Tests.csv.gz")

predict.1 %<>% # select(`Species Scientific Name Based on Field Morphology`,
#Virus) %>%
Expand Down
Loading

0 comments on commit 88d3692

Please sign in to comment.