Simplify installation

- Moved to conda installation - Improved makefile/reproducibility
Nardus · Jun 29, 2021 · 4aae040 · 4aae040
1 parent 3ed542c
commit 4aae040
Show file tree

Hide file tree

Showing 20 changed files with 1,013 additions and 1,562 deletions.
diff --git a/.Rprofile b/.Rprofile
diff --git a/InternalData/NameChanges2019.csv b/InternalData/NameChanges2019.csv
diff --git a/Makefile b/Makefile
@@ -65,7 +65,7 @@ ExternalData/ICTV_MasterSpeciesList_2018b.xlsx:
 
 ExternalData/WoolhouseBrierley_2018.xlsx:
 	mkdir -p ExternalData
-	curl -L -o $(@D)/WB2018.zip 'http://datashare.is.ed.ac.uk/download/DS_10283_2970.zip'
+	curl -k -L -o $(@D)/WB2018.zip 'https://datashare.is.ed.ac.uk/download/DS_10283_2970.zip'
 	unzip -u -d $(@D) $(@D)/WB2018.zip 'Woolhouse and Brierley RNA virus database.xlsx'
 	mv $(@D)/'Woolhouse and Brierley RNA virus database.xlsx' $(@D)/WoolhouseBrierley_2018.xlsx
 	touch $(@D)/WoolhouseBrierley_2018.xlsx   # Simply updates 'last modified' date, since unzip doesn't do this
@@ -417,7 +417,6 @@ Plots/Figure1.pdf: CalculatedData/SplitData_Training.rds \
 				   InternalData/Taxonomy_UnclassifiedViruses.csv \
 				   RunData/TaxonomyHeuristic/Test_BootstrapPredictions.rds \
 				   $(TRAIN_OUTPUT_FOLDERS) \
-				   RunData/AllGenomeFeatures_LongRun/AllGenomeFeatures_LongRun_Predictions.rds \
 				   RunData/AllGenomeFeatures_LongRun/AllGenomeFeatures_LongRun_Bagged_predictions.rds
 	Rscript Scripts/Plotting/MakeFigure1.R
 
@@ -437,7 +436,7 @@ Plots/Figure3.pdf: RunData/AllGenomeFeatures_LongRun/AllGenomeFeatures_LongRun_B
 				   Plots/Figure1.pdf \
 				   ExternalData/NovelViruses/ICTV_MasterSpeciesList_2019.v1.xlsx \
 				   InternalData/NovelVirus_Hosts_Curated.csv \
-				   ExternalData/NovelViruses/NovelViruses.csv
+				   ExternalData/NovelViruses/NovelViruses.gb
 	Rscript Scripts/Plotting/MakeFigure3.R
 
 
@@ -551,18 +550,20 @@ Plots/TableS1.csv: Plots/Figure3.pdf
 make_plots: Plots/Figure1.pdf \
 			Plots/Figure2.pdf \
 			Plots/Figure3.pdf \
+			Plots/Figure4.pdf \
+			Plots/Figure5.pdf \
 			Plots/Supplement_RawData.pdf \
 			Plots/Supplement_family_auc.pdf \
+			Plots/Supplement_RelatednessModelRanks.pdf \
 			Plots/Supplement_TrainingSetRanks.pdf \
+			Plots/Supplement_ScreeningSuccessRate.pdf \
 			Plots/Supplement_HighPriority_MissingZoonoses.pdf \
 			Plots/Supplement_bk_plots.pdf \
-			Plots/Combine_tanglegrams.pdf \
 			Plots/SupplementaryFigure_FeatureClusters.pdf \
 			Plots/SupplementaryFigure_EffectDirection.pdf \
-			Plots/Supplement_Sarbecovirus_ranks.pdf \
+			Plots/Supplement_NovelVirus_Hosts.pdf \
 			Plots/Supplement_methods_derived_genome_features.pdf \
 			Plots/Supplement_FeatureSelection.pdf \
-			Plots/Supplement_NovelVirus_Hosts.pdf \
 			Plots/TableS1.csv
 
 
@@ -582,9 +583,10 @@ as_distributed: confirm
 	-rm -rfv ExternalData
 	-rm -rfv Plots
 	-rm -rfv Predictions
+	-rm -rfv cached_blast_searches
 	-rm -fv .Renviron
-	-find CalculatedData -maxdepth 1 -not -name CalculatedData -not -name GenomicFeatures-*.rds -not -name SplitData_Training.rds -delete
-	-find RunData -maxdepth 1 -not -name -not RunData -name AllGenomeFeatures_LongRun -not -name PN_LongRun -delete
+	-find CalculatedData -maxdepth 1 -not -name CalculatedData -not -name GenomicFeatures-*.rds -not -name SplitData_Training.rds -exec rm -rf {} \;
+	-find RunData -maxdepth 1 -not -name RunData -not -name AllGenomeFeatures_LongRun -not -name PN_LongRun -exec rm -rf {} \;
 	-rm -fv RunData/AllGenomeFeatures_LongRun/AllGenomeFeatures_LongRun_Bagged_predictions.rds
 	-rm -fv RunData/AllGenomeFeatures_LongRun/AllGenomeFeatures_LongRun_Bagging_AUCs.rds
 	-rm -fv RunData/AllGenomeFeatures_LongRun/AllGenomeFeatures_LongRun_CalculatedData.rds

diff --git a/README.md b/README.md
@@ -17,31 +17,41 @@ For a list of priority categories and ranks for all virus species in the paper,
 
 
 ## Requirements
-- [R](https://www.r-project.org/) (tested using version 3.5.1)
-   - Most required R libraries can be installed using `Rscript -e "renv::restore()"`
-   - Install `ggtree` from bioconductor using: `Rscript -e "install.packages('BiocManager'); BiocManager::install('ggtree')"`
-- [Python](https://www.python.org/) (version >=3.6)
-   - [Biopython](https://biopython.org/)
-   - [Pandas](https://pandas.pydata.org/)
-   - [xlrd](https://xlrd.readthedocs.io/en/latest/)
-- [Java JDK](https://www.oracle.com/uk/java/technologies/javase-downloads.html) (version >=8)
 
-If repeating all analyses (see below), the [BLAST+ suite of applications](https://www.ncbi.nlm.nih.gov/books/NBK279670/?report=classic) are also required (used for "phylogenetic neighbourhood" analyses and predictions, but not otherwise required). If your R session has trouble finding the BLAST+ executables, run `make update_path` and enter the location of the BLAST executables (e.g. `/usr/local/ncbi/blast/bin`). Making figure S9 requires [iqtree](http://www.iqtree.org/).
+- Install the [conda package manager](https://conda.io/)
+- Create the base environment
+This installs everything required for prediction of new viruses
+```
+conda env create -f base_environment.yml
+```
+
+- Before each use, activate this environment using
+```
+conda activate zoonotic_rank
+```
+
+### Repeating published analyses
+If repeating all analyses in the manuscript (see below), a few additional tools and R libraries are needed. The majority of these can be added to the base environment created above using:
+```
+conda env update -n zoonotic_rank -f dev_environment.yml
+```
+
+The [BLAST+ suite of applications](https://www.ncbi.nlm.nih.gov/books/NBK279670/?report=classic) is also required (used for "phylogenetic neighbourhood" analyses and predictions). Version 2.8.1+ was used in the manuscript. If your R session has trouble finding the BLAST+ executables, run `make update_path` and enter the location of the BLAST executables (e.g. `/usr/local/ncbi/blast/bin`). 
 
 
 
 ## Ranking novel viruses
-Ranks for novel viruses can be generated by specifying the input format, paths to genome and metadata files, and a name for output files, e.g:
+Ranks for novel viruses can be generated by specifying the input sequence format, paths to genome and metadata files, and a name for output files, e.g:
 ```
-Scripts/PredictNovel.R fasta \
+Rscript Scripts/PredictNovel.R fasta \
                        InternalData/example_files/genomes.fasta \
                        InternalData/example_files/metadata.csv \
                        example_1
 ```
 
 For detailed instructions and further options, see
 ```
-Scripts/PredictNovel.R --help
+Rscript Scripts/PredictNovel.R --help
 ```
 
 #### Input
@@ -79,10 +89,10 @@ File                         | Description
 Follow instructions below to repeat the analyses described in the manuscript. Note that these steps are _not_ needed to make predictions as described above (pre-trained models are included). Running all analyses takes ~3 weeks on a 4-core, 2.8 GHz i7 processor and requires ~5Gb of disk space.
 
 #### Basic
-These steps will download any missing source data and automatically create/update files as needed.
+These steps will download all external data and re-run the entire pipeline.
 
 _Using RStudio:_
-1. Open `ZoonosisPredictor.Rproj` in RStudio
+1. Open `zoonotic_rank.Rproj` in RStudio
 2. On the `Build` tab, select `More` > `Clean and Rebuild`
 
 _Using the command-line:_
@@ -93,7 +103,6 @@ make clean all
 #### Advanced options (command-line only)
 
 - Use `make help` to see individual steps in the pipeline. Upstream steps are run automatically if needed. For example, using `make prepare` will run the data cleanup step, but also downloads the raw data if needed.
-- `make <path to file>` runs all steps neccesary to produce/update the specified file (e.g. `make Plots/Figure1.pdf`).
 - `make as_distributed` resets the project to the state in which it was distributed.
 - `make clean` removes all run-related files, allowing a complete re-run (in contrast to `as_distributed`, this includes removing the pre-trained models required for prediction).
 
@@ -103,50 +112,41 @@ make clean all
 
 ```
 └─zoonotic_rank/
-   ├─Makefile ................................. Record of workflow and dependencies
-   │                                            between files
-   ├─options.config ........................... Runtime options (speciefies number
-   │                                            of parrallel threads allowed and 
-   │                                            the random seed)
+   ├─Makefile ................................. Record of workflow and dependencies between files
+   ├─options.config ........................... Runtime options (specifies number of parallel
+   │                                            threads allowed and the random seed)
+   ├─base_environment.yml ..................... Record of software and R libraries required for 
+   │                                            prediction
+   ├─dev_environment.yml ...................... Record of additional software required to train and
+   │                                            evaluate models
    ├─InternalData/ ............................ All data unique to this project
-   │   ├─example_files/ ....................... Example input files for 
-   │   │                                        predicting novel viruses 
-   │   ├─Shaw2017_raw/ ........................ Raw ISG data from Shaw et al. 
-   │   │                                        2017 (see https://isg.data.cvr.ac.uk/)
-   │   ├─FinalData_Cleaned.csv .................Final dataset, as used for training. 
-   │   │                                        Created by merging files below 
-   │   │                                        (see Scripts/MergeAndCleanData.R)
-   │   ├─AllInternalData_Checked.csv .......... Metadata for the viruses used 
-   │   │                                        as training data
-   │   ├─Final_Accessions_Unique_Spp.csv ...... Accession numbers of sequences 
-   │   │                                        used for training (replaces 
-   │   │                                        those in the metadata file)
-   │   ├─NameMatches_All.csv .................. Manually curated list used to 
-   │   │                                        match virus names to unique 
-   │   │                                        species across external datasets
-   │   ├─SourcesOfZoonoses_BabayanZoonotic.csv  Additional zoonotic status data 
-   │   │                                        for species not available in 
-   │   │                                        external data sources
-   │   └─Taxonomy_UnclassifiedViruses.csv ..... Taxonomic information for 
-   │                                            unclassified viruses in the 
-   │                                            metadata (unused)
+   │   ├─example_files/ ....................... Example input files for predicting novel viruses
+   │   ├─Shaw2017_raw/ ........................ Raw ISG data from Shaw et al. 2017
+   │   │                                        (see https://isg.data.cvr.ac.uk/)
+   │   ├─FinalData_Cleaned.csv .................Final dataset, as used for training. Created by 
+   │   │                                        merging files below (Scripts/MergeAndCleanData.R)
+   │   ├─AllInternalData_Checked.csv .......... Metadata for the viruses used as training data
+   │   ├─Final_Accessions_Unique_Spp.csv ...... Accession numbers of sequences used for training
+   │   │                                        (replaces those in the metadata file)
+   │   ├─NameMatches_All.csv .................. Manually curated list used to match virus names to
+   │   │                                        unique species across external datasets
+   │   ├─SourcesOfZoonoses_BabayanZoonotic.csv  Additional zoonotic status data for species not
+   │   │                                        available in external data sources
+   │   └─Taxonomy_UnclassifiedViruses.csv ..... Taxonomic information for unclassified viruses in
+   │                                            the metadata (unused)
    │
-   ├─CalculatedData/ .......................... Intermediate calculations ([*], except 
-   │                                            for files required by PredictNovel.R)
-   ├─ExternalData/ ............................ [*] Data from external sources, 
-   │                                            dowloaded as needed (see Makefile)
-   ├─Misc/ .................................... Miscelaneous scripts to download 
-   │                                            external data
+   ├─CalculatedData/ .......................... Intermediate calculations ([*], except for files
+   │                                            required by PredictNovel.R)
+   ├─ExternalData/ ............................ [*] Data from external sources, downloaded as needed
+   │                                            (see Makefile)
+   ├─Misc/ .................................... Miscellaneous scripts to download external data
    ├─Plots/ ................................... [*] Final plots generated
    ├─Predictions/ ............................. [*] Predictions for case studies
-   ├─renv/ .................................... Record of R libraries required
-   ├─RunData/ ................................. Trained models ([*], except for 
-   │                                            files required by PredictNovel.R)
-   ├─Scripts/ ................................. Main analysis, prediction, and 
-   │   │                                        plotting scripts
+   ├─RunData/ ................................. Trained models ([*], except for files required 
+   │                                            by PredictNovel.R)
+   ├─Scripts/ ................................. Main analysis, prediction, and plotting scripts
    │   └─Plotting/ ............................ Scripts to generate published plots
-   ├─Tests/ ................................... Unit tests for basic functionality 
-   │                                            of utility scripts
-   └─Utils/ ................................... Utility functions and tools called 
-                                                by other scripts
+   ├─Tests/ ................................... Unit tests for basic functionality of utility 
+   │                                            functions/scripts
+   └─Utils/ ................................... Utility functions and tools called by other scripts
 ```
diff --git a/Scripts/CalculateBaggedPredictions.R b/Scripts/CalculateBaggedPredictions.R
@@ -38,7 +38,6 @@ library(rprojroot)
 library(dplyr)
 library(tidyr)
 library(ModelMetrics)
-library(betacal)
 library(parallel)
 
 

diff --git a/Scripts/Plotting/MakeSupplement_FeatureSelection.R b/Scripts/Plotting/MakeSupplement_FeatureSelection.R
@@ -6,7 +6,6 @@ library(dplyr)
 library(tidyr)
 library(readr)
 library(ModelMetrics)
-library(plotly)
 
 source(file.path('Scripts', 'Plotting', 'PlottingConstants.R'))
 source(file.path('Utils', 'rundata_utils.R'))

diff --git a/Scripts/Plotting/MakeSupplement_HumanGeneSetSimilarity.R b/Scripts/Plotting/MakeSupplement_HumanGeneSetSimilarity.R
diff --git a/Scripts/Plotting/MakeSupplementaryFigure_EffectDirection.R b/Scripts/Plotting/MakeSupplementaryFigure_EffectDirection.R
@@ -3,7 +3,9 @@
 ## 
 
 library(scales)
-library(tidyverse)
+library(dplyr)
+library(tidyr)
+library(stringr)
 library(apcluster)
 library(cowplot)
 library(ggbeeswarm)

diff --git a/Scripts/Plotting/MakeSupplementaryFigure_FeatureClusters.R b/Scripts/Plotting/MakeSupplementaryFigure_FeatureClusters.R
@@ -3,7 +3,9 @@
 ## 
 set.seed(1521312)
 
-library(tidyverse)
+library(dplyr)
+library(tidyr)
+library(stringr)
 library(cowplot)
 library(apcluster)