Merge pull request #281 from pirovc/dev

ganon version 2.0.1

pirovc authored Jan 12, 2024
2 parents e626f89 + 10c1965 commit e043f49
Showing 18 changed files with 108 additions and 68 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -3,7 +3,7 @@
# =============================================================================

cmake_minimum_required( VERSION 3.4 FATAL_ERROR )
-project( ganon VERSION 2.0.0 LANGUAGES CXX )
+project( ganon VERSION 2.0.1 LANGUAGES CXX )

# -----------------------------------------------------------------------------
# build setup
2 changes: 2 additions & 0 deletions README.md
@@ -2,6 +2,8 @@

[![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458)

+[ganon2 pre-print](https://www.biorxiv.org/content/10.1101/2023.12.07.570547)

ganon2 classifies DNA sequences against large sets of genomic reference sequences efficiently. It features:

- integrated download and build of any subset from RefSeq/Genbank/GTDB with incremental updates
19 changes: 13 additions & 6 deletions docs/custom_databases.md
@@ -17,7 +17,7 @@ It is also possible to use **non-standard accessions and headers** to build cust
If you just want to build a database without any taxonomic or target information, just send the files with `--input`, use `--taxonomy skip` and choose between `--input-target file` or `sequence`.

!!! warning
-the target and specialization fields (2nd and 4th col) cannot be the same as the target (3rd col)
+the target and specialization fields (2nd and 4th col) cannot be the same as the node (3rd col)

<details>
<summary>Examples of --input-file</summary>
@@ -121,8 +121,15 @@ wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.go
wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plastid/"
wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/mitochondrion/"

+# Split sequences in files and retrieve taxonomy
+mkdir sequences/
+zcat plasmid.* plastid.* mitochondrion.* | awk '$0 ~ ">" {accver=(substr($1,2)); print accver}{print $0 > "sequences/"accver".fna"}' | ganon-get-seq-info.sh -e -i - | awk '{print "sequences/"$1".fna\t"$1"\t"$3}' > ppm.tsv

+# Build ganon database
-ganon build-custom --input plasmid.* plastid.* mitochondrion.* --db-prefix ppm --input-target sequence --level leaves --threads 32
+ganon build-custom --input-file ppm.tsv --db-prefix ppm --level species --threads 16

+# OPTIONAL Remove temporary folder and downloaded files
+rm -rf sequences/ ppm.tsv plasmid.* plastid.* mitochondrion.*
```
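The `awk` one-liner above splits the combined FASTA into one file per accession (`sequences/<accession>.fna`) while also piping the accessions to `ganon-get-seq-info.sh`. A rough Python equivalent of just the splitting step, as an illustrative sketch (not part of ganon):

```python
import os

def split_fasta_by_accession(fasta_lines, out_dir="sequences"):
    """Write each FASTA record to <out_dir>/<accession>.fna,
    mirroring the awk one-liner: the accession is the first
    whitespace-separated field of the header, minus the '>'."""
    os.makedirs(out_dir, exist_ok=True)
    out = None
    for line in fasta_lines:
        if line.startswith(">"):
            if out:
                out.close()
            accver = line[1:].split()[0]  # accession.version
            out = open(os.path.join(out_dir, accver + ".fna"), "w")
        if out:  # header and sequence lines both go to the current file
            out.write(line)
    if out:
        out.close()
```

The awk version streams gigabytes without ever holding a record in memory; the sketch has the same property since it writes line by line.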

### UniVec, UniVec_core
@@ -132,13 +139,13 @@ ganon build-custom --input plasmid.* plastid.* mitochondrion.* --db-prefix ppm -
```bash
# UniVec
wget -O "UniVec.fasta" --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec"
-grep -o '^>[^ ]*' UniVec.fasta | sed 's/^>//' | awk '{print "UniVec.fasta\t"$1"\t81077"}' > UniVec_ganon_input_file.tsv
-ganon build-custom --input-file UniVec_ganon_input_file.tsv --db-prefix UniVec --input-target sequence --level leaves --threads 8
+echo -e "UniVec.fasta\tUniVec\t81077" > UniVec_ganon_input_file.tsv
+ganon build-custom --input-file UniVec_ganon_input_file.tsv --db-prefix UniVec --level leaves --threads 8

# UniVec_Core
wget -O "UniVec_Core.fasta" --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec_Core"
-grep -o '^>[^ ]*' UniVec_Core.fasta | sed 's/^>//' | awk '{print "UniVec_Core.fasta\t"$1"\t81077"}' > UniVec_Core_ganon_input_file.tsv
-ganon build-custom --input-file UniVec_Core_ganon_input_file.tsv --db-prefix UniVec_Core --input-target sequence --level leaves --threads 8
+echo -e "UniVec_Core.fasta\tUniVec_Core\t81077" > UniVec_Core_ganon_input_file.tsv
+ganon build-custom --input-file UniVec_Core_ganon_input_file.tsv --db-prefix UniVec_Core --level leaves --threads 8
```

!!! note
12 changes: 7 additions & 5 deletions docs/index.md
@@ -4,6 +4,8 @@

Code: [GitHub repository](https://github.com/pirovc/ganon)

+[ganon2 pre-print](https://www.biorxiv.org/content/10.1101/2023.12.07.570547)

ganon is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses [Hierarchical Interleaved Bloom Filters](https://doi.org/10.1186/s13059-023-02971-4) as indices based on k-mers with optional minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic or sequence abundances are estimated and reported.
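For intuition, window minimizers keep only the "smallest" k-mer of each window, shrinking the index while preserving matching power. A toy sketch of the idea using lexicographic order (ganon's real implementation hashes k-mers inside the C++/raptor layer; this is only an illustration):

```python
def minimizers(seq, k=4, w=8):
    """Return the set of window minimizers of `seq`: for every
    window of length w, keep the smallest of its k-mers
    (smallest by lexicographic order in this toy version)."""
    kmers = [seq[i:i + k] for i in range(len(seq) - k + 1)]
    per_window = w - k + 1  # number of k-mers per window
    mins = set()
    for i in range(len(kmers) - per_window + 1):
        mins.add(min(kmers[i:i + per_window]))
    return mins
```

Adjacent windows usually share their minimum, so far fewer than one k-mer per position survives; that is the source of the index-size reduction the defaults (`k=19`, `w=31`) exploit.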

## Features
@@ -144,7 +146,7 @@ usage: ganon [-h] [-v]
- - - - - - - - - -
_ _ _ _ _
(_|(_|| |(_)| |
-_| v. 2.0.0
+_| v. 2.0.1
- - - - - - - - - -
positional arguments:
@@ -219,8 +221,8 @@ advanced arguments:
-k , --kmer-size The k-mer size to split sequences. (default: 19)
-w , --window-size The window-size to build filter with minimizers. (default: 31)
-s , --hash-functions
-The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value.
-(default: 4)
+The number of hash functions for the interleaved bloom filter [1-5]. With --filter-type ibf, 0
+will try to set optimal value. (default: 4)
-f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. Only valid for --filter-
type ibf. (default: 0)
-j , --mode Create smaller or faster filters at the cost of classification speed or database size,
@@ -310,8 +312,8 @@ advanced arguments:
-k , --kmer-size The k-mer size to split sequences. (default: 19)
-w , --window-size The window-size to build filter with minimizers. (default: 31)
-s , --hash-functions
-The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value.
-(default: 4)
+The number of hash functions for the interleaved bloom filter [1-5]. With --filter-type ibf, 0
+will try to set optimal value. (default: 4)
-f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. Only valid for --filter-
type ibf. (default: 0)
-j , --mode Create smaller or faster filters at the cost of classification speed or database size,
2 changes: 1 addition & 1 deletion setup.py
@@ -13,7 +13,7 @@ def read(filename):

setup(
name="ganon",
-version="2.0.0",
+version="2.0.1",
url="https://www.github.com/pirovc/ganon",
license='MIT',
author="Vitor C. Piro",
15 changes: 12 additions & 3 deletions src/ganon/build_update.py
@@ -361,10 +361,13 @@ def build_custom(cfg, which_call: str="build_custom"):
suffixes = Path(first_file).suffixes
# If last one is gz, get real suffix
exts = "".join(suffixes[-2:]) if suffixes[-1]==".gz" else suffixes[-1]
+target_file = build_output_folder + new_target + exts
+# Attempt to remove if exists +
# Create symbolic link with correct name for the first file
-Path(build_output_folder + new_target + exts).symlink_to(first_file)
+Path(target_file).unlink(missing_ok=True)
+Path(target_file).symlink_to(first_file)
# Write input file for raptor (space separated)
-filehibf.write(build_output_folder + new_target + exts + " " + " ".join(files[1:]) + "\n")
+filehibf.write(target_file + " " + " ".join(files[1:]) + "\n")
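The change above fixes re-runs: `Path.symlink_to()` raises `FileExistsError` if the link already exists, so the stale link is now removed first with `unlink(missing_ok=True)`. The pattern in isolation (a sketch with hypothetical paths; note `missing_ok` requires Python 3.8+):

```python
from pathlib import Path

def relink(link: Path, target: Path) -> None:
    """Point `link` at `target`, replacing a stale symlink
    (or leftover file) from a previous run if one exists."""
    link.unlink(missing_ok=True)   # no-op if the link is not there yet
    link.symlink_to(target)
```

On Python < 3.8 the equivalent is wrapping `link.unlink()` in a `try/except FileNotFoundError`.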

print_log("raptor prepare", cfg.quiet)
run_raptor_prepare_cmd = " ".join([cfg.path_exec['raptor'], "prepare",
@@ -556,8 +559,14 @@ def write_tax(tax_file, info, tax, genome_sizes, user_bins_col, level, input_tar
for target, row in info.iterrows():
tax_node = row["specialization"] if user_bins_col == "specialization" else target
tax_name = row["specialization_name"] if user_bins_col == "specialization" else target
-tax.add(tax_node, row["node"], name=tax_name, rank=tax_rank)

+# Check if node is already present with correct parent
+# in case of input-target sequence, info has repeated pairs of node/parent
+if tax.latest(tax_node) is tax.undefined_node:
+    tax.add(tax_node, row["node"], name=tax_name, rank=tax_rank)
+else:
+    assert tax.parent(tax_node)==row["node"]

# Write filtered taxonomy with added nodes
rm_files(tax_file)
tax.write(tax_file)
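The guard added to `write_tax()` matters for `--input-target sequence`, where many sequences share one node/parent pair: a node is added on first sight and later occurrences are only verified. The same idea with a plain dict standing in for the multitax tree (illustrative sketch, not the multitax API):

```python
def add_node(tree: dict, node: str, parent: str) -> None:
    """tree maps node -> parent. Add the node on first sight;
    on repeats the stored parent must match, mirroring the
    assert in write_tax()."""
    if node not in tree:
        tree[node] = parent
    else:
        assert tree[node] == parent, f"conflicting parent for {node}"
```

Without the guard, re-adding an existing node could silently duplicate or overwrite entries; with it, a genuinely conflicting parent fails loudly instead.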
12 changes: 10 additions & 2 deletions src/ganon/config.py
@@ -8,7 +8,7 @@

class Config:

-version = "2.0.0"
+version = "2.0.1"
path_exec = {"build": "", "classify": "", "get_seq_info": "", "genome_updater": ""}
empty = False

@@ -51,7 +51,7 @@ def __init__(self, which: str=None, **kwargs):
build_default_advanced_args.add_argument("-p", "--max-fp", type=int_or_float(minval=0, maxval=1), metavar="", default=None, help="Max. false positive for bloom filters. Mutually exclusive --filter-size. Defaults to 0.001 with --filter-type hibf or 0.05 with --filter-type ibf.")
build_default_advanced_args.add_argument("-k", "--kmer-size", type=unsigned_int(minval=1), metavar="", default=19, help="The k-mer size to split sequences.")
build_default_advanced_args.add_argument("-w", "--window-size", type=unsigned_int(minval=1), metavar="", default=31, help="The window-size to build filter with minimizers.")
-build_default_advanced_args.add_argument("-s", "--hash-functions", type=unsigned_int(minval=0, maxval=5), metavar="", default=4, help="The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value.", choices=range(6))
+build_default_advanced_args.add_argument("-s", "--hash-functions", type=unsigned_int(minval=0, maxval=5), metavar="", default=4, help="The number of hash functions for the interleaved bloom filter [1-5]. With --filter-type ibf, 0 will try to set optimal value.", choices=range(6))
build_default_advanced_args.add_argument("-f", "--filter-size", type=unsigned_float(), metavar="", default=0, help="Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. Only valid for --filter-type ibf.")
build_default_advanced_args.add_argument("-j", "--mode", type=str, metavar="", default="avg", help="Create smaller or faster filters at the cost of classification speed or database size, respectively [" + ", ".join(self.choices_mode) + "]. If --filter-size is used, smaller/smallest refers to the false positive rate. By default, an average value is calculated to balance classification speed and database size. Only valid for --filter-type ibf.", choices=self.choices_mode)
build_default_advanced_args.add_argument("-y", "--min-length", type=unsigned_int(minval=0), metavar="", default=0, help="Skip sequences smaller then value defined. 0 to not skip any sequence. Only valid for --filter-type ibf.")
@@ -376,6 +376,10 @@ def validate(self):
print_log("--filter-type hibf is currently only supported with --input-target file")
return False

+if self.filter_type == "hibf" and self.hash_functions == 0:
+    print_log("--filter-type hibf requires --hash-function value between 1 and 5")
+    return False

if self.level == "custom" and not self.input_file:
print_log("--level custom requires --input-file")
return False
@@ -484,6 +488,10 @@ def validate(self):
print_log("File not found: " + file)
return False

+if self.db_prefix and self.taxonomy == "skip":
+    print_log("To skip taxonomy, omit --db-prefix and set --taxonomy skip")
+    return False

elif self.which == "table":
pass

27 changes: 15 additions & 12 deletions src/ganon/report.py
@@ -7,7 +7,7 @@
from ganon.util import print_log
from ganon.tax_util import get_genome_size, parse_genome_size_tax

-from multitax import CustomTx, NcbiTx, GtdbTx
+from multitax import CustomTx, NcbiTx, GtdbTx, DummyTx


def report(cfg):
@@ -46,20 +46,23 @@ def report(cfg):
"Failed to get genome sizes from .tax files, run ganon report without -d/--db-prefix")
return False
else:
-tx = time.time()
-if cfg.taxonomy_files:
-    print_log("Parsing " + cfg.taxonomy + " taxonomy", cfg.quiet)
+if cfg.taxonomy == "skip":
+    tax = DummyTx(**tax_args)
 else:
-    print_log("Downloading and parsing " +
-              cfg.taxonomy + " taxonomy", cfg.quiet)
+    tx = time.time()
+    if cfg.taxonomy_files:
+        print_log("Parsing " + cfg.taxonomy + " taxonomy", cfg.quiet)
+    else:
+        print_log("Downloading and parsing " +
+                  cfg.taxonomy + " taxonomy", cfg.quiet)

-if cfg.taxonomy == "ncbi":
-    tax = NcbiTx(files=cfg.taxonomy_files, **tax_args)
-elif cfg.taxonomy == "gtdb":
-    tax = GtdbTx(files=cfg.taxonomy_files, **tax_args)
+    if cfg.taxonomy == "ncbi":
+        tax = NcbiTx(files=cfg.taxonomy_files, **tax_args)
+    elif cfg.taxonomy == "gtdb":
+        tax = GtdbTx(files=cfg.taxonomy_files, **tax_args)

-print_log(" - done in " + str("%.2f" %
-          (time.time() - tx)) + "s.\n", cfg.quiet)
+    print_log(" - done in " + str("%.2f" %
+              (time.time() - tx)) + "s.\n", cfg.quiet)

# In case no tax was provided, generate genome sizes (for the full tree)
if cfg.report_type in ["abundance", "corr"]:
30 changes: 14 additions & 16 deletions src/ganon/tax_util.py
@@ -54,7 +54,7 @@ def parse_genome_size_files(cfg, build_output_folder):
if cfg.taxonomy == "ncbi":
files = download([cfg.ncbi_url + "/genomes/ASSEMBLY_REPORTS/species_genome_size.txt.gz"], build_output_folder)
elif cfg.taxonomy == "gtdb":
-files = download([cfg.gtdb_url + "/ar53_metadata.tar.gz", cfg.gtdb_url + "/bac120_metadata.tar.gz"], build_output_folder)
+files = download([cfg.gtdb_url + "/ar53_metadata.tsv.gz", cfg.gtdb_url + "/bac120_metadata.tsv.gz"], build_output_folder)
else:
print_log("Parsing auxiliary files for genome size", cfg.quiet)
files = cfg.genome_size_files
@@ -73,21 +73,19 @@

elif cfg.taxonomy == "gtdb":
for file in files:
-with tarfile.open(file, mode='r:gz') as tfile:
-    tfile.extractall(path=build_output_folder)
-    for n in tfile.getnames():
-        with open(build_output_folder + "/" + n, "r") as f:
-            # skip first line with header
-            # col 0: accession (with GC_ RF_ prefix), col 13: genome_size, col 16: gtdb_taxonomy (d__Archaea;p__Thermoproteota;...)
-            next(f)
-            for line in f:
-                fields = line.rstrip().split("\t")
-                t = fields[16].split(";")[-1]  # species taxid (leaf)
-                # In GTDB, several genome sizes are available for each node
-                # accumulate them in a list and make average
-                if t not in leaves_sizes:
-                    leaves_sizes[t] = []
-                leaves_sizes[t].append(int(fields[13]))
+with gzip.open(file, "rt") as f:
+    # skip first line with header
+    # col 0: accession (with GC_ RF_ prefix), col 13: genome_size, col 16: gtdb_taxonomy (d__Archaea;p__Thermoproteota;...)
+    next(f)
+    for line in f:
+        fields = line.rstrip().split("\t")
+        t = fields[16].split(";")[-1]  # species taxid (leaf)
+        # In GTDB, several genome sizes are available for each node
+        # accumulate them in a list and make average
+        if t not in leaves_sizes:
+            leaves_sizes[t] = []
+        leaves_sizes[t].append(int(fields[13]))

# Average sizes
for t in list(leaves_sizes.keys()):
leaves_sizes[t] = int(sum(leaves_sizes[t])/len(leaves_sizes[t]))
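The rewritten GTDB branch streams the gzipped metadata tables directly and averages the genome sizes reported per species leaf. The core aggregation over already-split rows, as a sketch (using the column indices from the comment above, on an in-memory table rather than the real `*_metadata.tsv.gz` files):

```python
from collections import defaultdict

def average_genome_sizes(rows):
    """rows: iterables of tab-split fields.
    col 13 = genome_size; col 16 = gtdb_taxonomy, whose last
    ';'-separated entry is the species leaf. Several genomes may
    map to one species, so sizes are collected and averaged."""
    sizes = defaultdict(list)
    for fields in rows:
        leaf = fields[16].split(";")[-1]
        sizes[leaf].append(int(fields[13]))
    # integer mean per leaf, like the "# Average sizes" loop above
    return {t: int(sum(v) / len(v)) for t, v in sizes.items()}
```

Dropping the `tarfile` extraction step also removes the temporary on-disk copies of the metadata tables, since `gzip.open(..., "rt")` decompresses while streaming.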
Binary file removed tests/ganon/data/build-custom/ar53_metadata.tar.gz
Binary file removed tests/ganon/data/build-custom/bac120_metadata.tar.gz
8 changes: 4 additions & 4 deletions tests/ganon/data/build-custom/filter_files.sh
@@ -9,8 +9,8 @@ wget --quiet --output-document - "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/ass
wget "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" --output-document base_files/new_taxdump.tar.gz
wget "https://data.gtdb.ecogenomic.org/releases/latest/bac120_taxonomy.tsv.gz" --output-document base_files/bac120_taxonomy.tsv.gz
wget "https://data.gtdb.ecogenomic.org/releases/latest/ar53_taxonomy.tsv.gz" --output-document base_files/ar53_taxonomy.tsv.gz
-wget "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tar.gz" --output-document base_files/bac120_metadata.tar.gz
-wget "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tar.gz" --output-document base_files/ar53_metadata.tar.gz
+wget "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz" --output-document base_files/bac120_metadata.tsv.gz
+wget "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz" --output-document base_files/ar53_metadata.tsv.gz
tar xf base_files/bac120_metadata.tar.gz bac120_metadata_r207.tsv
tar xf base_files/ar53_metadata.tar.gz ar53_metadata_r207.tsv
tar xf base_files/new_taxdump.tar.gz taxidlineage.dmp nodes.dmp names.dmp
@@ -52,8 +52,8 @@ head -n 1 base_files/bac120_metadata_r207.tsv > bac120_metadata_r207.tsv
head -n 1 base_files/ar53_metadata_r207.tsv > ar53_metadata_r207.tsv
join <(cut -f 1 assembly_summary.txt | sort) <(awk 'BEGIN{FS=OFS="\t"}{print substr($1,4,length($1)), $0}' base_files/bac120_metadata_r207.tsv | sort -t$'\t' -k 1,1) -t$'\t' | cut -f 2- >> bac120_metadata_r207.tsv
join <(cut -f 1 assembly_summary.txt | sort) <(awk 'BEGIN{FS=OFS="\t"}{print substr($1,4,length($1)), $0}' base_files/ar53_metadata_r207.tsv | sort -t$'\t' -k 1,1) -t$'\t' | cut -f 2- >> ar53_metadata_r207.tsv
-tar -czf bac120_metadata.tar.gz bac120_metadata_r207.tsv
-tar -czf ar53_metadata.tar.gz ar53_metadata_r207.tsv
+gzip -c bac120_metadata_r207.tsv > bac120_metadata.tsv.gz
+gzip -c ar53_metadata_r207.tsv > ar53_metadata.tsv.gz
rm bac120_metadata_r207.tsv ar53_metadata_r207.tsv

# make nucl_gb.accession2taxid.gz