diff --git a/tools/uniprotxml_downloader/macros.xml b/tools/uniprotxml_downloader/macros.xml new file mode 100644 index 000000000..92302116a --- /dev/null +++ b/tools/uniprotxml_downloader/macros.xml @@ -0,0 +1,406 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tools/uniprotxml_downloader/uniprotxml_downloader.py b/tools/uniprotxml_downloader/uniprotxml_downloader.py index f125315de..83f0d7aa5 100755 --- a/tools/uniprotxml_downloader/uniprotxml_downloader.py +++ b/tools/uniprotxml_downloader/uniprotxml_downloader.py @@ -11,7 +11,7 @@ # #------------------------------------------------------------------------------ """ -import optparse +import argparse import re import sys from urllib import parse @@ -20,7 +20,7 @@ from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry -DEFAULT_TIMEOUT = 5 # seconds +DEFAULT_TIMEOUT = 3600 # seconds retry_strategy = Retry( total=5, backoff_factor=2, @@ -46,15 +46,17 @@ def send(self, request, **kwargs): def __main__(): # Parse Command Line - parser = optparse.OptionParser() - parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') - parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') - parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') - parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') - parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') - parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') - parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') - (options, args) = parser.parse_args() + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') + parser.add_argument('-c', '--column', dest='column', type=int, default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') + parser.add_argument('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') + parser.add_argument('-r', '--reviewed', dest='reviewed', choices=['yes', 'no'], help='Only uniprot reviewed entries (default: reviewed and unreviewed entries)') + parser.add_argument('-f', '--format', dest='format', choices=['xml', 'fasta', "tab"], default='xml', help='output format') + parser.add_argument('--columns', dest='columns', help='columns for tabular output') + parser.add_argument('--include', dest='include', choices=['yes', 'no'], default="no", help='Include isoforms in FASTA output') + parser.add_argument('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') + parser.add_argument('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') + options = parser.parse_args() taxids = set(options.taxon) if options.input: with open(options.input, 'r') as inputFile: @@ -72,30 +74,34 @@ def __main__(): dest_path = options.output else: dest_path = "uniprot_%s.xml" % '_'.join(taxids) - reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' + reviewed = "AND reviewed:%s" % options.reviewed if options.reviewed else '' try: url = 'https://www.uniprot.org/uniprot/' - query = "%s%s" % (taxon_query, reviewed) + query = "(%s) %s" % (taxon_query, reviewed) params = {'query': query, 'force': 'yes', 'format': options.format} + if options.format == "tab" and options.columns: + params['columns'] = options.columns + if options.format == "fasta": + params['include'] = options.include if options.debug: print("%s ? %s" % (url, params), file=sys.stderr) data = parse.urlencode(params) - print(f"Retrieving: {url+data}") + print(f"Retrieving: {url}?{data}") adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) http = requests.Session() http.mount("https://", adapter) - response = http.post(url, data=params) + response = http.post(url, data=params, stream=True) + if response.encoding is None: + response.encoding = 'utf-8' http.close() with open(dest_path, 'w') as fh: - fh.write(response.text) + for line in response.iter_lines(decode_unicode=True): + fh.write(line + "\n") if options.format == 'xml': with open(dest_path, 'r') as contents: - while True: - line = contents.readline() + for line in contents: if options.debug: print(line, file=sys.stderr) - if line is None: - break if line.startswith(' download proteome as XML or fasta + + macros.xml + requests @@ -11,9 +14,6 @@ python '$__tool_directory__/uniprotxml_downloader.py' #if $taxid.input_choice == 'common': --taxon $taxid.organism - #if $taxid.reviewed: - --reviewed=$taxid.reviewed - #end if #elif $taxid.input_choice == 'taxids': #for $id in $taxid.taxons.split(','): -t '$id' @@ -22,10 +22,31 @@ python '$__tool_directory__/uniprotxml_downloader.py' --input='${taxid.taxon_file}' --column=#echo int(str($taxid.column)) - 1# #end if ---format $format +#if $reviewed: + --reviewed=$reviewed +#end if +--format $output_cond.out_format +#if $output_cond.out_format == 'tab' + --columns '$output_cond.columns' +#else if $output_cond.out_format == 'fasta' + --include $output_cond.include +#end if --output '${proteome}' +&& cp '$ext_config' 'galaxy.json' ]]> + + + @@ -41,17 +62,6 @@ python '$__tool_directory__/uniprotxml_downloader.py' - - - - - - - - - + + + + + + + + + + + + + + + + + + + - - - - - + - - + + + + + @@ -90,8 +121,11 @@ python '$__tool_directory__/uniprotxml_downloader.py' - - + + + + + @@ -102,8 +136,10 @@ python '$__tool_directory__/uniprotxml_downloader.py' - - + + + + @@ -114,9 +150,13 @@ python '$__tool_directory__/uniprotxml_downloader.py' - - + + + + + +