Merge pull request #64 from bigbio/dev

Major changes fixing issues in reading params from config files
bigbio · Jul 7, 2022 · 6d0b2b9 · 6d0b2b9
2 parents 7dc94aa + 4ff46ab
commit 6d0b2b9
Show file tree

Hide file tree

Showing 42 changed files with 4,209 additions and 4,239 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Python tools for ProteoGenomics Analysis Toolkit
+# ProteoGenomics Analysis Toolkit
 
 
 ![Python application](https://github.com/bigbio/py-pgatk/workflows/Python%20application/badge.svg)
@@ -7,11 +7,11 @@
 [![PyPI version](https://badge.fury.io/py/pypgatk.svg)](https://badge.fury.io/py/pypgatk)
 ![PyPI - Downloads](https://img.shields.io/pypi/dm/pypgatk)
 
-**pypgatk** is a Python library part of the [ProteoGenomics Analysis Toolkit](https://pgatk.readthedocs.io/en/latest). It provides different bioinformatics tools for proteogenomics data analysis.
+**pypgatk** is a Python library - part of the [ProteoGenomics Analysis Toolkit](https://pgatk.readthedocs.io/en/latest). It provides different bioinformatics tools for proteogenomics data analysis.
 
 # Requirements:
 
-This package requirements vary depending on the way that you want to install it (all three are independent, you don't need all these requirements):
+The package requirements vary depending on the way that you want to install it (you need one of the following):
 
 - pip: if installation goes through pip, you will require Python3 and pip3 installed.
 - Bioconda: if installation goes through Bioconda, you will require that [conda is installed and configured to use bioconda channels](https://bioconda.github.io/user/index.html).
@@ -68,43 +68,50 @@ pip3 install .
 
 The pypgatk design combines multiple modules and tools into one framework. All the possible commands are accessible using the commandline tool `pypgatk_cli.py`.
 
+The library provides multiple commands to download, translate and generate protein sequence databases from reference and mutation genome databases.
+
 ```
-$: pypgatk_cli.py -h
+$: pypgatk_cli -h
+
 Usage: pypgatk [OPTIONS] COMMAND [ARGS]...
 
   This is the main tool that gives access to all commands and options
   provided by pypgatk
 
 Options:
+  --version   Show the version and exit.
   -h, --help  Show this message and exit.
 
 Commands:
-  cbioportal-downloader     Command to download the the cbioportal studies
+  cbioportal-downloader    Command to download the cbioportal studies
   cbioportal-to-proteindb  Command to translate cbioportal mutation data into
                            proteindb
   cosmic-downloader        Command to download the cosmic mutation database
   cosmic-to-proteindb      Command to translate Cosmic mutation data into
                            proteindb
   dnaseq-to-proteindb      Generate peptides based on DNA sequences
+  ensembl-check            Command to check ensembl database for stop codons,
+                           gaps
   ensembl-downloader       Command to download the ensembl information
-  generate-decoy           Create decoy protein sequences. Each protein is
-                           reversed and the cleavage sites switched with
-                           preceding amino acid. Peptides are checked for
-                           existence in target sequences if foundthe tool will
-                           attempt to shuffle them. [email protected]
-                           2015
-  threeframe-translation   Command to perform 3frame translation
-  vcf-to-proteindb         Generate peptides based on DNA variants from
-                           ENSEMBL VEP VCF files
+  generate-decoy           Create decoy protein sequences using multiple
+                           methods DecoyPYrat, Reverse/Shuffled Proteins.
+  generate-deeplc          Generate input for deepLC tool from idXML,mzTab or
+                           consensusXML
+  msrescore-configuration  Command to generate the msrescore configuration
+                           file from idXML
+  peptide-class-fdr        Command to compute the Peptide class FDR
+  threeframe-translation   Command to perform 3-frame translation
+  vcf-to-proteindb         Generate peptides based on DNA variants VCF files
 
 ```
 
-The library provides multiple commands to download, translate and generate protein sequence databases from reference and mutation genome databases.
-
 # Full Documentation
 
 [https://pgatk.readthedocs.io/en/latest/pypgatk.html](https://pgatk.readthedocs.io/en/latest/pypgatk.html)
 
 ## Cite as
-Husen M Umer, Enrique Audain, Yafeng Zhu, Julianus Pfeuffer, Timo Sachsenberg, Janne Lehtiö, Rui M Branca, Yasset Perez-Riverol, Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides, Bioinformatics, Volume 38, Issue 5, 1 March 2022, Pages 1470–1472, https://doi.org/10.1093/bioinformatics/btab838
+Husen M Umer, Enrique Audain, Yafeng Zhu, Julianus Pfeuffer, Timo Sachsenberg, Janne Lehtiö, Rui M Branca, Yasset Perez-Riverol
+Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides
+Bioinformatics, Volume 38, Issue 5, 1 March 2022, Pages 1470–1472
+https://doi.org/10.1093/bioinformatics/btab838
 
diff --git a/pypgatk/cgenomes/cbioportal_downloader.py b/pypgatk/cgenomes/cbioportal_downloader.py
@@ -16,32 +16,73 @@ class CbioPortalDownloadService(ParameterConfiguration):
     CONFIG_CBIOPORTAL_API_CANCER_STUDIES = "cancer_studies"
     CONFIG_LIST_STUDIES = "list_studies"
     CONFIG_MULTITHREADING = "multithreading"
+    PROTEINDB = 'proteindb'
+    FILTER_INFO = 'filter_info'
+    FILTER_COLUMN = 'filter_column'
 
     def __init__(self, config_data, pipeline_arguments):
-      """
+        """
       Init the class with the specific parameters.
       :param config_data configuration file
       :param pipeline_arguments pipelines arguments
       """
 
-      super(CbioPortalDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_data,
+        super(CbioPortalDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_data,
                                                         pipeline_arguments)
 
-      self._cbioportal_studies = []
-      if self.CONFIG_OUTPUT_DIRECTORY in self.get_pipeline_parameters():
-          self._local_path_cbioportal = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_DIRECTORY]
-      else:
-        self._local_path_cbioportal = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_OUTPUT_DIRECTORY]
-
-      self._list_studies = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_LIST_STUDIES]
-      if self.CONFIG_LIST_STUDIES in self.get_pipeline_parameters():
-          self._list_studies = self.get_pipeline_parameters()[self.CONFIG_LIST_STUDIES]
-
-      self._multithreading = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_MULTITHREADING]
-      if self.CONFIG_MULTITHREADING in self.get_pipeline_parameters():
-          self._multithreading = self.get_pipeline_parameters()[self.CONFIG_MULTITHREADING]
-
-      self.prepare_local_cbioportal_repository()
+        self._local_path_cbioportal = 'output_directory'
+        self._list_studies = []
+        self._multithreading = True
+
+        self._cbioportal_base_url = 'https://www.cbioportal.org/webservice.do'
+        self._cancer_studies_command = 'cmd=getCancerStudies'
+
+        self._cbioportal_download_url = 'https://cbioportal-datahub.s3.amazonaws.com'
+
+        if self.CONFIG_OUTPUT_DIRECTORY in self.get_pipeline_parameters():
+            self._local_path_cbioportal = self.get_pipeline_parameters()[self.CONFIG_OUTPUT_DIRECTORY]
+        elif self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() and \
+                self.CONFIG_OUTPUT_DIRECTORY in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER]:
+            self._local_path_cbioportal = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
+                self.CONFIG_OUTPUT_DIRECTORY]
+
+        if self.CONFIG_LIST_STUDIES in self.get_pipeline_parameters():
+            self._list_studies = self.get_pipeline_parameters()[self.CONFIG_LIST_STUDIES]
+        elif self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() and \
+                self.CONFIG_LIST_STUDIES in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER]:
+            self._list_studies = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
+                self.CONFIG_LIST_STUDIES]
+
+        if self.CONFIG_MULTITHREADING in self.get_pipeline_parameters():
+            self._multithreading = self.get_pipeline_parameters()[self.CONFIG_MULTITHREADING]
+        elif self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() and \
+                self.CONFIG_MULTITHREADING in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER]:
+            self._multithreading = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
+                self.CONFIG_MULTITHREADING]
+
+        if self.CONFIG_CBIOPORTAL_API_SERVER in self.get_pipeline_parameters():
+            self._cbioportal_base_url = self.get_pipeline_parameters()[self.CONFIG_CBIOPORTAL_API_SERVER]
+        elif (self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() and
+              self.CONFIG_CBIOPORTAL_API in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER]
+              and self.CONFIG_CBIOPORTAL_API_SERVER in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
+                  self.CONFIG_CBIOPORTAL_API]):
+            self._cbioportal_base_url = \
+                self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_CBIOPORTAL_API][
+                    self.CONFIG_CBIOPORTAL_API_SERVER]
+
+        if self.CONFIG_CBIOPORTAL_API_CANCER_STUDIES in self.get_pipeline_parameters():
+            self._cancer_studies_command = self.get_pipeline_parameters()[self.CONFIG_CBIOPORTAL_API_CANCER_STUDIES]
+        elif (self.CONFIG_KEY_DATA_DOWNLOADER in self.get_default_parameters() and
+              self.CONFIG_CBIOPORTAL_API in self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER]
+              and self.CONFIG_CBIOPORTAL_API_CANCER_STUDIES in
+              self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][
+                  self.CONFIG_CBIOPORTAL_API]):
+            self._cancer_studies_command = \
+                self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_CBIOPORTAL_API][
+                    self.CONFIG_CBIOPORTAL_API_CANCER_STUDIES]
+
+        self.prepare_local_cbioportal_repository()
+        self.get_cancer_studies()
 
     def prepare_local_cbioportal_repository(self):
         self.get_logger().debug("Preparing local cbioportal repository, root folder - '{}'".format(
@@ -53,19 +94,27 @@ def prepare_local_cbioportal_repository(self):
     def get_local_path_root_cbioportal_repo(self):
         return self._local_path_cbioportal
 
+    def get_filter_options(self, variable, default_value):
+        return_value = default_value
+        if variable in self.get_default_parameters():
+            return_value = self.get_default_parameters()[variable]
+        elif self.PROTEINDB in self.get_default_parameters() and \
+                self.FILTER_INFO in self.get_default_parameters()[self.PROTEINDB] and \
+                variable in self.get_default_parameters()[self.PROTEINDB][self.FILTER_INFO]:
+            return_value = self.get_default_parameters()[self.PROTEINDB][self.FILTER_INFO][variable]
+        return return_value
+
     def get_cancer_studies(self):
         """
         This method fetches the list of all cancer studies from the cBioPortal API, stores it and returns it as text.
         :return:
         """
-        server = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_CBIOPORTAL_API][
-            self.CONFIG_CBIOPORTAL_API_SERVER]
-        endpoint = self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_CBIOPORTAL_API][
-            self.CONFIG_CBIOPORTAL_API_CANCER_STUDIES]
+        server = self._cbioportal_base_url
+        endpoint = self._cancer_studies_command
         self._cbioportal_studies = call_api_raw(server + "?" + endpoint).text
         return self._cbioportal_studies
 
-    def download_study(self, download_study, url_file_name = None):
+    def download_study(self, download_study, url_file_name=None):
         """
         This function will download a study from cBioPortal using the study ID
         :param download_study: Study to be downloaded; if the study is empty or None, all the studies will be
@@ -80,7 +129,6 @@ def download_study(self, download_study, url_file_name = None):
         if url_file_name is not None:
             url_file = open(url_file_name, 'w')
 
-
         if self._cbioportal_studies is None or len(self._cbioportal_studies) == 0:
             self.get_cancer_studies()
 
@@ -106,15 +154,15 @@ def download_study(self, download_study, url_file_name = None):
             else:
                 for row in csv_reader:
                     if line_count != 0:
-                        self.download_one_study(row[0], url_file = url_file)
+                        self.download_one_study(row[0], url_file=url_file)
                     line_count = line_count + 1
 
-    def download_one_study(self, download_study, url_file = None):
+    def download_one_study(self, download_study, url_file=None):
         file_name = '{}.tar.gz'.format(download_study)
-        file_url = '{}/{}'.format(
-            self.get_default_parameters()[self.CONFIG_KEY_DATA_DOWNLOADER][self.CONFIG_KEY_CBIOPORTAL_DOWNLOAD_URL],
-            file_name)
-        file_name = download_file(file_url = file_url, file_name = self.get_local_path_root_cbioportal_repo() + '/' + file_name, log = self.get_logger(), url_file = url_file)
+        file_url = '{}/{}'.format(self._cbioportal_download_url, file_name)
+        file_name = download_file(file_url=file_url,
+                                  file_name=self.get_local_path_root_cbioportal_repo() + '/' + file_name,
+                                  log=self.get_logger(), url_file=url_file)
         if file_name is not None:
             msg = "The following study '{}' has been downloaded. ".format(download_study)
         else: