diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index d09528b..557aecc 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -5,9 +5,9 @@ name: Python application on: push: - branches: [ master ] + branches: [ spectrumAI ] pull_request: - branches: [ master ] + branches: [ spectrumAI ] jobs: build: @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] steps: - uses: actions/checkout@v2 @@ -70,6 +70,6 @@ jobs: pip install pytest-cov python setup.py install cd pypgatk - pytest --cov=./ --cov-report=xml tests/* + pytest -s --cov=./ --cov-report=xml tests/* diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index a52ce4f..fc6ad50 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,9 +5,9 @@ name: Python package on: push: - branches: [ master ] + branches: [ spectrumAI ] pull_request: - branches: [ master ] + branches: [ spectrumAI ] jobs: build: @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] steps: - uses: actions/checkout@v2 @@ -47,4 +47,4 @@ jobs: run: | python setup.py install cd pypgatk - python tests/pypgatk_tests.py + python -s tests/pypgatk_tests.py diff --git a/.gitignore b/.gitignore index ebe9f6e..7f129a2 100644 --- a/.gitignore +++ b/.gitignore @@ -113,3 +113,5 @@ pypgatk/database_cbioportal/ pypgatk/database_cosmic/ pypgatk/config/private/ .pypirc +/pypgatk/test_all.bed +/pypgatk/test_annotated.vcf diff --git a/conda-enviroment.yaml b/conda-enviroment.yaml index 6ca1884..186bee1 100644 --- a/conda-enviroment.yaml +++ b/conda-enviroment.yaml @@ -7,13 +7,18 @@ channels: - bioconda dependencies: - biopython - - Click=7.0 - - gffutils=0.10.1 - - numpy=1.16.3 - - PyYAML=5.1.2 - - requests=2.21.0 - - simplejson=3.16.0 - - ratelimit=2.2.1 + - Click + - gffutils + - numpy 
+ - PyYAML + - requests + - simplejson + - ratelimit + - pathos - bioconda::pyteomics - - pybedtools=0.8.2 + - pybedtools + - matplotlib - bioconda::pyopenms + - pytest + - tqdm + - pyahocorasick diff --git a/pypgatk/cgenomes/cbioportal_downloader.py b/pypgatk/cgenomes/cbioportal_downloader.py index 60ae7fb..7e39810 100644 --- a/pypgatk/cgenomes/cbioportal_downloader.py +++ b/pypgatk/cgenomes/cbioportal_downloader.py @@ -34,8 +34,8 @@ def __init__(self, config_data, pipeline_arguments): self._list_studies = [] self._multithreading = True - self._cbioportal_base_url = 'https://www.cbioportal.org/webservice.do' - self._cancer_studies_command = 'cmd=getCancerStudies' + self._cbioportal_base_url = 'https://www.cbioportal.org/api' + self._cancer_studies_command = 'studies' self._cbioportal_download_url = 'https://cbioportal-datahub.s3.amazonaws.com' @@ -111,13 +111,13 @@ def get_cancer_studies(self): """ server = self._cbioportal_base_url endpoint = self._cancer_studies_command - self._cbioportal_studies = call_api_raw(server + "?" + endpoint).text + self._cbioportal_studies = call_api_raw(server + "/" + endpoint).text return self._cbioportal_studies def download_study(self, download_study, url_file_name=None): """ This function will download a study from cBioPortal using the study ID - :param download_study: Study to be download, if the study is empty or None, all the studies will be + :param download_study: Study to be downloaded, if the study is empty or None, all the studies will be downloaded. :param url_file_name: file tsv containing the urls to be downloaded. 
:return: None diff --git a/pypgatk/commands/blast_get_position.py b/pypgatk/commands/blast_get_position.py new file mode 100644 index 0000000..dede94a --- /dev/null +++ b/pypgatk/commands/blast_get_position.py @@ -0,0 +1,33 @@ +import logging + +import click + +from pypgatk.toolbox.general import read_yaml_from_file +from pypgatk.commands.utils import print_help +from pypgatk.proteogenomics.blast_get_position import BlastGetPositionService + +log = logging.getLogger(__name__) + +@click.command('blast_get_position', short_help='Blast peptide and refence protein database to find variation sites.') +@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline.') +@click.option('-i', '--input_psm_to_blast', help='The file name of the input PSM table to blast.') +@click.option('-o', '--output_psm', help='The file name of the output PSM table.') +@click.option('-r', '--input_reference_database', help='The file name of the refence protein database to blast. The reference database includes Uniprot Proteomes with isoforms, ENSEMBL, RefSeq, etc.') +@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. 
Default is 40.') + +@click.pass_context +def blast_get_position(ctx, config_file, input_psm_to_blast, output_psm, input_reference_database, number_of_processes): + config_data = None + if config_file is not None: + config_data = read_yaml_from_file(config_file) + + if input_psm_to_blast is None or input_reference_database is None or output_psm is None: + print_help() + pipeline_arguments = {} + if input_reference_database is not None: + pipeline_arguments[BlastGetPositionService.CONFIG_INPUT_REFERENCE_DATABASE] = input_reference_database + if number_of_processes is not None: + pipeline_arguments[BlastGetPositionService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes + + blast_get_position_service = BlastGetPositionService(config_data, pipeline_arguments) + blast_get_position_service.blast(input_psm_to_blast, output_psm) \ No newline at end of file diff --git a/pypgatk/commands/deeplc.py b/pypgatk/commands/deeplc.py deleted file mode 100644 index 253617c..0000000 --- a/pypgatk/commands/deeplc.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging - -import click - -from pypgatk.commands.utils import print_help -from pypgatk.proteomics.openms import OpenmsDataService -from pypgatk.toolbox.general import read_yaml_from_file - -log = logging.getLogger(__name__) - - -@click.command('generate-deeplc', short_help="Generate input for deepLC tool from idXML,mzTab or consensusXML") -@click.option('-c', '--config_file', help='Configuration to perform deepLC configuration file') -@click.option('-in', '--input-file', help='input idxml/consensusxml/mztab containing the peptide identifications') -@click.option('-out', '--output-file', help='Output for DeepLC tool') -@click.option('-d', '--decoy_prefix', help='Set accession prefix for decoy proteins in output. Default=DECOY_', - default='DECOY_') -@click.option('--peptide-classes-prefix', - help='Peptides classes e.g. 
\"altorf,pseudo,ncRNA,COSMIC,cbiomut,var_mut,var_rs\"') -@click.option('--novel-peptides', help='This parameter will allow to remove from the peptide list the novel peptides', - is_flag=True) -@click.pass_context -def generate_deeplc(ctx, config_file, input_file, output_file, decoy_prefix: str, peptide_classes_prefix: str, - novel_peptides: bool): - - config_data = None - if config_file is not None: - config_data = read_yaml_from_file(config_file) - - if input_file is None or output_file is None: - print_help() - - pipeline_arguments = {} - - openms_analyzer = OpenmsDataService(config_data, pipeline_arguments) - openms_analyzer._generate_deepLC_file(input_xml=input_file, output_deepLC=output_file, decoy_pattern=decoy_prefix, - peptide_class_prefix=peptide_classes_prefix, novel_peptides=novel_peptides) diff --git a/pypgatk/commands/msrescore.py b/pypgatk/commands/msrescore.py deleted file mode 100644 index 27d6c9d..0000000 --- a/pypgatk/commands/msrescore.py +++ /dev/null @@ -1,33 +0,0 @@ -import logging - -import click - -from pypgatk.commands.utils import print_help -from pypgatk.proteomics.openms import OpenmsDataService -from pypgatk.toolbox.general import read_yaml_from_file - -log = logging.getLogger(__name__) - - -@click.command('msrescore-configuration', short_help="Command to generate the msrescore configuration file from idXML") -@click.option('-c', '--config_file', help='Configuration to perform msrescore configuration file') -@click.option('-in', '--input-file', help='input idxml containing the peptide identifications') -@click.option('-out', '--output-file', help='Output json configuration file for msrescore') -@click.option('--quant_method', help='Quantification method TMT or LFQ', type=click.Choice(['LFQ', 'TMT'])) -@click.option('-d', '--decoy_prefix', help='Set accession prefix for decoy proteins in output. 
Default=DECOY_', - default='DECOY_') -@click.pass_context -def msrescore_configuration(ctx, config_file, input_file, output_file, quant_method, decoy_prefix: str): - - config_data = None - if config_file is not None: - config_data = read_yaml_from_file(config_file) - - if input_file is None or output_file is None: - print_help() - - pipeline_arguments = {} - - openms_analyzer = OpenmsDataService(config_data, pipeline_arguments) - openms_analyzer._generate_msrescore_file(input_xml=input_file, quant_method=quant_method, output_json=output_file, - decoy_pattern=decoy_prefix) diff --git a/pypgatk/commands/mztab_class_fdr.py b/pypgatk/commands/mztab_class_fdr.py new file mode 100644 index 0000000..d70256b --- /dev/null +++ b/pypgatk/commands/mztab_class_fdr.py @@ -0,0 +1,39 @@ +import logging + +import click + +from pypgatk.toolbox.general import read_yaml_from_file +from pypgatk.commands.utils import print_help +from pypgatk.proteogenomics.mztab_class_fdr import MzTabClassFdr + +log = logging.getLogger(__name__) + +@click.command('mztab_class_fdr', short_help='Extract psms from mzTab for global-fdr and class-fdr filtering') +@click.option('-c', '--config_file', help='Configuration file for the fdr peptides pipeline') +@click.option('-i', '--input_mztab', help='The file name of the input mzTab') +@click.option('-o', '--outfile_name', help='The file name of the psm table filtered by global-fdr and class-fdr') +@click.option('-d', '--decoy_prefix', help='Default is "decoy"') +@click.option('-gf', '--global_fdr_cutoff', help='PSM peptide global-fdr cutoff or threshold. Default is 0.01') +@click.option('-cf', '--class_fdr_cutoff', help='PSM peptide class-fdr cutoff or threshold. Default is 0.01') +@click.option('-g', '--peptide_groups_prefix', help="Peptide class " + "groups e.g. 
\"{non_canonical:[altorf,pseudo,ncRNA];mutations:[COSMIC,cbiomut];variants:[var_mut,var_rs]}\"") +@click.pass_context +def mztab_class_fdr(ctx, config_file, input_mztab, outfile_name, decoy_prefix, global_fdr_cutoff, class_fdr_cutoff, peptide_groups_prefix): + config_data = None + if config_file is not None: + config_data = read_yaml_from_file(config_file) + + if input_mztab is None or outfile_name is None: + print_help() + pipeline_arguments = {} + if decoy_prefix is not None: + pipeline_arguments[MzTabClassFdr.CONFIG_DECOY_PREFIX] = decoy_prefix + if global_fdr_cutoff is not None: + pipeline_arguments[MzTabClassFdr.CONFIG_GLOBAL_FDR_CUTOFF] = global_fdr_cutoff + if class_fdr_cutoff is not None: + pipeline_arguments[MzTabClassFdr.CONFIG_CLASS_FDR_CUTOFF] = class_fdr_cutoff + if peptide_groups_prefix is not None: + pipeline_arguments[MzTabClassFdr.CONFIG_PEPTIDE_GROUPS_PREFIX] = peptide_groups_prefix + + mzTab_class_fdr = MzTabClassFdr(config_data, pipeline_arguments) + mzTab_class_fdr.form_mztab_class_fdr(input_mztab, outfile_name) diff --git a/pypgatk/commands/peptide_class_fdr.py b/pypgatk/commands/peptide_class_fdr.py index d27dbc5..50538e4 100644 --- a/pypgatk/commands/peptide_class_fdr.py +++ b/pypgatk/commands/peptide_class_fdr.py @@ -14,7 +14,7 @@ @click.option('-c', '--config_file', help='Configuration to perform Peptide Class FDR') @click.option('-in', '--input-file', help='input file with the peptides and proteins') @click.option('-out', '--output-file', help='idxml from openms with filtered peptides and proteins') -@click.option("--file-type") +@click.option("--file-type", help="File types supported by the tool (TSV (.tsv), IDXML (.idxml), MZTAB (.mztab))") @click.option('--min-peptide-length', help='minimum peptide length') @click.option('--psm-pep-fdr-cutoff', help="PSM peptide FDR cutoff or threshold") @click.option('--psm-pep-class-fdr-cutoff', help="PSM class peptide FDR cutoff or threshold") @@ -48,7 +48,7 @@ def peptide_class_fdr(ctx, 
config_file, input_file, output_file, file_type, min_ :param psm_pep_class_fdr_cutoff: Peptide class FDR cutoff :param peptide_groups_prefix: Peptide groups prefix for the Peptide classes FDR :param peptide_classes_prefix: Peptide classes - :param file_type: File type to compute the FDR and class FDR. + :param file_type: File type to compute the FDR and class FDR () :param disable_class_fdr: Do not compute class FDR and not filtering the PSMs :return: """ diff --git a/pypgatk/commands/validate_peptides.py b/pypgatk/commands/validate_peptides.py new file mode 100644 index 0000000..0ecbb47 --- /dev/null +++ b/pypgatk/commands/validate_peptides.py @@ -0,0 +1,55 @@ +import logging + +import click + +from pypgatk.toolbox.general import read_yaml_from_file +from pypgatk.proteogenomics.validate_peptides import ValidatePeptidesService +from pypgatk.commands.utils import print_help + +log = logging.getLogger(__name__) + + +@click.command('validate_peptides', + short_help='Command to inspect MS2 spectra of single-subsititution peptide identifications') +@click.option('-c', '--config_file', help='Configuration file for the validate peptides pipeline') +@click.option('-p', '--mzml_path', help='The mzml file path.You only need to use either mzml_path or mzml_files') +@click.option('-f', '--mzml_files', + help='The mzml files.Different files are separated by ",".You only need to use either mzml_path or mzml_files') +@click.option('-i', '--infile_name', help='Variant peptide PSMs table') +@click.option('-o', '--outfile_name', help='Output file for the results') +@click.option('-ion', '--ions_tolerance', help='MS2 fragment ions mass accuracy') +@click.option('-n', '--number_of_processes', help='Used to specify the number of processes. 
Default is 40.') +@click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on', + is_flag=True) +@click.option('-msgf', '--msgf', + help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format', + is_flag=True) +@click.pass_context +def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance, + number_of_processes, relative, msgf): + config_data = None + if config_file is not None: + config_data = read_yaml_from_file(config_file) + + validate_flag = bool(infile_name and (mzml_path or mzml_files) and outfile_name) + if not validate_flag: + print_help() + + pipeline_arguments = {} + + if mzml_path is not None: + pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_PATH] = mzml_path + if mzml_files is not None: + pipeline_arguments[ValidatePeptidesService.CONFIG_MZML_FILES] = mzml_files + if ions_tolerance is not None: + pipeline_arguments[ValidatePeptidesService.CONFIG_IONS_TOLERANCE] = ions_tolerance + if number_of_processes is not None: + pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes + if relative is not None: + pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative + if msgf is not None: + pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf + + validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments) + if validate_flag: + validate_peptides_service.validate(infile_name, outfile_name) diff --git a/pypgatk/config/cbioportal_config.yaml b/pypgatk/config/cbioportal_config.yaml index 6be78e1..1aace6d 100644 --- a/pypgatk/config/cbioportal_config.yaml +++ b/pypgatk/config/cbioportal_config.yaml @@ -2,8 +2,8 @@ cbioportal_data_downloader: output_directory: database_cbioportal list_studies: [] cbioportal_api: - base_url: https://www.cbioportal.org/webservice.do - cancer_studies: cmd=getCancerStudies + base_url: 
https://www.cbioportal.org/api + cancer_studies: studies cbioportal_download_url: https://cbioportal-datahub.s3.amazonaws.com logger: formatters: diff --git a/pypgatk/config/ensembl_downloader_config.yaml b/pypgatk/config/ensembl_downloader_config.yaml index 938638d..5b24327 100644 --- a/pypgatk/config/ensembl_downloader_config.yaml +++ b/pypgatk/config/ensembl_downloader_config.yaml @@ -26,7 +26,7 @@ ensembl_data_downloader: - chr_patch_hapl_scaff. file_extension: gtf ensembl_api: - server: http://rest.ensembl.org + server: https://rest.ensembl.org species: /info/species logger: formatters: diff --git a/pypgatk/ensembl/data_downloader.py b/pypgatk/ensembl/data_downloader.py index c215270..645bbfa 100644 --- a/pypgatk/ensembl/data_downloader.py +++ b/pypgatk/ensembl/data_downloader.py @@ -56,7 +56,7 @@ def __init__(self, config_file, pipeline_arguments): super(EnsemblDataDownloadService, self).__init__(self.CONFIG_KEY_DATA_DOWNLOADER, config_file, pipeline_arguments) - self._rest_api = 'http://rest.ensembl.org' + self._rest_api = 'https://rest.ensembl.org' self._rest_endpoint = '/info/species' self._skip_protein_database = self.get_data_download_parameters(variable=self.CONFIG_KEY_SKIP_PROTEIN, diff --git a/pypgatk/ensembl/ensembl.py b/pypgatk/ensembl/ensembl.py index 7762292..76a2522 100644 --- a/pypgatk/ensembl/ensembl.py +++ b/pypgatk/ensembl/ensembl.py @@ -252,7 +252,6 @@ def get_features(db, feature_id, feature_types=None): also genomic positions for all its elements (exons/cds&start_codon) :param db: :param feature_id: - :param biotype_str: :param feature_types: :return: """ diff --git a/pypgatk/proteogenomics/__init__.py b/pypgatk/proteogenomics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypgatk/proteogenomics/blast_get_position.py b/pypgatk/proteogenomics/blast_get_position.py new file mode 100644 index 0000000..2874df2 --- /dev/null +++ b/pypgatk/proteogenomics/blast_get_position.py @@ -0,0 +1,171 @@ +import pandas as pd +from Bio 
import pairwise2, SeqIO +import datetime +from pathos.multiprocessing import ProcessingPool as Pool +from multiprocessing import Manager +from tqdm import tqdm +import ahocorasick + +from pypgatk.toolbox.general import ParameterConfiguration + + +def _blast_set(fasta_set, peptide): + length = len(peptide) + position_set = set() + for fasta in fasta_set: + if len(fasta) >= length: + alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True) + if alignments_score == length: + return "canonical" + elif alignments_score == length - 1: + alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0) + for alignment in alignments_local: + # insertion e.g., ABCDMEFGH<----ABCDEFGH + if alignment.end - alignment.start == length + 1: + s = fasta[alignment.start:alignment.end] + for i in range(length): + if peptide[i] != s[i]: + position_set.add(i + 1) + break + # substitution e.g., ABCDMFGH<----ABCDEFGH + elif alignment.end - alignment.start == length: + s = fasta[alignment.start:alignment.end] + for i in range(length): + if peptide[i] != s[i]: + position_set.add(i + 1) + break + # substitution e.g., ABCDEFGM<----ABCDEFGH + elif alignment.end - alignment.start == length - 1: + s = fasta[alignment.start:alignment.end] + if peptide[0] != s[0]: + position_set.add(1) + elif peptide[-1] != s[-1]: + position_set.add(length) + elif alignments_score == length - 2: + alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1, + open=-1, extend=0) + for alignment in alignments_local: + # deletion e.g., ABCEFGH<----ABCDEFGH + if alignment.end - alignment.start == length and alignment.score == length - 2: + s = fasta[alignment.start:alignment.end - 1] + if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0, + extend=0, score_only=True) == length - 1: + for i in range(length - 1): + 
if peptide[i] != s[i]: + position_set.add(i + 1) + break + if position_set: + return position_set + else: + return "non-canonical" + + +class BlastGetPositionService(ParameterConfiguration): + CONFIG_KEY_BlastGetPosition = 'blast_get_position' + # CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix' + CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database' + CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes' + + def __init__(self, config_data, pipeline_arguments): + """ + init the class with the specific parameters. + :param config_data configuration file + :param pipeline_arguments pipelines arguments + """ + + super(BlastGetPositionService, self).__init__(self.CONFIG_KEY_BlastGetPosition, config_data, pipeline_arguments) + self._input_reference_database = self.get_blast_parameters(variable=self.CONFIG_INPUT_REFERENCE_DATABASE, + default_value='') + self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES, + default_value='40') + + self.fa_set = set() + for j in SeqIO.parse(self._input_reference_database, "fasta"): + self.fa_set.add(str(j.seq)) + self.blast_dict = Manager().dict() + + def get_blast_parameters(self, variable: str, default_value): + value_return = default_value + if variable in self.get_pipeline_parameters(): + value_return = self.get_pipeline_parameters()[variable] + elif self.CONFIG_KEY_BlastGetPosition in self.get_default_parameters() and \ + variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]: + value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable] + return value_return + + def _blast_canonical(self, df): + seq_set = set(df["sequence"].to_list()) + + auto = ahocorasick.Automaton() + seq_dict = dict() + for seq_peptide in seq_set: + auto.add_word(seq_peptide, seq_peptide) + seq_dict[seq_peptide] = "waiting for blast" + + auto.make_automaton() + + for protein_seq in self.fa_set: + for end_ind, found in auto.iter(protein_seq): + 
seq_dict[found] = "canonical" + print("Found", found, "at position", end_ind, "in protein sequence") + + df["position"] = df["sequence"].map(seq_dict) + return df + + def _result(self, sequence): + self.blast_dict[sequence] = _blast_set(self.fa_set, sequence) + + def blast(self, input_psm_to_blast, output_psm): + start_time = datetime.datetime.now() + print("Start time :", start_time) + + psm = pd.read_table(input_psm_to_blast, header=0, sep="\t") + psm = self._blast_canonical(psm) + + first_filter = psm[psm.position == "canonical"] + psm_to_blast = psm[psm.position == "waiting for blast"] + psm_to_blast = psm_to_blast.copy() + + # Remove duplicate sequences + seq_set = set(psm_to_blast["sequence"].to_list()) + seq_list = list(seq_set) + + pool = Pool(int(self._number_of_processes)) + list(tqdm(pool.imap(self._result, seq_list), total=len(seq_list), desc="Blast", unit="peptide")) + + pool.close() + pool.join() + + psm_to_blast["position"] = psm.apply(lambda x: self.blast_dict.get(x["sequence"]), axis=1) + + second_filter = psm_to_blast[psm_to_blast.position == "canonical"] + non_filter = psm_to_blast[psm_to_blast.position == "non-canonical"] + + psm_to_findpos = psm_to_blast[psm_to_blast.position != "canonical"] + psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"] + + if len(psm_to_findpos) > 0: + psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1) + psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])] + psm_to_findpos["var_num"].iloc[0] = 0 + psm_id = psm_to_findpos["PSM_ID"].iloc[0] + for i in range(1, psm_to_findpos.shape[0]): + if psm_to_findpos["PSM_ID"].iloc[i] == psm_id: + psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1 + else: + psm_id = psm_to_findpos["PSM_ID"].iloc[i] + psm_to_findpos["var_num"].iloc[i] = 0 + psm_to_findpos["position"] = psm_to_findpos.apply( + lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]], + 
axis=1) + psm_to_findpos.drop(columns="var_num", axis=1, inplace=True) + psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1) + + all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer') + all_psm_out = all_psm_out.sort_values("PSM_ID") + all_psm_out.to_csv(output_psm, header=1, sep="\t", index=None) + + end_time = datetime.datetime.now() + print("End time :", end_time) + set_time_taken = end_time - start_time + print("Time consumption :", set_time_taken) diff --git a/pypgatk/proteogenomics/mztab_class_fdr.py b/pypgatk/proteogenomics/mztab_class_fdr.py new file mode 100644 index 0000000..a5935ff --- /dev/null +++ b/pypgatk/proteogenomics/mztab_class_fdr.py @@ -0,0 +1,146 @@ +import re +import pandas as pd +import datetime + +from pypgatk.toolbox.general import ParameterConfiguration + + +class MzTabClassFdr(ParameterConfiguration): + CONFIG_KEY_MzTabClassFdr = 'mzTab_class_fdr' + CONFIG_DECOY_PREFIX = 'decoy_prefix' + CONFIG_GLOBAL_FDR_CUTOFF = 'global_fdr_cutoff' + CONFIG_CLASS_FDR_CUTOFF = 'class_fdr_cutoff' + CONFIG_PEPTIDE_GROUPS_PREFIX = 'peptide_groups_prefix' + + def __init__(self, config_data, pipeline_arguments): + """ + Init the class with the specific parameters. 
+ :param config_data configuration file + :param pipeline_arguments pipelines arguments + """ + super(MzTabClassFdr, self).__init__(self.CONFIG_KEY_MzTabClassFdr, config_data, pipeline_arguments) + self._decoy_prefix = self.get_fdr_parameters(variable=self.CONFIG_DECOY_PREFIX, default_value='decoy') + self._global_fdr_cutoff = self.get_fdr_parameters(variable=self.CONFIG_GLOBAL_FDR_CUTOFF, default_value=0.01) + self._class_fdr_cutoff = self.get_fdr_parameters(variable=self.CONFIG_CLASS_FDR_CUTOFF, default_value=0.01) + self._peptide_groups_prefix = self.get_fdr_parameters(variable=self.CONFIG_PEPTIDE_GROUPS_PREFIX, + default_value={ + 'non_canonical': ['altorf', 'pseudo', + 'ncRNA'], + 'mutations': ['COSMIC', 'cbiomut'], + 'variants': ['var_mut', 'var_rs']}) + self._psm_search_engine_score_order = {'1003113': True, '1003115': False, '1001493': True, '1001491': False} + + def get_fdr_parameters(self, variable: str, default_value): + value_return = default_value + if variable in self.get_pipeline_parameters(): + value_return = self.get_pipeline_parameters()[variable] + elif self.CONFIG_KEY_MzTabClassFdr in self.get_default_parameters() and \ + variable in self.get_default_parameters()[self.CONFIG_KEY_MzTabClassFdr]: + value_return = self.get_default_parameters()[self.CONFIG_KEY_MzTabClassFdr][variable] + return value_return + + @staticmethod + def _get_mzml_name(run, mtd): + key = run + '-location' + value = mtd.get(key) + return value.split("/")[-1] + + def _is_decoy(self, accessions): + accession_list = accessions.split(',') + if all(self._decoy_prefix in accession for accession in accession_list): + return 0 + else: + return 1 + + @staticmethod + def _is_group(peptide_group_members, accessions): + accession_group = 0 + accession_list = accessions.split(',') + for accession in accession_list: + for class_peptide in peptide_group_members: + if class_peptide in accession: + accession_group += 1 + return len(accession_list) == accession_group + + @staticmethod + def 
_compute_global_fdr(df_psms, order): + df_psms.sort_values("search_engine_score[1]", ascending=order, inplace=True) + df_psms['FDR'] = (range(1, len(df_psms) + 1) / df_psms['target'].cumsum()) - 1 + df_psms['q-value'] = df_psms['FDR'][::-1].cummin()[::-1] + + df_psms.sort_values("search_engine_score[1]", ascending=order, inplace=True) + + return df_psms + + def _compute_class_fdr(self, df_psms, order): + ls = [] + for c in self._peptide_groups_prefix: + # split the dataframe and save the subset + curr_class = df_psms[ + df_psms['accession'].apply(lambda x: self._is_group(self._peptide_groups_prefix[c], x))] + ls.append(curr_class) + + # If there is no decoy to throw an exception + if len(curr_class[curr_class["target"] == 0]) == 0: + print("Warning:There is no peptide or decoy of " + c + ", and this kind of class-fdr has been skipped.") + + # calculate class-specific q-value + curr_class.sort_values("search_engine_score[1]", ascending=order, inplace=True) + fdr = (range(1, len(curr_class["target"]) + 1) / curr_class["target"].cumsum()) - 1 + curr_class['class-specific-q-value'] = fdr[::-1].cummin()[::-1] + df = pd.concat(ls) + + # df_psms['class-specific-q-value'] = df['class-specific-q-value'] + df_psms = df_psms.merge(df['class-specific-q-value'], left_index=True, right_index=True, how='left') + df_psms.loc[df_psms['class-specific-q-value'].isnull(), 'class-specific-q-value'] = df_psms['q-value'] + df_psms.sort_values("search_engine_score[1]", ascending=order, inplace=True) + + return df_psms + + def form_mztab_class_fdr(self, input_mztab, outfile_name): + start_time = datetime.datetime.now() + print("Start time :", start_time) + + file = open(input_mztab, "r") + list_lines = file.readlines() + + # Extract psms information + psm = [] + mtd_dict = dict() + for i in list_lines: + i = i.strip("\n") + row_list = i.split('\t') + if row_list[0] == "MTD": + mtd_dict[row_list[1]] = row_list[2] + elif row_list[0] == "PSH": + psm_cols = row_list + elif row_list[0] == 
"PSM": + psm.append(row_list) + + psm_search_engine = mtd_dict.get("psm_search_engine_score[1]").split("MS:")[1][:7] + order = self._psm_search_engine_score_order.get(psm_search_engine) + + # Convert to dataframe + psm = pd.DataFrame(psm, columns=psm_cols) + psm.loc[:, "SpecFile"] = psm.apply(lambda x: self._get_mzml_name(x["spectra_ref"].split(":")[0], mtd_dict), + axis=1) + psm.loc[:, "ScanNum"] = psm.apply(lambda x: re.sub("[^\d]", "", x["spectra_ref"].split(":")[-1].split(" ")[-1]), + axis=1) + + psm.loc[:, "target"] = psm.apply(lambda x: self._is_decoy(x["accession"]), axis=1) + if len(psm[psm["target"] == 0]) == 0: + raise ValueError( + "There is not enough decoys to calculate fdr.") + + psm = self._compute_global_fdr(psm, order) + psm = self._compute_class_fdr(psm, order) + psm = psm[((psm['q-value'] < float(self._global_fdr_cutoff)) + & (psm['class-specific-q-value'] < float(self._class_fdr_cutoff)))] + psm.reset_index(drop=True, inplace=True) + + psm.to_csv(outfile_name, header=1, sep="\t", index=None) + + end_time = datetime.datetime.now() + print("End time :", end_time) + time_taken = end_time - start_time + print("Time consumption :", time_taken) diff --git a/pypgatk/proteogenomics/validate_peptides.py b/pypgatk/proteogenomics/validate_peptides.py new file mode 100644 index 0000000..03b393f --- /dev/null +++ b/pypgatk/proteogenomics/validate_peptides.py @@ -0,0 +1,330 @@ +import datetime +import os.path +import re +import pandas as pd +from pathos.multiprocessing import ProcessingPool as Pool +from multiprocessing import Manager +from pyopenms import (TheoreticalSpectrumGenerator, MSSpectrum, + AASequence, Param, MzMLFile, MSExperiment, SpectrumLookup) +from tqdm import tqdm +from pypgatk.toolbox.general import ParameterConfiguration + + +class ValidatePeptidesService(ParameterConfiguration): + CONFIG_KEY_VALIDATE_PEPTIDES = 'validate_peptides' + CONFIG_MZML_PATH = 'mzml_path' + CONFIG_MZML_FILES = 'mzml_files' + CONFIG_INFILE_NAME = 'infile_name' + 
# Members of ValidatePeptidesService. The class header and the first CONFIG_* keys
# (CONFIG_KEY_VALIDATE_PEPTIDES, CONFIG_MZML_PATH, CONFIG_MZML_FILES, ...) precede this chunk.
CONFIG_OUTFILE_NAME = 'outfile_name'
CONFIG_IONS_TOLERANCE = 'ions_tolerance'
CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
CONFIG_RELATIVE = 'relative'
CONFIG_MSGF = 'msgf'


def __init__(self, config_data, pipeline_arguments):
    """
    Init the class with the specific parameters.

    :param config_data: configuration file
    :param pipeline_arguments: pipelines arguments
    """
    super(ValidatePeptidesService, self).__init__(self.CONFIG_KEY_VALIDATE_PEPTIDES, config_data,
                                                  pipeline_arguments)

    self._mzml_path = self.get_validate_parameters(variable=self.CONFIG_MZML_PATH, default_value=False)
    self._mzml_files = self.get_validate_parameters(variable=self.CONFIG_MZML_FILES, default_value=False)
    self._ions_tolerance = self.get_validate_parameters(variable=self.CONFIG_IONS_TOLERANCE, default_value=0.02)
    self._relative = self.get_validate_parameters(variable=self.CONFIG_RELATIVE, default_value=False)
    self._msgf = self.get_validate_parameters(variable=self.CONFIG_MSGF, default_value=False)
    self._number_of_processes = self.get_validate_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
                                                             default_value=40)

    # Shared list the worker processes append their per-file result tables to.
    self.df_list = Manager().list()


def get_validate_parameters(self, variable: str, default_value):
    """
    Look up one setting of this service.

    Pipeline (command line) parameters win; otherwise the value found under the
    service's own section of the default parameters is used; otherwise ``default_value``.

    :param variable: name of the setting
    :param default_value: value returned when the setting is defined nowhere
    :return: the resolved value
    """
    pipeline_parameters = self.get_pipeline_parameters()
    if variable in pipeline_parameters:
        return pipeline_parameters[variable]

    defaults = self.get_default_parameters()
    section_key = self.CONFIG_KEY_VALIDATE_PEPTIDES
    if section_key in defaults and variable in defaults[section_key]:
        return defaults[section_key][variable]
    return default_value


def _predict_MS2_spectrum(self, peptide, size, product_ion_charge=1):
    """
    Build the table of theoretical fragment ions for a peptide.

    :param peptide: peptidoform string (for MSGF input: with inline ``+mass`` modifications)
    :param size: number of residues of the peptide
    :param product_ion_charge: when > 1 the doubly charged ions are added as well
    :return: DataFrame with the columns ``mz``, ``ion``, ``z``, ``pos`` and ``type``
    """
    if self._msgf:
        peptide = re.sub(r"[-?]", "", peptide)
        # Wrap every "+<mass>" modification in brackets, e.g. "C+57.021" -> "C[+57.021]".
        peptide = re.sub(r"(\+\d+\.\d+)", r"[\1]", peptide)

    generator = TheoreticalSpectrumGenerator()
    spectrum = MSSpectrum()
    sequence = AASequence.fromString(peptide)
    # size = len(sequence.toUnmodifiedString())
    params = Param()
    params.setValue("add_metainfo", "true")
    params.setValue("add_first_prefix_ion", "true")
    params.setValue("add_precursor_peaks", "true")
    generator.setParameters(params)
    generator.getSpectrum(spectrum, sequence, 1, 1)  # charge range 1:1

    ion_names = [annotation.decode() for annotation in spectrum.getStringDataArrays()[0]]
    mz_values = [peak.getMZ() for peak in spectrum]
    ions = pd.DataFrame({"mz": mz_values, "ion": ion_names, "z": 1})

    # NOTE(review): relies on the peak order produced by the generator. Presumably the last three
    # peaks are the precursor peaks; the first is relabelled b<size>, the second dropped and the
    # third relabelled y<size> -- confirm against the generator's output.
    ions.loc[2 * size - 2, "ion"] = "b" + str(size)
    ions = ions.drop(2 * size - 1)
    ions.loc[2 * size, "ion"] = "y" + str(size)

    ions["ion"] = [name.replace("+", "") for name in ions["ion"]]
    ions["pos"] = [re.sub(r"[^\d]", "", name) for name in ions["ion"]]
    ions["type"] = [re.sub(r"[^a-z]", "", name) for name in ions["ion"]]

    proton_mono_mass = 1.007276
    if product_ion_charge > 1:
        doubly_charged = ions.copy()
        doubly_charged["mz"] = (doubly_charged["mz"] + proton_mono_mass) / 2
        doubly_charged["z"] = 2
        ions = ions.merge(doubly_charged, how='outer')

    return ions.reset_index(drop=True)


@staticmethod
def _get_intensity(exp_peak, ion_mz):
    """
    Return the intensity of the experimental peak whose m/z is closest to ``ion_mz``.

    ``exp_peak`` is left untouched (no scratch column is added to it).

    :param exp_peak: DataFrame with the columns ``mz`` and ``intensity``
    :param ion_mz: m/z to look for
    :raises ValueError: if ``exp_peak`` has no rows
    """
    nearest = (exp_peak["mz"] - float(ion_mz)).abs().idxmin()
    return exp_peak.loc[nearest, "intensity"]


def _match_exp2predicted(self, exp_peak, pred_peak):
    """
    Keep the predicted ions that have an experimental peak within the tolerance.

    Adds the columns ``error`` (absolute m/z distance to the closest experimental peak),
    ``intensity`` (intensity of that peak) and ``ppm`` (error relative to the predicted m/z)
    to ``pred_peak``. The tolerance is compared against ``ppm`` when ``self._relative`` is set,
    otherwise against ``error``.

    :param exp_peak: DataFrame with the columns ``mz`` and ``intensity``
    :param pred_peak: DataFrame with at least the column ``mz``
    :return: the matching rows of ``pred_peak`` with a fresh index; empty when the
             experimental spectrum has no peaks
    """
    if exp_peak.empty:
        # Nothing can match an empty spectrum; hand back an empty table with the usual columns.
        no_match = pred_peak.iloc[0:0].copy()
        for column in ("error", "intensity", "ppm"):
            no_match[column] = pd.Series(dtype=float)
        return no_match.reset_index(drop=True)

    exp_mz = exp_peak["mz"]
    errors = []
    intensities = []
    for predicted_mz in pred_peak["mz"]:
        deltas = (exp_mz - float(predicted_mz)).abs()
        nearest = deltas.idxmin()
        errors.append(deltas.loc[nearest])
        intensities.append(exp_peak.loc[nearest, "intensity"])

    pred_peak["error"] = errors
    pred_peak["intensity"] = intensities
    pred_peak["ppm"] = [round(error / float(predicted_mz) * 1000000, 2)
                        for error, predicted_mz in zip(errors, pred_peak["mz"])]

    tolerance = float(self._ions_tolerance)
    measure = "ppm" if self._relative else "error"
    return pred_peak[pred_peak[measure] < tolerance].reset_index(drop=True)


@staticmethod
def _flanking_ions(length, position, matched_names):
    """
    Determine which ions flanking the variant position were matched.

    :param length: number of residues of the peptide
    :param position: 1-based position of the variant residue
    :param matched_names: set of matched ion names such as ``{"b3", "y5"}``
    :return: tuple (set of matched flanking ions, True when the flanking evidence is sufficient)
    """
    if position == 1:
        found = {"b1", "y" + str(length - 1)} & matched_names
        return found, bool(found)
    if position == length:
        found = {"y1", "b" + str(length - 1)} & matched_names
        return found, bool(found)

    # Inner positions need evidence on both sides of the residue.
    left = {"b" + str(position - 1), "y" + str(length - position + 1)} & matched_names
    right = {"b" + str(position), "y" + str(length - position)} & matched_names
    return left | right, bool(left) and bool(right)


@staticmethod
def _sum_ion_intensity(match_ions, ion_names):
    """
    Sum the intensity of exactly the named ions.

    Names are compared for equality, so "b1" does not pick up "b10" or "b11".
    """
    return match_ions.loc[match_ions["ion"].isin(ion_names), "intensity"].sum()


def _inspect_spectrum(self, df, mzml_path, mzml_files):
    """
    Annotate every PSM of one spectra file with its fragment-ion evidence.

    :param df: PSMs of a single ``SpecFile`` with a 0..n-1 index; the columns ``SpecFile``,
               ``ScanNum`` and ``position`` are read, plus ``Peptide`` (MSGF input) or ``sequence``
               and ``opt_global_cv_MS:1000889_peptidoform_sequence`` (otherwise)
    :param mzml_path: folder containing the mzML files (exclusive with ``mzml_files``)
    :param mzml_files: comma separated list of mzML files (exclusive with ``mzml_path``)
    :return: ``df`` with the evidence columns added; when the mzML file cannot be found or
             loaded, ``ions_support`` is set to "mzML ERROR" for all rows
    :raises ValueError: if both or neither of ``mzml_path`` / ``mzml_files`` are given
    """
    if self._msgf:
        df["peptide_length"] = [len(re.sub(r"[^A-Z]", "", peptide)) for peptide in df["Peptide"]]
    else:
        df["peptide_length"] = [len(sequence) for sequence in df["sequence"]]

    # The status literal (including its spelling) is part of the output format; keep it as is.
    df["status"] = "skiped"

    df["ions_support"] = "NO"
    df["support_ions"] = ""
    df["sum.supportions.intensity"] = float(0)

    df["flanking_ions_support"] = "NO"
    df["flanking_ions"] = ""
    df["sum.flanking.ions.intensity"] = float(0)

    df["matched_ions"] = ""
    df["sum.matchedions.intensity"] = float(0)
    df["sum.fragmentions.intensity"] = float(0)
    df["maxintensity"] = float(0)
    df["average_intensity"] = float(0)
    df["median_intensity"] = float(0)

    spectra_file = str(df.loc[0, "SpecFile"])
    if mzml_files and not mzml_path:
        mzml_file = next((candidate for candidate in mzml_files.split(",") if spectra_file in candidate), None)
    elif mzml_path and not mzml_files:
        mzml_file = os.path.join(mzml_path, spectra_file)
    else:
        raise ValueError(
            "You only need to use either '--mzml_path' or '--mzml_files'.")

    def mzml_failure(label, reason):
        # Report the problem and flag every PSM of this file instead of aborting the whole run.
        print(str(label) + " has ERROR!")
        print(reason)
        df["ions_support"] = "mzML ERROR"
        return df

    if mzml_file is None:
        return mzml_failure(spectra_file, "no entry of '--mzml_files' matches this SpecFile")

    exp = MSExperiment()
    try:
        MzMLFile().load(mzml_file, exp)
        look = SpectrumLookup()
        # NOTE(review): the pattern looks truncated; SpectrumLookup expects a named SCAN group,
        # so this was probably "((?<SCAN>)\d+$)" -- confirm against upstream before changing.
        look.readSpectra(exp, r"((?)\d+$)")
    except Exception as e:
        return mzml_failure(mzml_file, e)

    for i in range(df.shape[0]):
        scan_num = int(df.loc[i, "ScanNum"])
        length = df.loc[i, "peptide_length"]
        if self._msgf:
            seq = re.sub(r"[^A-Z]", "", df.loc[i, "Peptide"])
        else:
            seq = df.loc[i, "sequence"]

        # get peaks through ScanNum
        try:
            index = look.findByScanNumber(scan_num)
        except Exception as e:
            print("ERROR: " + str(e) + "; file:" + str(mzml_file) + "; scan_num:" + str(scan_num))
            continue

        raw_peaks = exp.getSpectrum(index).get_peaks()
        exp_peaks = pd.DataFrame({"mz": raw_peaks[0], "intensity": raw_peaks[1]})

        if self._msgf:
            peptidoform = str(df.loc[i, "Peptide"])
        else:
            peptidoform = str(df.loc[i, "opt_global_cv_MS:1000889_peptidoform_sequence"])
        predicted_peaks = self._predict_MS2_spectrum(peptidoform, length, 1)
        match_ions = self._match_exp2predicted(exp_peaks, predicted_peaks)

        df.loc[i, "sum.fragmentions.intensity"] = exp_peaks["intensity"].sum()
        df.loc[i, "maxintensity"] = exp_peaks["intensity"].max()
        df.loc[i, "average_intensity"] = exp_peaks["intensity"].mean()
        df.loc[i, "median_intensity"] = exp_peaks["intensity"].median()

        if match_ions.shape[0] == 0:
            continue
        df.loc[i, "matched_ions"] = ','.join(match_ions["ion"].unique().tolist())
        df.loc[i, "sum.matchedions.intensity"] = match_ions["intensity"].sum()

        # Only PSMs carrying a numeric variant position inside the peptide are checked further.
        if df.loc[i, "position"] == "canonical":
            continue
        if df.loc[i, "position"] == "non-canonical":
            continue
        position = int(df.loc[i, "position"])
        if position == 0 or position > length:
            continue

        df.loc[i, "status"] = "checked"

        # Ions whose fragment contains the variant residue support the variant.
        supportions_intensity = 0
        supporting = []
        for ion, ion_type, pos, intensity in zip(match_ions["ion"], match_ions["type"],
                                                 match_ions["pos"], match_ions["intensity"]):
            pos = int(pos)
            if (ion_type == "b" and pos >= position) or (ion_type == "y" and pos > length - position):
                supporting.append(ion)
                supportions_intensity = supportions_intensity + intensity

        df.loc[i, "ions_support"] = "YES" if supporting else "NO"
        # Every ion is written with a leading comma; this is the established output format.
        df.loc[i, "support_ions"] = "".join("," + ion for ion in supporting)
        df.loc[i, "sum.supportions.intensity"] = supportions_intensity

        # check if it is a noise peak or isotope peak supporting mutant ions
        if df.loc[i, "sum.supportions.intensity"] < df.loc[i, "median_intensity"]:
            df.loc[i, "ions_support"] = "NO"

        flanking_ions, flanking_ok = self._flanking_ions(length, position, set(match_ions["ion"].tolist()))
        df.loc[i, "flanking_ions_support"] = "YES" if flanking_ok else "NO"
        # Sorted so that the column does not depend on set iteration order.
        df.loc[i, "flanking_ions"] = ",".join(sorted(flanking_ions))
        if flanking_ions:
            df.loc[i, "sum.flanking.ions.intensity"] = self._sum_ion_intensity(match_ions, flanking_ions)

        if df.loc[i, "sum.flanking.ions.intensity"] < df.loc[i, "median_intensity"]:
            df.loc[i, "flanking_ions_support"] = "NO"

        # fragmentation is not preferable at Cterm side of proline, so only require supporting ions
        if seq[position - 1:position] == "P":
            df.loc[i, "flanking_ions_support"] = df.loc[i, "ions_support"]

    return df


def _multiprocess_inspect_spectrum(self, df):
    """Worker entry point: inspect one spectra file and store the result in the shared list."""
    self.df_list.append(self._inspect_spectrum(df, self._mzml_path, self._mzml_files))


def validate(self, infile_name, outfile_name: str):
    """
    Validate all PSMs of a tab separated PSM table and write the annotated table.

    The PSMs are grouped by ``SpecFile`` and every group is inspected in a worker process.

    :param infile_name: tab separated PSM table with a ``SpecFile`` column
    :param outfile_name: path of the tab separated output table
    """
    start_time = datetime.datetime.now()
    print("Start time :", start_time)
    df_psm = pd.read_table(infile_name, header=0, sep="\t")

    list_of_dfs = [group_df.reset_index(drop=True) for _, group_df in df_psm.groupby("SpecFile")]

    if list_of_dfs:
        # Never start more workers than there are spectra files to process.
        workers = max(1, min(int(self._number_of_processes), len(list_of_dfs)))
        # The with-block tears the pool down even when a worker raises.
        with Pool(workers) as pool:
            list(tqdm(pool.imap(self._multiprocess_inspect_spectrum, list_of_dfs), total=len(list_of_dfs),
                      desc="Validate By Each mzMl", unit="mzML"))
            pool.close()
            pool.join()
        df_output = pd.concat(list(self.df_list), axis=0, ignore_index=True)
    else:
        # No PSM rows: there is nothing to concatenate, write the (empty) input table back out.
        df_output = df_psm
    df_output.to_csv(outfile_name, header=True, sep="\t", index=None)

    # if self._msgf:
    #   df_sub = df_output[df_output["status"] == "checked"]
    #   saav_psm_passed = df_sub[df_sub["flanking_ions_support"]=="YES"]["PrecursorError(ppm)"]
    #   saav_psm_failed = df_sub[df_sub["flanking_ions_support"]=="NO"]["PrecursorError(ppm)"]
    #   plot=plt.figure(figsize=(10,7))
    #   plot1=plot.add_subplot(1,2,1)
    #   plot2=plot.add_subplot(1,2,2)
    #   plot1.hist(saav_psm_passed,bins=20)
    #   plot1.set_xlabel("PrecursorError(ppm)")
    #   plot1.set_title("SpectrumAI curated")
    #   plot2.hist(saav_psm_failed,bins=20)
    #   plot2.set_xlabel("PrecursorError(ppm)")
    #   plot2.set_title("SpectrumAI discarded")
    #   plt.savefig("precursorError_histogram.pdf")

    end_time = datetime.datetime.now()
    print("End time :", end_time)
    time_taken = end_time - start_time
    print("Time consumption :", time_taken)


# NOTE(review): the rest of this chunk line is patch text for pypgatk/pypgatk_cli.py, not part of the
# class above. It is carried over unchanged apart from the comment prefix, which keeps this block loadable:
# diff --git a/pypgatk/pypgatk_cli.py b/pypgatk/pypgatk_cli.py index 916d415..f3e8217 100644 --- a/pypgatk/pypgatk_cli.py +++ b/pypgatk/pypgatk_cli.py @@ -19,9 +19,9 @@ from pypgatk.commands import dnaseq_to_proteindb as dnase_to_proteindb_cmd from pypgatk.commands import proteindb_decoy as proteindb_decoy_cmd from pypgatk.commands import peptide_class_fdr as peptide_class_fdr_cmd -from pypgatk.commands import msrescore as msrescore_configuration_cmd -from pypgatk.commands import deeplc as deeplc_cmd - +from pypgatk.commands
import validate_peptides as validate_peptides_cmd +from pypgatk.commands import mztab_class_fdr as mztab_class_fdr_cmd +from pypgatk.commands import blast_get_position as blast_get_position_cmd CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) @@ -46,8 +46,9 @@ def cli(): cli.add_command(proteindb_decoy_cmd.generate_database) cli.add_command(proteindb_decoy_cmd.generate_database) cli.add_command(peptide_class_fdr_cmd.peptide_class_fdr) -cli.add_command(msrescore_configuration_cmd.msrescore_configuration) -cli.add_command(deeplc_cmd.generate_deeplc) +cli.add_command(validate_peptides_cmd.validate_peptides) +cli.add_command(mztab_class_fdr_cmd.mztab_class_fdr) +cli.add_command(blast_get_position_cmd.blast_get_position) def main(): diff --git a/pypgatk/testdata/test_blast_psms.tsv b/pypgatk/testdata/test_blast_psms.tsv new file mode 100644 index 0000000..df17d2b --- /dev/null +++ b/pypgatk/testdata/test_blast_psms.tsv @@ -0,0 +1,4 @@ +PSH sequence PSM_ID accession unique database database_version search_engine search_engine_score[1] modifications retention_time charge exp_mass_to_charge calc_mass_to_charge spectra_ref pre post start end opt_global_q-value opt_global_cv_MS:1002217_decoy_peptide opt_global_cv_MS:1000889_peptidoform_sequence SpecFile ScanNum +PSM YHTINGHNAEVR 0 
"ENSP00000504571.1,ENSP00000503242.1,ENSP00000503961.1,ENSP00000504660.1,ENSP00000497298.1,ENSP00000503452.1,ENSP00000504799.1,ENSP00000503190.1,ENSP00000503898.1,ENSP00000503968.1,ENSP00000503885.1,ENSP00000504049.1,ENSP00000503550.1,ENSP00000503870.1,ENSP00000503521.1,ENSP00000503236.1,ENSP00000503360.1,ENSP00000503021.1,ENSP00000503915.1,ENSP00000503460.1,ENSP00000346694.4,ENSP00000478691.2,ENSP00000504439.1,ENSP00000504329.1,ENSP00000503476.1,ENSP00000504831.1,ENSP00000504023.1,ENSP00000504721.1,ENSP00000503514.1,ENSP00000503375.1,ENSP00000349101.8,ENSP00000503047.1,ENSP00000503833.1,ENSP00000503836.1,ENSP00000503703.1,ENSP00000503429.1,ENSP00000354021.4,ENSP00000504415.1,ENSP00000503060.1,ENSP00000503501.1,altorf_ENST00000679318.1_2,altorf_ENST00000677339.1_2,altorf_ENST00000678501.1_2,altorf_ENST00000676903.1_2,altorf_ENST00000608362.2_2,altorf_ENST00000677631.1_2,altorf_ENST00000676749.1_2,altorf_ENST00000678035.1_1,altorf_ENST00000678075.1_1,altorf_ENST00000678183.1_3,altorf_ENST00000679021.1_3,altorf_ENST00000677321.1_2,altorf_ENST00000677571.1_2,altorf_ENST00000677906.1_2,altorf_ENST00000678277.1_3,altorf_ENST00000678973.1_2,altorf_ENST00000679124.1_2,altorf_ENST00000679123.1_2,altorf_ENST00000677574.1_2,altorf_ENST00000678631.1_2,altorf_ENST00000678998.1_2,altorf_ENST00000354667.8_2,altorf_ENST00000618183.5_2,altorf_ENST00000677839.1_2,altorf_ENST00000676746.1_3,altorf_ENST00000678675.1_3,altorf_ENST00000676524.1_2,altorf_ENST00000678935.1_2,altorf_ENST00000678962.1_2,altorf_ENST00000679001.1_2,altorf_ENST00000678449.1_2,altorf_ENST00000356674.8_3,altorf_ENST00000678697.1_2,altorf_ENST00000678431.1_2,altorf_ENST00000676497.1_2,altorf_ENST00000677396.1_2,altorf_ENST00000678779.1_2,altorf_ENST00000360787.8_2,altorf_ENST00000679243.1_2,altorf_ENST00000677656.1_2,altorf_ENST00000678884.1_2,ncRNA_ENST00000677075.1_1,ncRNA_ENST00000476233.2_2,ncRNA_ENST00000676932.1_2,ncRNA_ENST00000677669.1_3,ncRNA_ENST00000490912.6_3,ncRNA_ENST00000463181.5_2,ncRNA_ENST000004
95810.2_2,COSMIC:HNRNPA2B1_ENST00000618183:p.R225S:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.H108P:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.K104N:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.R190G:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.G65V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.M53I:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.L37*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000618183:p.E11G:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.E92Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.E133Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.D87H:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.M1?:,COSMIC:HNRNPA2B1:p.G233A:Substitution-Missense,COSMIC:HNRNPA2B1:p.G280C:Substitution-Missense,COSMIC:HNRNPA2B1:p.G224*:Substitution-Nonsense,COSMIC:HNRNPA2B1:p.N255Y:Substitution-Missense,COSMIC:HNRNPA2B1:p.K104N:Substitution-Missense,COSMIC:HNRNPA2B1:p.G285S:Substitution-Missense,COSMIC:HNRNPA2B1:p.H108P:Substitution-Missense,COSMIC:HNRNPA2B1:p.R190G:Substitution-Missense,COSMIC:HNRNPA2B1:p.G65V:Substitution-Missense,COSMIC:HNRNPA2B1:p.M53I:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G221A:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G268C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.N243Y:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G212*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.G273S:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.R178G:Substitution-Missense,COSMIC:HNRNPA2B1:p.L37*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.H96P:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.K92N:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G53V:Substitution-Missense,COSMIC:HNRNPA2B1:p.G214V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.M41I:Substitution-Missense,COSMIC:HNRNPA2B1:p.G237V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356
674:p.L25*:Substitution-Nonsense,COSMIC:HNRNPA2B1:p.R203K:Substitution-Missense,COSMIC:HNRNPA2B1:p.Y336C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G202V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G225V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.R191K:Substitution-Missense,COSMIC:HNRNPA2B1:p.G332C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.Y324C:Substitution-Missense,COSMIC:HNRNPA2B1:p.E11G:Substitution-Missense,COSMIC:HNRNPA2B1:p.E133Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G320C:Substitution-Missense,COSMIC:HNRNPA2B1:p.E92Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.E121Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.E80Q:Substitution-Missense,COSMIC:HNRNPA2B1:p.M1?:,COSMIC:HNRNPA2B1:p.D87H:Substitution-Missense,COSMIC:HNRNPA2B1:p.G248*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.D75H:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.M1?:,COSMIC:HNRNPA2B1:p.G217V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G236*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.G205V:Substitution-Missense,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation" 0 PXD014145_decoy null "[, , Percolator, 3.05]" 0.642512 null 436.4756905 3 470.901519 470.9006154 ms_run[8]:controllerType=0 controllerNumber=1 scan=1500 "K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K" 
"K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K" "162,162,162,162,162,162,162,122,122,174,174,162,162,162,162,162,174,162,162,162,174,162,162,174,174,162,162,162,174,162,174,162,162,162,162,162,174,162,162,162,395,455,395,395,395,395,395,474,474,326,326,395,395,395,382,395,395,439,455,395,395,230,218,395,326,326,395,395,395,439,455,326,395,455,395,395,395,230,455,395,395,566,395,395,431,1358,1118,395,174,174,174,174,174,174,173,174,174,174,174,174,174,174,174,174,174,174,174,174,174,174,162,162,162,162,162,162,173,162,162,162,174,162,174,161,174,174,162,162,162,174,162,174,174,162,174,162,162,174,174,174,162,162,174,162,162,174,174,174" "173,173,173,173,173,173,173,133,133,185,185,173,173,173,173,173,185,173,173,173,185,173,173,185,185,173,173,173,185,173,185,173,173,173,173,173,185,173,173,173,406,466,406,406,406,406,406,485,485,337,337,406,406,406,393,406,406,450,466,406,406,241,229,406,337,337,406,406,406,450,466,337,406,466,406,406,406,241,466,406,406,577,406,406,442,1369,1129,406,185,185,185,185,185,185,184,185,185,185,185,185,185,185,185,185,185,185,185,185,185,185,173,173,173,173,173,173,184,173,173,173,185,173,185,172,185,185,173,173,173,185,173,185,185,173,185,173,173,185,185,185,173,173,185,173,173,185,185,185" 0.0561798 0 YHTINGHNAEVR test_blast_validate.mzML 1500 +PSM KMVSLAK 4 pseudo_ENST00000454683.1_2 1 PXD014145_decoy null "[, , Percolator, 3.05]" 0.668987 null 741.1 2 388.740661 388.7385759 ms_run[1]:controllerType=0 controllerNumber=1 scan=3252 R N 443 449 0.0947368 0 KMVSLAK test_blast_validate.mzML 3252 +PSM AAMAAWPPAAQAAAAAVAVVGGGGEPGAPR 8 "altorf_ENST00000247706.4_2,altorf_ENST00000593489.1_2" 0 PXD014145_decoy null "[, , Percolator, 3.05]" 0.547212 null 1209.2 5 529.4764486 529.4750268 
ms_run[5]:controllerType=0 controllerNumber=1 scan=6341 "R,R" "G,G" "183,147" "212,176" 0.0526316 0 AAMAAWPPAAQAAAAAVAVVGGGGEPGAPR test_blast_validate.mzML 6341 diff --git a/pypgatk/testdata/test_blast_reference_database.fa b/pypgatk/testdata/test_blast_reference_database.fa new file mode 100644 index 0000000..60d9206 --- /dev/null +++ b/pypgatk/testdata/test_blast_reference_database.fa @@ -0,0 +1,72 @@ +>ENSP00000491150.1 pep chromosome:GRCh38:15:90934041:90953716:1 gene:ENSG00000140553.18 transcript:ENST00000639885.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:UNC45A description:unc-45 myosin chaperone A [Source:HGNC Symbol;Acc:HGNC:30594] +MGIQRRSGPLLTGLSVGLLLLFSWVTLVQPTSLSALVHHPHLCQTTRDVHGRHYPGFFCP +RLSDSPEEAYCCHLQAAGGSCCTRAEFEALYQVNLSALPPPPILRGPGPLLVLGLYNLLV +VTLMTVDLVHFCCGRGRSLGWSHRRPPSGSSAASSLQASSVEQLRKEGNELFKCGDYGGA +LAAYTQALGLDATPQDQAVLHRNRAACHLKLEDYDKAETEASKAIEKDGGDVKALYRRSQ +ALEKLGRLDQAVLDLQRCVSLEPKNKVFQEALRNIGGQIQEKVRYMSSTDAKVEQMFQIL +LDPEEKGTEKKQKASQNLVVLAREDAGAEKIFRSNGVQLLQRLLDMGETDLMLAALRTLV +GICSEHQSRTVATLSILGTRRVVSILGVESQAVSLAACHLLQVMFDALKEGVKKGFRGKE +GAIIVDPARELKVLISNLLDLLTEVGVSGQGRDNALTLLIKAVPRKSLKDPNNSLTLWVI +DQGLKKILEVGGSLQDPPGELAVTANSRMSASILLSKLFDDLKCDAERENFHRLCENYIK +SWFEGQGLAGKLRAIQTVSCLLQGPCDAGNRALELSGVMESVIALCASEQEEEQLVAVEA +LIHAAGKAKRASFITANGVSLLKDLYKCSEKDSIRIRALVGLCKLGSAGGTDFSMKQFAE +GSTLKLAKQCRKWLCNDQIDAGTRRWAVEGLAYLTFDADVKEEFVEDAAALKALFQLSRL +EERSVLFAVASALVNCTNSYDYEEPDPKMVELAKYAKQHVPEQHPKDKPSFVRARVKKLL +AAGVVSAMVCMVKTESPVLTSSCRELLSRVFLALVEEVEDRGTVVAQGGGRALIPLALEG +TDVGQTKAAQALAKLTITSNPEMTFPGERIYEVVRPLVSLLHLNCSGLQNFEALMALTNL +AGISERLRQKILKEKAVPMIEGYMFEEHEMIRRAATECMCNLAMSKEVQDLFEAQGNDRL +KLLVLYSGEDDELLQRAAAGGLAMLTSMRPTLCSRIPQVTTHWLEILQALLLSSNQELQH +RGAVVVLNMVEASREIASTLMESEMMEILSVLAKGDHSPVTRAAAACLDKAVEYGLIQPN +QDGE +>ENSP00000377191.2 pep chromosome:GRCh38:7:107923799:108002140:-1 gene:ENSG00000091136.15 transcript:ENST00000393561.6 gene_biotype:protein_coding transcript_biotype:protein_coding 
gene_symbol:LAMB1 description:laminin subunit beta 1 [Source:HGNC Symbol;Acc:HGNC:6486] +MTFKTFRPAAMLIERSSDFGKTWGVYRYFAYDCEASFPGISTGPMKKVDDIICDSRYSDI +EPSTEGEVIFRALDPAFKIEDPYSPRIQNLLKITNLRIKFVKLHTLGDNLLDSRMEIREK +YYYAVYDMVVRGNCFCYGHASECAPVDGFNEEVEGMVHGHCMCRHNTKGLNCELCMDFYH +DLPWRPAEGRNSNACKKCNCNEHSISCHFDMAVYLATGNVSGGVCDDCQHNTMGRNCEQC +KPFYYQHPERDIRDPNFCERCTCDPAGSQNEGICDSYTDFSTGLIAGQCRCKLNVEGEHC +DVCKEGFYDLSSEDPFGCKSCACNPLGTIPGGNPCDSETGHCYCKRLVTGQHCDQCLPEH +WGLSNDLDGCRPCDCDLGGALNNSCFAESGQCSCRPHMIGRQCNEVEPGYYFATLDHYLY +EAEEANLGPGVSIVERQYIQDRIPSWTGAGFVRVPEGAYLEFFIDNIPYSMEYDILIRYE +PQLPDHWEKAVITVQRPGRIPTSSRCGNTIPDDDNQVVSLSPGSRYVVLPRPVCFEKGTN +YTVRLELPQYTSSDSDVESPYTLIDSLVLMPYCKSLDIFTVGGSGDGVVTNSAWETFQRY +RCLENSRSVVKTPMTDVCRNIIFSISALLHQTGLACECDPQGSLSSVCDPNGGQCQCRPN +VVGRTCNRCAPGTFGFGPSGCKPCECHLQGSVNAFCNPVTGQCHCFQGVYARQCDRCLPG +HWGFPSCQPCQCNGHADDCDPVTGECLNCQDYTMGHNCERCLAGYYGDPIIGSGDHCRPC +PCPDGPDSGRQFARSCYQDPVTLQLACVCDPGYIGSRCDDCASGYFGNPSEVGGSCQPCQ +CHNNIDTTDPEACDKETGRCLKCLYHTEGEHCQFCRFGYYGDALQQDCRKCVCNYLGTVQ +EHCNGSDCQCDKATGQCLCLPNVIGQNCDRCAPNTWQLASGTGCDPCNCNAAHSFGPSCN +EFTGQCQCMPGFGGRTCSECQELFWGDPDVECRACDCDPRGIETPQCDQSTGQCVCVEGV +EGPRCDKCTRGYSGVFPDCTPCHQCFALWDVIIAELTNRTHRFLEKAKALKISGVIGPYR +ETVDSVERKVSEIKDILAQSPAAEPLKNIGNLFEEAEKLIKDVTEMMAQVEVKLSDTTSQ +SNSTAKELDSLQTEAESLDNTVKELAEQLEFIKNSDIRGALDSITKYFQMSLEAEERVNA +STTEPNSTVEQSALMRDRVEDVMMERESQFKEKQEEQARLLDELAGKLQSLDLSAAAEMT +CGTPPGASCSETECGGPNCRTDEGERKCGGPGCGGLVTVAHNAWQKAMDLDQDVLSALAE +VEQLSKMVSEAKLRADEAKQSAEDILLKTNATKEKMDKSNEELRNLIKQIRNFLTQDSAD +LDSIEAVANEVLKMEMPSTPQQLQNLTEDIRERVESLSQVEVILQHSAADIARAEMLLEE +AKRASKSATDVKVTADMVKEALEEAEKAQVAAEKAIKQADEDIQGTQNLLTSIESETAAS +EETLFNASQRISELERNVEELKRKAAQNSGEAEYIEKVVYTVKQSAEDVKKTLDGELDEK +YKKVENLIAKKTEESADARRKAEMLQNEAKTLLAQANSKLQLLKDLERKYEDNQRYLEDK +AQELARLEGEVRSLLKDISQKVAVYSTCL +>sp|P70612|CXCR1_RAT C-X-C chemokine receptor type 1 OS=Rattus norvegicus OX=10116 GN=Cxcr1 PE=3 SV=1 +MAEAEYFIWIAPEGDFEEEFGNITRMLPTGEYFSPCKRVPMTNRQAVVVFYALVFLLSLL 
+GNSLVMLVILYRRRTRSVTDVYVLNLAIADLLFSLTLPFLAVSKWKGWIFGTPLCKMVSL +LKEVNFFSGILLLACISVDRYLAIVHATRTLTRKRYLVKFVCMGTWGLSLVLSLPFAIFR +QAYKPYRSGTVCYEVLGEATADLRITLRGLSHIFGFLLPLFIMLVCYGLTLRTLFKAHMR +QKRRAMWVIFAVVLVFLLCCLPYNLVLLSDTLLGAHLIQDTCERRNNIDQALYITEILGF +SHSCLNPVIYAFVGQSFRHEFLKILANLVHKEVLTHHSASFRTSLTTIY +>sp|C4Z1I4|SYG_LACE2 Glycine--tRNA ligase OS=Lachnospira eligens (strain ATCC 27750 / DSM 3376 / VPI C15-48 / C15-B4) OX=515620 GN=glyQS PE=3 SV=1 +MEKTMEKIVSLAKARGFVYPGSEIYGGLANTWDYGNLGVELKNNVKKAWWQKFVQESPYN +VGVDCAILMNPQTWVASGHLGGFSDPLMDCKECHERFRADKLIEDWADENSYDLGGSVDG +WTQEQMKNFIDEKNICCPSCGKHNFTDIRQFNLMFKTFQGVTEDAKNTVYLRPETAQGIF +VNFKNVQRTSRKKVPFGIGQIGKSFRNEITPGNFTFRTREFEQMELEFFCKPGTDLEWFT +YWRQYCIDWLKALGIKEDEMRARDHSPEELCFYSKGTTDIEFLFPFGWGELWGIADRTDY +DLTQHQTVSGEDMSYFDDEAKEKYIPYVIEPSLGADRVTLAFLCSAYDEEELEGGDVRTV +LHFHPAIAPVKIGILPLSKKLNEGAEKVYAELSKYYNCEFDDRGNIGKRYRRQDEIGTPF +CITYDFDSEEDGAVTVRDRDTMQQERIKIADLKAYFEDKFRF +>ENSP00000504660.1 pep chromosome:GRCh38:7:26171686:26201301:-1 gene:ENSG00000122566.22 transcript:ENST00000676903.1 gene_biotype:protein_coding transcript_biotype:nonsense_mediated_decay gene_symbol:HNRNPA2B1 description:heterogeneous nuclear ribonucleoprotein A2/B1 [Source:HGNC Symbol;Acc:HGNC:5033] +MEREKEQFRKLFIGGLSFETTEESLRNYYEQWGKLTDCVVMRDPASKRSRGFGFVTFSSM +AEVDAAMAARPHSIDGRVVEPKRAVAREESGKPGAHVTVKKLFVGGIKEDTEEHHLRDYF +EEYGKIDTIEIITDRQSGKKRGFGFVTFDDHDPVDKIVLQKYHTINGHNAEVRKALSRQE +MQEVQSSRSGRGGNFGFGDSRGGGGNFGPGPGSNFRGGSDGYGSGRGFGDGYNGYGGGPG +GGNFGGSPGYGGGRGGYGGGGPGYGNQGGGYGGGYDNYGGGNYGSGNYNDFGNYNQQPSN +YGPMKSGNFGGSRNMGGPYGGGNYGPGGSGGSGGYGGRSRY \ No newline at end of file diff --git a/pypgatk/testdata/test_blast_validate.mzML b/pypgatk/testdata/test_blast_validate.mzML new file mode 100644 index 0000000..17f4359 --- /dev/null +++ b/pypgatk/testdata/test_blast_validate.mzML @@ -0,0 +1,238 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + AAAA4JVEWUAAAAAgj4NZQAAAAGCYhFtAAAAA4M3EW0AAAABgmAVcQAAAAIAcU1xAAAAAIJnFXEAAAACgfvtcQAAAAKCKBF1AAAAAIC7DXUAAAACgMwVeQAAAAOA/gl5AAAAA4B0EYEAAAABgTgRgQAAAAAA3JGBAAAAAQGskYEAAAADAVURgQAAAAMCERGBAAAAAYG9kYEAAAAAAnmRgQAAAAEBIwmBAAAAA4G0CYUAAAACAIUJhQAAAAOAhw2FAAAAAYHeCY0AAAADA9sJjQAAAAGD2wWRAAAAAgCDDZEAAAADA+eJkQAAAAABKJGVAAAAAYNHjZUAAAABA90FmQAAAAMDiaWZAAAAA4M5CZ0AAAADgnmRnQAAAAEC5hGdAAAAAoPLVZ0AAAAAAV1loQAAAAMDRYmhAAAAAoB2kaEAAAACgzSJpQAAAAOB05GpAAAAAwKGDbEAAAABgdcVsQAAAAACm421AAAAAIMsFb0AAAADAg4NvQAAAACCTEnBAAAAAIMKBcEAAAAAgKRJxQAAAAMD/InFAAAAAwAszcUAAAABAp8FxQAAAAKC00XFAAAAAoKehckAAAADgEtJyQAAAAGAi4nJAAAAAABgjc0AAAACgmDpzQAAAAEB4QnNAAAAAgBdSc0AAAACAFrJzQAAAAIBgsnNAAAAAYK3Kc0AAAADgnKp0QAAAAIDNKnZAAAAAwASzdkAAAABgem53QAAAAGBthHdAAAAA4BWTd0AAAACA6ap3QAAAACBAI3hAAAAAgAJbeEAAAADgL3t4QAAAACDXInlAAAAAIK8zeUAAAAAgL+R5QAAAACAh+3pAAAAAYEote0AAAACg9i17QAAAAADGpHtAAAAAwJALfEAAAADAlQx8QAAAAKDwSHxAAAAAwBxNfEAAAABAgox8QAAAAOC3ZH1AAAAAIHZsfUAAAABAiHR9QAAAAKBPpH1AAAAAIKaMfkAAAAAgSvR+QAAAAEAJlH9AAAAAQAakf0AAAACAEq1/QAAAACCQrX9AAAAAoHSuf0AAAABAHFqAQAAAAMDEo4BAAAAAgFacgUAAAADAR9qBQAAAAKBT4oFAAAAAQH5igkAAAACAf2qCQAAAAMA2uoJAAAAAgD/kgkAAAAAAx1CDQAAAACBZBIVAAAAAoPaqhkAAAACgHnOIQAAAAOBDe4tAAAAAgHcDjEAAAADgeguMQAAAAIBJWpFAAAAAoClckkAAAABA0W2WQA== + + + + + + Ceu+RNz9rETavKFHZXksRdrkm0RVHiFEiGJ8RAtgH0QoE6ZEmEVxRJN7mkQs8BVEY0PrRRb8xUXr/sBFdM3PRWxsy0US6c1FSajKRZw8mUVaAhlEoZxgRs63lEQxdTxELuagRFYte0VX8QBGn5MRRSSrGkUdplhEZSZXRvBkHkQ2qxlENoh1RE0XgEZNbihElkMrRKCyJERocKJFnSV3RLcmpkTrYa1EyxELRjh+g0XsFadF4hwhRaeGOkScZEZF+vBxRHqqs0W3T4RGTolVRI+N3EVCPjZEJfhpRFNClkZxdYtEIE1KREZtJEU1uxtESTkmRYlbpUTJwypE+IhSRTwoJ0R8MENEbURNReCFIUSgDXNEBIGARL3bK0Qu+IhENnphRJXFF0WSvKFE/uK8RPV2SkQEbDdEKj4YRNltKUQ5DVlEoHCTRBXGZ0SM1DdE8m4nRVL7PUQNhjBFu2xzRee+RUVWcs1FljIyRFhtYkSJKwtFI5YmRPn7b0TlooBEVo0lRatYGUXrVR1EmjlDRD7pikXBLn1EQ+FKRo3FskSf045EE+MXRLemOETPUj9ENMaHRK1EoEXlqA1FP43nRRvXkkRBWUpEKfBFRNn1TUQ= + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + AAAAwORGWUAAAABgjYNZQAAAACDlhVlAAAAAoJeEW0AAAACg0MRbQAAAAACYBVxAAAAAgJHFXEAAAACgigRdQAAAACBryF9AAAAAQPYDYEAAAACgGwRgQAAAAOBOBGBAAAAAoEcjYEAAAADANiRgQAAAACBqJGBAAAAAYJIkYEAAAACgmEFgQAAAAIDEQmBAAAAAYGNDYEAAAABgUkRgQAAAAKCFRGBAAAAAQN1iYEAAAACAbWRgQAAAAICgZGBAAAAAIMlkYEAAAABgO4JgQAAAAICIhGBAAAAAoGwCYUAAAAAAIUJhQAAAAKCigmFAAAAAQBuiYUAAAABgxuNhQAAAAGBsY2JAAAAAIJ1jYkAAAADguINiQAAAAGAeuWJAAAAAQEkjY0AAAADAeUZjQAAAAMDHY2NAAAAAIO6DY0AAAABgdqNjQAAAAOAEpGNAAAAA4MvBY0AAAABA98JjQAAAAAB04mNAAAAAoJxjZUAAAABAH6RlQAAAACDQ42VAAAAAQAPlZUAAAAAg6gNmQAAAAMAbBWZAAAAAYE0CZ0AAAADA9yJnQAAAAMD4Q2dAAAAAYByFZ0AAAACApq9oQAAAAKC612hAAAAAgHTjaEAAAAAASANpQAAAAAB2BGlAAAAA4PEjaUAAAAAgToRpQAAAAIAbNGpAAAAAYMeiakAAAAAAz/NqQAAAAEBNBGtAAAAAwM1Ea0AAAADgcMVsQAAAAMAi42xAAAAAIJDlbEAAAACgpURtQAAAAKDvqm1AAAAAYKBDbkAAAACgROZuQAAAAEDFBW9AAAAAgL3Eb0AAAABAgONvQAAAAICoF3BAAAAAoHyCcEAAAABgvgJxQAAAAKDsEXFAAAAA4GUTcUAAAADA8PxxQAAAAGBQE3JAAAAAIFLTckAAAADgO9NzQAAAAIAUM3RAAAAAQGczdEAAAACAdzt0QAAAAEDCo3RAAAAAgHyzdEAAAABAxbN0QAAAACCr43RAAAAAgGyDdUAAAACgkIt1QAAAAOAqk3VAAAAAgKOTdUAAAAAgqqN1QAAAAEAgVHZAAAAAgK1jdkAAAABgaHN2QAAAAAC9c3ZAAAAAQKUrd0AAAABAi1t3QAAAACCVY3dAAAAAoGmEd0AAAACAd5R3QAAAACAPtHdAAAAAgJa7d0AAAACA0kN4QAAAAEAuRHhAAAAAIKRLeEAAAAAA10t4QAAAAMDLTnhAAAAAwK5TeEAAAABgOlR4QAAAAECWE3lAAAAAIBckeUAAAACgACR6QAAAACAMNHpAAAAAQL/DekAAAACA8NN6QAAAAIAB5HpAAAAAIA/0ekAAAAAgU6R7QAAAAMDa03tAAAAAQOzje0AAAAAA+vN7QAAAACC7lHxAAAAAIAW1fEAAAABg+nR9QAAAAOAYhX1AAAAAoPN0fkAAAAAANVh/QAAAAOAZfX9AAAAAQAyFf0AAAADAF41/QAAAAGDgj39AAAAAQB6Vf0AAAADgu5V/QAAAACAppX9AAAAAwLcagEAAAADgyJKAQAAAAIBhmoBAAAAAgKyagEAAAAAAsaKAQAAAAICN6oBAAAAAgMjygEAAAABAYPqAQAAAAECt+oBAAAAAALUCgUAAAADgQhqBQAAAAAC8coFAAAAAIKN6gUAAAADgpYKBQAAAAECV4oFAAAAAAOjqgUAAAACACFOCQAAAAMBjyoJAAAAAIOHKgkAAAADArdKCQAAAAGAE04JAAAAAwAXbgkAAAACA6vKCQAAAAEDgIoNAAAAAILtCg0AAAAAgv0qDQAAAAMCVUoNAAAAAQIJag0AAAAAA71qDQAAAAKCFYoNAAAAAwNVig0AAAACAn2qDQAAAAODusoNAAAAAYO66g0AAAAAAAiuEQAAAACAkO4RAAAAAwAJDhEAAAAAAG7uEQA== + + + + + + 
GlE5RvzSwkaHjTRFaolUR+5kiUUaTaJGWX20RcYK4UZPwoFFX/OgRUNMEEhgkB9IREhkSIo/ckgEbP1HsQWPRUa7nEVCrmFIxYQTRk8K+Ecy7jpIWGpWRaQzMkjog+VHnsRbRT4+TEVOZmdFE06PRbYAP0XVmjtFlwa1RV4LJUavkK5FT4gmSB0vP0XTCyxFCsMpRa6jJkWkeiJFpTk8RQ0aVUWCoqVFlQobRWCa1Uar8Z9FCKpyRrrcGkWCQRZIekY4RhEmM0bJyoFGJwMcRQOfT0X8OxRFQQovRjqHJ0U/cg9Fi0m/RZ/iIkUc9W9GozIORjxScEUptjxFTLxMRdKNUkU0kYFG3EIBR2tl60cbXIZF1HqZRW9+C0eo3mRF1utBRUZJP0VC/OVGRwQtRZ7NOkUfrRNF/pVBRca9WEXko0VGxRuZRqr8I0W79xtGVRtGR6kcCkc4IuFG+nEyRYsEpkVTZzpFlfdORS1hp0XGxphF4s6FRcvYG0WFV2BFEWAqRTNhiUW+v4tFxu+ZRjrUjUU0DphFFmqCRWNYXkXx9LBGxi9BR0pNiEXyw15FIj4+Rb/yZUfyLFBFLJ99RxcSQUekEFJFCCeRRTM9x0UzlVFFv8FCR5uPV0eEek1Fg16VRQ+rskXjQYFIBIWvRjS+aEUrRS9Ggu2nSDwb4kYTAcZFLqCQRnIwXkWEsolFBGLXRrlSJ0X7FblFBP8vRZInQEVsajJF21jNRm3MQEVpUkZFJXcqR4AwTkWFl2lFWWt/R51lIkYABUZF1P8iRSmVWkXnMn5Hf390RWw6N0a79qRFbo83SGdpvUaJuGhF4FZjRfNBiEWi3Y9FK8KIRd+nckZ2PO5GkQtxRWlomUVNIoNFsniERYzRwEZmzclGJD7dRjxqhUVIrHFFFzkXRllzjkWKeqFHaEyGRT+TU0UP4HVGSvnMRqD/3UY= + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
AAAAQItEWUAAAACggoFZQAAAAGCFg1lAAAAAoLrDWUAAAACAsMpaQAAAAMBJgltAAAAAoI6EW0AAAACA74ZbQAAAACBixFtAAAAAIMXEW0AAAACgjwVcQAAAAMCIRFxAAAAAwIjFXEAAAABggARdQAAAAGDABV1AAAAAICgFXkAAAACAM2VeQAAAAECFhF5AAAAAoGLnXkAAAABAfERfQAAAAEB/g19AAAAAYIXFX0AAAABgXshfQAAAAIAWBGBAAAAA4EkEYEAAAADgGCJgQAAAAEBCI2BAAAAA4DEkYEAAAAAgZSRgQAAAAMCMJGBAAAAAQJVBYEAAAAAATURgQAAAAGCARGBAAAAAAJdiYEAAAACAaGRgQAAAAKCbZGBAAAAAQGgCYUAAAACgnAJhQAAAAGDDAmFAAAAAQBpCYUAAAABgcYJhQAAAAICZgmFAAAAAYMCCYUAAAADAFaJhQAAAAGBCo2FAAAAAoJniYUAAAAAgFwJiQAAAAKAXI2JAAAAAYGtiYkAAAACAk2NiQAAAAOCYAmNAAAAAwBUiY0AAAABAlEFjQAAAAKAZQ2NAAAAAALBhY0AAAACAl2JjQAAAAEBwgmNAAAAA4OqDY0AAAADA7KFjQAAAAOByo2NAAAAAADekY0AAAADg8MJjQAAAAMBNxGNAAAAAgGziY0AAAABAPORjQAAAAIBu5GNAAAAAgLT/Y0AAAAAAQ0NkQAAAAKCZgmRAAAAAwBaiZEAAAACAnqNkQAAAAADvwWRAAAAAQJTiZEAAAABgFiNlQAAAAOBsYmVAAAAAYISCZUAAAABgc4NlQAAAAEDsomVAAAAAYMWkZUAAAABAyeNlQAAAACD85GVAAAAAwOYDZkAAAABgFQVmQAAAAMBwgmZAAAAAoO2hZkAAAACgE6NmQAAAAIAro2ZAAAAAoHGjZkAAAACA78JmQAAAAOCNw2ZAAAAA4LzhZkAAAABAbeJmQAAAAMAM42ZAAAAAgEgjZ0AAAACAx0JnQAAAAMAVhWdAAAAAAMOiZ0AAAAAgEsNnQAAAAODw4mdAAAAAQG4CaEAAAABgiiJoQAAAAEBxI2hAAAAAYPBCaEAAAACgamJoQAAAAGCTY2hAAAAAoMLCaEAAAACA78NoQAAAAABC4mhAAAAAIFoCaUAAAABARwNpQAAAAIB1BGlAAAAAoPHjaUAAAAAgBBVqQAAAAGDuImpAAAAAAEYjakAAAACgYkNqQAAAAGBDg2pAAAAAIG+EakAAAABAwKJqQAAAAKDto2pAAAAAwL+0akAAAAAAwcNqQAAAAKDLxGpAAAAAIJgia0AAAABAdiNrQAAAAMCzQmtAAAAAAIGla0AAAADAwcJrQAAAAMAsxWtAAAAA4OvUa0AAAAAgPuJrQAAAAOD05GtAAAAAAEIDbEAAAACgw0NsQAAAAEBFY2xAAAAAQN9jbEAAAAAg6qVsQAAAAOCdw2xAAAAAQGfFbEAAAABAhOVsQAAAAGBsQ21AAAAA4E1EbUAAAACgxmNtQAAAAGBsZW1AAAAAAOGDbUAAAADAH6NtQAAAAMDCw21AAAAAwJ/jbUAAAABAGQNuQAAAAGCVIm5AAAAAoDAjbkAAAAAAlkNuQAAAAACe5W5AAAAA4D7mbkAAAAAAvgVvQAAAAIAfI29AAAAAIHclb0AAAABA2yVvQAAAACA+hG9AAAAAQMPEb0AAAAAAHsVvQAAAACB2429AAAAAgJvkb0AAAADgOeVvQAAAAMBcAnBAAAAAALghcEAAAABAwjFwQAAAAKDPcXBAAAAAQGB5cEAAAACgzaFwQAAAAIC5sXBAAAAAwDjacEAAAACAueFwQAAAAMBF4nBAAAAAgLoCcUAAAADAIgpxQAAAAODJEnFAAAAA4GMTcUAAAABAzyJxQAAAAEBQQnFAAAAA4PdqcUAAAAAA9pFxQAAAAKA4snFAAAAAoEoTckAAAABAtjFyQAAAAOB9MnJAAAAAgDxCckAAAAAg
7kpyQAAAAKBJUnJAAAAAALVhckAAAAAgOKJyQAAAAGD94XJAAAAAYFpDc0AAAABA4lFzQAAAAGC3UnNAAAAAQGRic0AAAADAI3JzQAAAAOC3cnNAAAAAgBWkc0AAAABAIbRzQAAAAKAvxHNAAAAAAGbyc0AAAACgJgJ0QAAAACBwAnRAAAAAgDASdEAAAABAlTp0QAAAAGANcnRAAAAAIKRydEAAAADAN4J0QAAAAMCxgnRAAAAAYE2SdEAAAAAAebN0QAAAAIA543RAAAAAwJQSdUAAAACAWCJ1QAAAAKBcMnVAAAAAwABkdUAAAADgW3N1QAAAAMALdHVAAAAAQFN7dUAAAADgY4N1QAAAAIAahHVAAAAAADqSdUAAAACgpZJ1QAAAAIAhk3VAAAAA4CGUdUAAAADASKJ1QAAAAKCyonVAAAAA4DqydUAAAAAgGNN1QAAAAOCw7XVAAAAAwAIydkAAAAAAjjJ2QAAAAOCaQnZAAAAAAJFEdkAAAAAgOFJ2QAAAACA/U3ZAAAAAYE1bdkAAAACAvmJ2QAAAAIBQY3ZAAAAAIKVjdkAAAABAYXN2QAAAAGDNsnZAAAAAwNHidkAAAABgY+N2QAAAACAXA3dAAAAAQHlEd0AAAACAlGt3QAAAAABlhHdAAAAAwG6Ud0AAAACAeMJ3QAAAAKA883dAAAAAoNBDeEAAAADA+ZJ4QAAAACD6snhAAAAA4IXbeEAAAAAApuJ4QAAAAGCR43hAAAAA4Fv0eEAAAAAgZAR5QAAAAGDzBHlAAAAAoGcTeUAAAADgbxR5QAAAAAAGFXlAAAAA4HIjeUAAAACAvjJ5QAAAAIB3Y3lAAAAAIKdyeUAAAADAD4N5QAAAACB1g3lAAAAAYMuIeUAAAADge7t5QAAAAACPw3lAAAAAoNICekAAAADgSQN6QAAAACC2A3pAAAAA4OMEekAAAACA5hJ6QAAAAEDtFHpAAAAA4CZDekAAAACAOlN6QAAAAAC4U3pAAAAAgL9zekAAAABAk5R6QAAAAECjpHpAAAAAQLK0ekAAAABA2+N6QAAAAGCm8npAAAAAYMAUe0AAAABAZyN7QAAAAIDQUntAAAAAIFdje0AAAADAYXN7QAAAACC0tHtAAAAAIBO1e0AAAACgzsR7QAAAAKDb1HtAAAAAwFTee0AAAADA7ON7QAAAAMDn5HtAAAAAAPj0e0AAAAAAEwN8QAAAAMAHBXxAAAAA4CQTfEAAAACgrDN8QAAAAMB6pHxAAAAAoOnjfEAAAACAFPR8QAAAAIAb/HxAAAAAYBUEfUAAAAAAKQx9QAAAAOAcHH1AAAAA4DojfUAAAACgICR9QAAAAGBGVX1AAAAAYFVzfUAAAACg/HR9QAAAAGCvhH1AAAAA4DuVfUAAAADgVKN9QAAAAEBJpX1AAAAAgM3DfUAAAADA0st9QAAAAMDf031AAAAA4PcEfkAAAAAAih5+QAAAAOBsc35AAAAAIO50fkAAAABAe3V+QAAAACDNg35AAAAA4ISFfkAAAACgLol+QAAAAACmlX5AAAAAQLCefkAAAAAgOqN+QAAAAICAw35AAAAAwBX0fkAAAACgxhN/QAAAACBAFX9AAAAAgMcbf0AAAADgSyV/QAAAAADzU39AAAAAwCVVf0AAAADgbGR/QAAAAEBebH9AAAAAQKqzf0AAAACAZMN/QAAAAMB6039AAAAAoHv0f0AAAABgMwKAQAAAAGCwEoBAAAAAQLEUgEAAAAAgYReAQAAAAGCzGoBAAAAAIDMigEAAAACguSKAQAAAAKA7KoBAAAAAwL8qgEAAAABAKDaAQAAAACBQQoBAAAAAgL1EgEAAAABgVEaAQAAAAMBnR4BAAAAAoCJSgEAAAADA7GmAQAAAACDgcYBAAAAAYC1ygEAAAACA0HSAQAAAAKB5d4BAAAAAIDR6gEAAAAAgNX6AQAAAAMA7goBAAAAAoDeGgEAAAADgMYqAQAAAAICIioBA
AAAAIIeOgEAAAADgfI+AQAAAAIB+koBAAAAAAM2SgEAAAAAgCp6AQAAAAOAQpoBAAAAAwEeygEAAAACASbaAQAAAAEDIuYBAAAAAIKjBgEAAAADAbsKAQAAAACAr0oBAAAAAQDHagEAAAACAlPGAQAAAAMAOAoFAAAAAoPEJgUAAAACg4EmBQAAAAGDjUYFAAAAAAEBWgUAAAABAvHKBQAAAAMA2goFAAAAAYCuSgUAAAADgsJKBQAAAAAAOmoFAAAAAoFyagUAAAAAAupqBQAAAAGBcnoFAAAAAAGCigUAAAABAuKKBQAAAAGCWr4FAAAAAwEWygUAAAADA8bSBQAAAAEBTwoFAAAAAIJ/fgUAAAABgTOKBQAAAAID15IFAAAAAYPzygUAAAABgXPaBQAAAACAD+4FAAAAAIFIegkAAAABgSyKCQAAAAMCHMoJAAAAAwII2gkAAAACAfjqCQAAAAADVOoJAAAAA4N5CgkAAAADA40qCQAAAAEBYZoJAAAAAgJZ6gkAAAADAnH6CQAAAAOCYgoJAAAAAoDSigkAAAAAgCsuCQAAAAMDz0oJAAAAAgIHWgkAAAADAcdqCQAAAAAD+2oJAAAAAAHregkAAAADgDOOCQAAAAEC6+oJAAAAA4MACg0AAAAAgdAqDQAAAAECMGoNAAAAAwI8eg0AAAAAAkyKDQAAAAKCVJoNAAAAAAJYqg0AAAAAApDKDQAAAAMC3foNAAAAAwCaCg0AAAADAbo6DQAAAAAC+koNAAAAAYMaag0AAAABAhMKDQAAAAEAmxYNAAAAAwL7Gg0AAAABA0ceDQAAAAEBvyoNAAAAA4CrNg0AAAAAg0c+DQAAAACBP3oNAAAAAIK/ug0AAAACglfKDQAAAAAA19YNAAAAAgIv6g0AAAACAlP6DQAAAAIB2JoRAAAAAAHAqhEAAAACAYDKEQAAAAIC4NoRAAAAA4MA6hEAAAACAwT6EQAAAAABRQoRAAAAAoHlOhEAAAABAVGKEQAAAAKCFaoRAAAAAoIVuhEAAAACAvHKEQAAAAEC4eoRAAAAA4CmChEAAAAAAAIuEQAAAAAARm4RAAAAAwIm6hEAAAADgeMqEQAAAAKCt2oRAAAAAALLehEAAAABAteKEQAAAAGDm74RAAAAAIJHyhEAAAADAXQOFQAAAACBrCoVAAAAAYFMShUAAAADAnRqFQAAAAABHHYVAAAAAoPYfhUAAAACguyKFQAAAAEC+JoVAAAAAwLwqhUAAAAAgpDKFQAAAAOCkOoVAAAAAIKVKhUAAAACAV02FQAAAAKAQW4VAAAAAAPNqhUAAAACA7XKFQAAAAOCDmoVAAAAAYOmqhUAAAADg8cqFQAAAAADrzoVAAAAAgPPShUAAAABAVNWFQAAAAEDO8oVAAAAA4NT2hUAAAADAVfuFQAAAAABfAoZAAAAAYFwFhkAAAABgOSuGQAAAAMAPMIZAAAAAwLcyhkAAAAAANjOGQAAAAEBoNYZAAAAAALE2hkAAAACAETiGQAAAAEDdOoZAAAAAYFQ7hkAAAADA4z6GQAAAAIDpQoZAAAAAgL9KhkAAAABANkuGQAAAAADCToZAAAAA4DpThkAAAAAAHVuGQAAAAOBkg4ZAAAAAAGiLhkAAAACgu46GQAAAAEDXkoZAAAAAAN+WhkAAAACg1JqGQAAAAGDWnoZAAAAAAKemhkAAAADABKuGQAAAAIDh2oZAAAAAIFHbhkAAAACg3d6GQAAAAKDc4oZAAAAAYFPjhkAAAAAg4eaGQAAAAAB1IodAAAAA4PQih0AAAACA+CaHQAAAAMD9KodAAAAAAOBOh0AAAADAgGOHQAAAACD1qodAAAAAYA+3h0AAAAAgDruHQAAAAIAOv4dAAAAAgNvKh0AAAAAg7tKHQAAAAKDv1odAAAAAYNLih0AAAABg6+qHQAAAAMAMG4hAAAAAIAYfiEAAAABADSOIQAAAAKDBMohAAAAAIL9SiEAAAACA
I1+IQAAAAAAfY4hAAAAAgBlniEAAAACAFmuIQAAAACAJe4hAAAAAoBSDiEAAAADAAoyIQAAAAAAro4hAAAAAICyniEAAAABAL6uIQAAAAAAxr4hAAAAAIDWziEAAAACgMLeIQAAAAGDYwohAAAAAAM/KiEAAAACg4NKIQAAAAEB204hAAAAA4IbbiEAAAACAIuOIQAAAAGAW94hAAAAAwL8SiUAAAABg+zqJQAAAAAABQ4lAAAAAQA5riUAAAADgq2uJQAAAAACXeolAAAAAAN2iiUAAAACgbLOJQAAAAIB7u4lAAAAAQHvDiUAAAADA98qJQAAAAEBj04lAAAAAgML7iUAAAACgtzuKQAAAAIBEQ4pAAAAAAKlLikAAAADAsFOKQAAAAMBZg4pAAAAAYF6HikAAAABAUbOKQAAAACBUu4pAAAAAIGDLikAAAADAXs+KQAAAAGDB24pAAAAAIMnjikAAAACAy+uKQAAAAAA+94pAAAAA4C4ni0AAAADAOSuLQAAAAAA7c4tAAAAAgDl7i0AAAABgY6OLQAAAAICO54tAAAAAoIrri0AAAADgf++LQAAAAIB6E4xAAAAAQOkjjEAAAADABDSMQAAAAKDQW4xAAAAAAPJjjEAAAADg/oOMQAAAAAD2i4xAAAAAoHGfjEAAAABg09uMQAAAAKDX44xAAAAAAAbsjEAAAABgFvSMQAAAAIBXC41AAAAAIGwPjUAAAACAYhONQAAAAAAMFI1AAAAAIA4cjUAAAAAAEySNQAAAAKCiK41AAAAAoH4zjUAAAAAAwruNQAAAACDJw41AAAAAoMbLjUAAAACglkOOQAAAAODBS45AAAAA4K17jkAAAABAtoOOQAAAAACwi45AAAAAYLELj0AAAACgNpyPQAAAACA1PpBAAAAAIDNCkEAAAADgJHqQQAAAAGDEiZBAAAAAIP+ZkEAAAADA+52QQAAAAIAHopBAAAAAYAemkEAAAABAEqqQQAAAAIBFrpBAAAAA4EOykEAAAADgCOKQQAAAAAAO5pBAAAAAQAFCkUAAAABAAEaRQAAAAOBTlpFAAAAAoFzykUAAAADALh6SQAAAAABmKpJAAAAAoDpekkAAAAAAVGKSQAAAAOBOZpJAAAAAIC+ekkAAAADgKaqSQAAAAIBm1pJAAAAAoGvakkAAAAAAQeKSQAAAAKA45pJAAAAAQCP+kkAAAADgihqTQAAAAMAEHpNAAAAA4Ioek0AAAACAkyKTQAAAAEA4QpNAAAAAwDFGk0AAAABAfraTQAAAAEBg2pNAAAAAoIP2k0AAAACgfvqTQAAAAMBqHpRAAAAAAF0ilEAAAABge2aUQAAAACCAapRAAAAAQHlulEAAAAAAaX6UQAAAAOBagpRAAAAAIHLGlEAAAACgd8qUQAAAAOB2zpRAAAAAAKXWlEAAAADgodqUQAAAAIC2HpVAAAAAgLQilUAAAACAjuaVQAAAACCT6pVAAAAAQJ8qlkAAAACAoy6WQAAAAICqMpZAAAAA4Kg2lkAAAAAgsDqWQAAAAGCtPpZAAAAAQNjalkAAAABA4h6XQAAAAMAVo5hA + + + + + + 
m2qfRy+dqkSAnlBI+k68RfBMrUQ+0MBEAIobSRhQHUU5EydFzGT8Ronpq0ZBGnlGEwGKR05H80X1UglFzV8pRToPwEQvnwFFz164RMVuyEQdD45GEknTRVtzBEW73chHz3MBSPwYCUdhklxF0ogLSJET+UdQiR9F6AmfRim2okdjCq1HccPVRQvQm0eT+4pHowW0ReiNxUSisEJGS5VCRu5AuESJQzxGuDSyRCCR6UVoMCFGyQalRNH/mEU679JF/hLfRbIhB0WwsdxFhMrSRbzOgkZTxLhFlCjbRLPmJ0ZPHttFHbTaRLqNM0V9X7hFoBgIRVaMpEZowS9Gmt1VRWFB1EVWLjNFIeesRM5o00SxArdFyRnCRWlRJEXHarhG70rpRBwgPEYZZo1GoDrPRK2nIEZTZhBF1B5NRcvk5kccMcBFFz9IRdAyVUY7hCFF5DwNRfirCkUcRCRFBiKIR0DSlkZZq61F/tzzRMxlXEVqGdVEcCCxRW34DEVPIAZG2bA6Rx+H1UTsEh9FzgcDR5TnskXdeTlF6Ig9RbqRTUWKHEZFJbUuRY+6JUeD4MVGV9TbRN8zKUbNdbpESVTbRO0Dw0WG22BG4dK3RwWA9EWtytlFZ8RoRiI32kU6ZMdFdGoXR1JzzUT2zwBGKRIWR7wODkWAdS1FAPcQRVECT0XnCcxFdcmuRWT65kQ0MT9FxDbuRB37BkitjxxFpz8MRpOQA0WSwz5G/OoOSGg2IkYyrapFQ5YiRd/5FUelB7ZFagwaRfmJzEUrpLtE463QRMkh90ZE7B1GURsxRYi0P0WwInpGnrQ5RuS2skhQJiVFAGkARcAvq0aATQlFW0PPRHDEc0ZSdpVFU0xgR+R33kRU0klFIh3zRrrqIEW9a8pFJA7SRCJVykSALxdGChE2RRg9yEX5Ls9Ext0EScj2F0UYz2FHlQwCRbRyMUWoCjBGBqUURgVQvUTVx+RFs12zRVtNPkXIGRhFn4bIRgXFBUaleqREpZb9RHCDuEUR0itG4KX0RBAYCEYfHOZEI/LvRfRDEkYErr5EySumRHysikf5yBdGfPKsRsmMOEffOJdFdHrhRbEz40Rx5r1F3zSkRvy1IUUKeMFEhigURlCt60QoIeVEDa8cR1DpHEcCsLpF/mIXRuwTT0ahzIJI7gwpRaalyUQeNfJGsGW4RrUnzEbW689EKxoKRSZhF0WuckNFPbr7RNH4BkXSlK9ERTwURawSrUf35D1G0pAQRXgi8UR9TGxGqE86RWUN30TH0O1EFW+7RHfIJkYxK05FaCfkRVzHyURVrt1Erq4UReUrGEXgVgNHiFrbRdj8r0RJyjlFNncMRT84QEVbW99E8PsWRpIMtEXA8SBFfVi9RPczikeT/W5Gs/h4RmI+JEaUZs9ECVI+RauoX0aKd/ZEfQIpRarwLkWqytpGU2c1Rrf+BEZec/tEzEhsRvESAUVk85dG8QwZRzDsAUX3DFRF1koaR8o6lEUN6BxF+gI1ReYbwEXLYh5IucLrRvn8NkWyFxtGk7QrRQnmBUaUTjhF07c4RkvdM0VjOtBFc7w2Rj+LuUgXoYBHbGocRWBU6kRFYPZFoM4+Rsyx00YXeiRFyvWbRVtsK0aiqA9GuvskRUuQ4EZHaIVGLB8kRewjz0RNkQBF+IFJRrTY7URROqtFbnksRV05zUb8NidFSbSmRjNrsUVCQVlFE24LRjJk8UV4cB1Fcwn/RDHC3USV98xEepkURbUGX0f+btdFhz5KRuLZOkVapQNF/mf2RMByzUS4mMVFQT+7RG7EhEYn5QFH2mAWRYXkukXqKPZERREhRvrsBUXZHeJETCj2Rae5IUZYzxRFtJq4RZQK20TTDb5FKDVZRUJMpUWLIHRIBfAURa3PYUcreetFvukNRg6P7EWXBS5G0GQfRS8gFEbcYdJEMUreROrLDUeoci9Fi5RZRr68kkaxCQRFkNn3RFLu+kY3jgxGwhQORd3rjUZc7XtH
6pu7RiXM1ERaVrNGmNJoRe0EvEXPvg1FMv3ERtwRFkYv9gRFkqDBRYHBjUafJaFGJt5NRQf43USGcLlFmScgRtbQsEaGYDlFJEETRbYZMUVISl9Ft5onRsnMrkWYjH1FzKRQR64dn0XhsbZG18rHRerD+US8QIZGgGqDRje9KEZ9XxNFCfIrRoIdSkbh0ztFf5mpRrijV0Uhd/pFSbcCRU4r3UQeZlFFyF4WRWpXMUWUrCRFfp+5R/1grkY59AFG1ZP/RZlyREX0fABFS8msRdFwCkb3NcpHIhXFRf0+50UC+7pGcPoWRRAa8kTM6khHc285Rvy4VUaSegFGkcZlSPEz9UearSNHqAsqRjXuAUU0U/BE7TgDRZlWwUTTxQxGeJH6ROFyzUXsWoRGPkkARmnUl0bxNahFAxIuRdcMyESPM95Ev5urReGqVUV9EbpFZlK3RVDVJ0Ue1ypF2OkoRZpLFEUCw5pG8KRcRmyrskX2a/5EPbsFRVPTTEXMfiFG9mEDRkUJWUbQ1blFoWEgRX1Nt0XhsAFFcsj9RPSw/EW/7gdH+0OXRutT2UWNfQRFaTGyRd+ZF0ZJYrZFWC3SRdntyEWyEDJGABoRRpSONUcQcPtGVg1FRq9o4kXxsv1Ebv4JRlpRAUUcNilFnKzuRcAgAkU6VvtFIBNLRThIREa7Ub5Fg3n8RHwHR0VjjMJFeMjARbK2YkawHf9F/INXRRG2KUVUSvlEXJOHRov2REXE+P1F6S27RV0lBUXrZ6pGUIiHRQRjE0bAkg5GWZuSRf0hEEaXAB5FOzK7RRO9I0WD9p5GNP4wRdLXFkURU1JG0BcvRljKrkVbBSRFKQgfRSk+N0Xk9fNFJzkRRz9iE0dhg7dG86QpRvd0G0bbwjNFD3A0R6D19Ebl2D1GmEIQRXc/wkXt6yFFbMYJRRdYLEVMzwtFeNXjRGgq9ETGhwdFWYoARcChA0UbvIxGrBEmRkeEHEU14PJEn0A7RfWMoUZx1s9GfW0gRhbVGEU9aAxGwNtCRWO9A0WVGTNGKM6TSB+jRUhe3ZpHJ0qvRsKEHUW2Pw9Fh9MdRY306ETeEuZFS/hQRY5tCEXfqSdFVIccRnRHW0ZD5LNFeaywRZms7ERecqpFJIulRSj7lkbPnMJF6+UkRU2b+kRPIQZFhkJNRbjXAkWR3BpFaUPjRcpZRkWIWzZGGVwxRsq2TEbcO11FPebfRZ+RwUXDZeFGL8oeRrxSDkWMYjRFZhsFRdWE3ETPgT1GuqMLRu3kE0UO4kZFBpFdRbsU7ESeP/lEJ2A4RbrwvkWkbwFF9ffoRAfzP0bUkTpFGlYARQ98XEYPZQJGRJc+RiAOdkXne9ZF0lo6RZlEOUX4nUFHm1C+RtsK2UVmoC1F72U5RTpvZkfG3KVG2WXQRRyVE0V6oxBF+h5TR8hkqEZv//pFUwHwRC19WkXH5RRFl/sORfuXw0VfusVFbcY6RrJNNUWo9DRG5DVuRtEgdkZR0RpGcxFcRav1Y0Z3IDVFQiM9RrhdUEUpJVVFKvmpRed6B0Uy5DpFlxXdRJ/nh0YO3NdFtLk1RQ1OVEVXzL5FJfIURfFn+ETZMh5F+cKvRYGbWkcbAvhE4vsSR1yaKkb8UbpF6RRARaueFkWZtxdFT3lXRmWqyEV2ZCJFFZFORTW19kZy4L5GWcRYRSYrPkV6ehZFiF1CR+yH00bgj+5Fm7qyRTDuGUXs28hG8f5XRqz8OEVOiaFFZYwZRm1wWkb737lGzo4XRgwqV0WuQPtEHDVDRfY1B0VafslF + + + + + + + + + 3050 + 9318 + 16593 + + +32398 +0 + \ No newline at end of file diff --git a/pypgatk/testdata/test_blast_validate_psms.tsv b/pypgatk/testdata/test_blast_validate_psms.tsv new file mode 
100644 index 0000000..95ffba2 --- /dev/null +++ b/pypgatk/testdata/test_blast_validate_psms.tsv @@ -0,0 +1,7 @@ +PSH sequence PSM_ID accession unique database database_version search_engine search_engine_score[1] modifications retention_time charge exp_mass_to_charge calc_mass_to_charge spectra_ref pre post start end opt_global_q-value opt_global_cv_MS:1002217_decoy_peptide opt_global_cv_MS:1000889_peptidoform_sequence SpecFile ScanNum position +PSM YHTINGHNAEVR 0 "ENSP00000504571.1,ENSP00000503242.1,ENSP00000503961.1,ENSP00000504660.1,ENSP00000497298.1,ENSP00000503452.1,ENSP00000504799.1,ENSP00000503190.1,ENSP00000503898.1,ENSP00000503968.1,ENSP00000503885.1,ENSP00000504049.1,ENSP00000503550.1,ENSP00000503870.1,ENSP00000503521.1,ENSP00000503236.1,ENSP00000503360.1,ENSP00000503021.1,ENSP00000503915.1,ENSP00000503460.1,ENSP00000346694.4,ENSP00000478691.2,ENSP00000504439.1,ENSP00000504329.1,ENSP00000503476.1,ENSP00000504831.1,ENSP00000504023.1,ENSP00000504721.1,ENSP00000503514.1,ENSP00000503375.1,ENSP00000349101.8,ENSP00000503047.1,ENSP00000503833.1,ENSP00000503836.1,ENSP00000503703.1,ENSP00000503429.1,ENSP00000354021.4,ENSP00000504415.1,ENSP00000503060.1,ENSP00000503501.1,altorf_ENST00000679318.1_2,altorf_ENST00000677339.1_2,altorf_ENST00000678501.1_2,altorf_ENST00000676903.1_2,altorf_ENST00000608362.2_2,altorf_ENST00000677631.1_2,altorf_ENST00000676749.1_2,altorf_ENST00000678035.1_1,altorf_ENST00000678075.1_1,altorf_ENST00000678183.1_3,altorf_ENST00000679021.1_3,altorf_ENST00000677321.1_2,altorf_ENST00000677571.1_2,altorf_ENST00000677906.1_2,altorf_ENST00000678277.1_3,altorf_ENST00000678973.1_2,altorf_ENST00000679124.1_2,altorf_ENST00000679123.1_2,altorf_ENST00000677574.1_2,altorf_ENST00000678631.1_2,altorf_ENST00000678998.1_2,altorf_ENST00000354667.8_2,altorf_ENST00000618183.5_2,altorf_ENST00000677839.1_2,altorf_ENST00000676746.1_3,altorf_ENST00000678675.1_3,altorf_ENST00000676524.1_2,altorf_ENST00000678935.1_2,altorf_ENST00000678962.1_2,altorf_ENST00000679001.1_2,
altorf_ENST00000678449.1_2,altorf_ENST00000356674.8_3,altorf_ENST00000678697.1_2,altorf_ENST00000678431.1_2,altorf_ENST00000676497.1_2,altorf_ENST00000677396.1_2,altorf_ENST00000678779.1_2,altorf_ENST00000360787.8_2,altorf_ENST00000679243.1_2,altorf_ENST00000677656.1_2,altorf_ENST00000678884.1_2,ncRNA_ENST00000677075.1_1,ncRNA_ENST00000476233.2_2,ncRNA_ENST00000676932.1_2,ncRNA_ENST00000677669.1_3,ncRNA_ENST00000490912.6_3,ncRNA_ENST00000463181.5_2,ncRNA_ENST00000495810.2_2,COSMIC:HNRNPA2B1_ENST00000618183:p.R225S:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.H108P:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.K104N:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.R190G:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.G65V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.M53I:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.L37*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000618183:p.E11G:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.E92Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.E133Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.D87H:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000618183:p.M1?:,COSMIC:HNRNPA2B1:p.G233A:Substitution-Missense,COSMIC:HNRNPA2B1:p.G280C:Substitution-Missense,COSMIC:HNRNPA2B1:p.G224*:Substitution-Nonsense,COSMIC:HNRNPA2B1:p.N255Y:Substitution-Missense,COSMIC:HNRNPA2B1:p.K104N:Substitution-Missense,COSMIC:HNRNPA2B1:p.G285S:Substitution-Missense,COSMIC:HNRNPA2B1:p.H108P:Substitution-Missense,COSMIC:HNRNPA2B1:p.R190G:Substitution-Missense,COSMIC:HNRNPA2B1:p.G65V:Substitution-Missense,COSMIC:HNRNPA2B1:p.M53I:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G221A:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G268C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.N243Y:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G212*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.G273S:Substitution-Missense,COSMIC:HNRN
PA2B1_ENST00000356674:p.R178G:Substitution-Missense,COSMIC:HNRNPA2B1:p.L37*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.H96P:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.K92N:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G53V:Substitution-Missense,COSMIC:HNRNPA2B1:p.G214V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.M41I:Substitution-Missense,COSMIC:HNRNPA2B1:p.G237V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.L25*:Substitution-Nonsense,COSMIC:HNRNPA2B1:p.R203K:Substitution-Missense,COSMIC:HNRNPA2B1:p.Y336C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G202V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G225V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.R191K:Substitution-Missense,COSMIC:HNRNPA2B1:p.G332C:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.Y324C:Substitution-Missense,COSMIC:HNRNPA2B1:p.E11G:Substitution-Missense,COSMIC:HNRNPA2B1:p.E133Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G320C:Substitution-Missense,COSMIC:HNRNPA2B1:p.E92Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.E121Q:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.E80Q:Substitution-Missense,COSMIC:HNRNPA2B1:p.M1?:,COSMIC:HNRNPA2B1:p.D87H:Substitution-Missense,COSMIC:HNRNPA2B1:p.G248*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.D75H:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.M1?:,COSMIC:HNRNPA2B1:p.G217V:Substitution-Missense,COSMIC:HNRNPA2B1_ENST00000356674:p.G236*:Substitution-Nonsense,COSMIC:HNRNPA2B1_ENST00000356674:p.G205V:Substitution-Missense,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation,cbiomut:ENST00000354667:HNRNPA2B1:p.D76H:Missense_Mutation" 0 PXD014145_decoy "[, , Percolator, 3.05]" 0.642512 436.4756905 3 470.901519 470.9006154 ms_run[8]:controllerType=0 controllerNumber=1 scan=1500 
"K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K" "K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K,K" "162,162,162,162,162,162,162,122,122,174,174,162,162,162,162,162,174,162,162,162,174,162,162,174,174,162,162,162,174,162,174,162,162,162,162,162,174,162,162,162,395,455,395,395,395,395,395,474,474,326,326,395,395,395,382,395,395,439,455,395,395,230,218,395,326,326,395,395,395,439,455,326,395,455,395,395,395,230,455,395,395,566,395,395,431,1358,1118,395,174,174,174,174,174,174,173,174,174,174,174,174,174,174,174,174,174,174,174,174,174,174,162,162,162,162,162,162,173,162,162,162,174,162,174,161,174,174,162,162,162,174,162,174,174,162,174,162,162,174,174,174,162,162,174,162,162,174,174,174" "173,173,173,173,173,173,173,133,133,185,185,173,173,173,173,173,185,173,173,173,185,173,173,185,185,173,173,173,185,173,185,173,173,173,173,173,185,173,173,173,406,466,406,406,406,406,406,485,485,337,337,406,406,406,393,406,406,450,466,406,406,241,229,406,337,337,406,406,406,450,466,337,406,466,406,406,406,241,466,406,406,577,406,406,442,1369,1129,406,185,185,185,185,185,185,184,185,185,185,185,185,185,185,185,185,185,185,185,185,185,185,173,173,173,173,173,173,184,173,173,173,185,173,185,172,185,185,173,173,173,185,173,185,185,173,185,173,173,185,185,185,173,173,185,173,173,185,185,185" 0.0561798 0 YHTINGHNAEVR test_blast_validate.mzML 1500 canonical +PSM KMVSLAK 4 pseudo_ENST00000454683.1_2 1 PXD014145_decoy "[, , Percolator, 3.05]" 0.668987 741.1 2 388.740661 388.7385759 
ms_run[1]:controllerType=0 controllerNumber=1 scan=3252 R N 443 449 0.0947368 0 KMVSLAK test_blast_validate.mzML 3252 2 +PSM KMVSLAK 4 pseudo_ENST00000454683.1_2 1 PXD014145_decoy "[, , Percolator, 3.05]" 0.668987 741.1 2 388.740661 388.7385759 ms_run[1]:controllerType=0 controllerNumber=1 scan=3252 R N 443 449 0.0947368 0 KMVSLAK test_blast_validate.mzML 3252 4 +PSM KMVSLAK 4 pseudo_ENST00000454683.1_2 1 PXD014145_decoy "[, , Percolator, 3.05]" 0.668987 741.1 2 388.740661 388.7385759 ms_run[1]:controllerType=0 controllerNumber=1 scan=3252 R N 443 449 0.0947368 0 KMVSLAK test_blast_validate.mzML 3252 5 +PSM KMVSLAK 4 pseudo_ENST00000454683.1_2 1 PXD014145_decoy "[, , Percolator, 3.05]" 0.668987 741.1 2 388.740661 388.7385759 ms_run[1]:controllerType=0 controllerNumber=1 scan=3252 R N 443 449 0.0947368 0 KMVSLAK test_blast_validate.mzML 3252 6 +PSM AAMAAWPPAAQAAAAAVAVVGGGGEPGAPR 8 "altorf_ENST00000247706.4_2,altorf_ENST00000593489.1_2" 0 PXD014145_decoy "[, , Percolator, 3.05]" 0.547212 1209.2 5 529.4764486 529.4750268 ms_run[5]:controllerType=0 controllerNumber=1 scan=6341 "R,R" "G,G" "183,147" "212,176" 0.0526316 0 AAMAAWPPAAQAAAAAVAVVGGGGEPGAPR test_blast_validate.mzML 6341 non-canonical diff --git a/pypgatk/testdata/test_validate.mzML b/pypgatk/testdata/test_validate.mzML new file mode 100644 index 0000000..78474ce --- /dev/null +++ b/pypgatk/testdata/test_validate.mzML @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
AAAAAIuDWUAAAAAgkZNcQAAAAAA6Ol1AAAAAQGpSXUAAAAAAMIhfQAAAAIAByF9AAAAAoGbIX0AAAACgLvpfQAAAAGAbBGBAAAAAgE4EYEAAAABgNiRgQAAAAABpJGBAAAAAoFFEYEAAAAAghURgQAAAAGBtZGBAAAAAIJziYUAAAADA255iQAAAAODZy2JAAAAAoB0iY0AAAACgnWJjQAAAAIAkhGNAAAAAgEKkY0AAAACAneJkQAAAAEDJpGVAAAAAQOXEZUAAAADgzuNlQAAAAAAfBWZAAAAAIBskZ0AAAAAAHIVnQAAAAMB2YmhAAAAAQEniaEAAAACAduNoQAAAAODMImlAAAAAYL2iakAAAAAgTWNsQAAAAECeY2xAAAAAQHPFbEAAAACAk+VsQAAAAAAjA25AAAAA4JkibkAAAACgc6JuQAAAAEDOqm9AAAAAQC34b0AAAACgMzFwQAAAAIB4MXBAAAAAICFxcEAAAACgh4NxQAAAAAB8QXJAAAAAIH1jckAAAACge2tyQAAAAMCRgnJAAAAAQOdRc0AAAACAp2FzQAAAAADpk3NAAAAAgNG7c0AAAAAgUuNzQAAAAIBb83NAAAAAIBVydEAAAACAxJt0QAAAAKAe9HVAAAAAICH8dUAAAACgVEJ2QAAAAMDtr3ZAAAAAYBLUdkAAAACgG9x2QAAAAGAm83ZAAAAAoJ9hd0AAAADAaIR3QAAAACB4lHdAAAAAwIHCd0AAAABAPNJ3QAAAAIAoJHhAAAAAoGoseEAAAADgcTR4QAAAACDkQXhAAAAAgMmBeEAAAADAquJ4QAAAACC68nhAAAAAYGAMeUAAAABgZRR5QAAAACBcHHlAAAAAgOG0ekAAAAAAyBR8QAAAAAAVdXxAAAAAACBtfUAAAADAPIV9QAAAAIBClX1AAAAAAFClfUAAAAAgXLV9QAAAAEAURX5AAAAAQBhNfkAAAAAAAlN+QAAAAMAaVX5AAAAA4CFdfkAAAACAEmN+QAAAAABfWH9AAAAA4An0f0AAAACgdnuBQAAAACDQwYFAAAAAAE8ygkAAAADg40KCQAAAAGDnSoJAAAAAgGtbgkAAAAAAdGOCQAAAAKDjqoJAAAAAYO6ugkAAAABg9LKCQAAAAIDttoJAAAAA4P66gkAAAADA+dKCQAAAAMD62oJAAAAAoD85g0AAAABAwrODQAAAAEBgK4RAAAAAABVDhEAAAACguZOEQAAAAIDBm4RAAAAAABmzhEAAAADAFreEQAAAAGBEC4VAAAAA4EgThUAAAADgRhuFQAAAAKD8WYVAAAAAYA7shUAAAAAgQEeGQAAAAGBQS4ZAAAAAYEFzhkAAAAAgRLeGQAAAAEBCu4ZAAAAA4Ee/hkAAAABg/MOGQAAAAKAEzIZAAAAAIAvUhkAAAAAgkUOHQAAAAACUS4dAAAAAwJNTh0AAAAAAA1qHQAAAAECA94dAAAAAgFQDiEAAAAAgWiSIQAAAACBcLIhAAAAA4PZciEAAAAAgYGeIQAAAACCXxohAAAAAgEr8iEAAAADgUASJQAAAAOBuB4lAAAAAQFcMiUAAAACgXBSJQAAAAMB8h4lAAAAA4HeLiUAAAACAgc+JQAAAACCM04lAAAAAgIfXiUAAAACAoDeKQAAAAIA9zIpAAAAAgIHPikAAAADggNOKQAAAAEA91IpAAAAAgIvXikAAAACAjduKQAAAAOA33IpAAAAAAFFii0AAAABgtWOLQAAAAGC5Z4tAAAAAANKHi0AAAACgrdOLQAAAAOC014tAAAAAQLfbi0AAAADAxt+LQAAAAKC8DIxAAAAAILVjjEAAAADAq2eMQAAAAABikYxAAAAA4KzTjEAAAACAtNeMQAAAAGCk24xAAAAAIL/njEAAAADg0+uMQAAAAEC/74xAAAAAoIcEjUAAAAAgigyNQAAAAEAOZY1AAAAA4BFtjUAAAABgxOONQAAAAEDB541AAAAAYM3rjUAAAABA
CEWOQAAAAOAPTY5AAAAAoAxVjkAAAABg8e+OQAAAACD4845AAAAAIBb4jkAAAADAAuyPQAAAAMDw749AAAAAYGY+kEAAAACAGXiQQAAAAEB2gpBAAAAAIBS4kEAAAABgE/aQQAAAAKAR+JBAAAAAIBP8kEAAAABgLASRQAAAACAzCJFAAAAAwAFYkUAAAACgvVqRQAAAAMCkhpJAAAAAwN6qkkAAAABgTbqSQAAAACC8mpNAAAAAgMqek0AAAADAONOVQAAAAKBNWJdAAAAAADehmkAAAABgVS6oQAAAACB+pqlA + + + + + + YhbORH8/cUSYHXxEkw6BRHtg3UYOzb1FgCnvRnqmYETpP7FGG2fvRlMEvUa6fzdGUuqoRiRRbkb4aXFGgyqmRLMRkkScLIlEf8uSRN9qkkTezZ5E4O6pRLxes0QliZpEc7+TRDr6/kSCO75Evv8ARZShokQ0laREn/foRFCxlUUYaq9EN0yWROsmHEbHqqFEX8lYR3UZf0S3pv5EKZuHRP5EvEQE8nREeV2LRLOseUTqjc5E2dxqRYS5kEVL2p5E/YKmRB3IkkTKz7hEZZH4RE1likUURpNEFGCcRWF22kRuOOBEfeiURfmJ00Ui/K5FwOL/RMw+F0ZYuIJEioJnRiBfkkWSVJNE8dKNRVnEAUe3cMVFDj+BRH/7qETiG8BEeY22RSQJi0STxqxEStvARAAPA0YMiNBEGBZIRlbioEWfd4BEx9H5RMVDFEVvk8pE8kusRS128UWkITVIEtwmRy2V4kXnztREhwKBRvpirEU3QOBF7EzHRBayBEXCT5VEFSWGRcHZGEVVWrpEYpgJRfh3gEZI7GZFs8CTRuWQnEVsUaREJ7EjRsrDeEXf+fFEjWWaRHADJ0btw4ZFRdqhRIkYsUW+VqhEMgW6RKGIb0ZROoBFnXsfRq7VmUWwB/BG26B3Rihz5UStka5Fq1u0Rf/ybkXfhcBEgjygRFRUEEYCSwhG8qexRXRi+0RGG8ZGg4csRkN+9UZToyFGKj3cRK+hDUXUTJpF8AvSRMan70V4yulE3eKMRIMai0SRQ49EMn/CRAB5HEdwYAdFOTt2RsDigUVp6g5Fwg7vRJmnTUYLShtGdKGlRbqnmUQHinRGEHoRRh5N9UUJ0MtFDH4ARTJq8kTFXtREWHSnROhmF0UnrL1EMYqbRMpx3kXGVy1GP2PoRQnFlUSDjE9FyF7hRPeUr0TNdpNEMmz4RU4aG0ZtOwdF6zX2RDW/AEU64QVFnGYcRhE6h0WQWD1GwfQLRWtBtUQyOBdF7ZndRHwi7EY9CoRGrWW0RRoiFEWYiE9FqxCbRAlGpERHW3tFlI8LRcX+EUV+VspEjQGxRC27gUWTOrlEKBO2RP31eUU877hEpUnHRNTztUQL/sRE2vYURUMlokTTZIVFa2jfRKZcvUR+/qlEjs6rREVbwUSYVLRE + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
AAAAgDgGWUAAAAAAoAZZQAAAAOBvRllAAAAA4GcDWkAAAABAjkRcQAAAAMA7w1xAAAAAgIkEXUAAAACgKMNdQAAAAIAuBV5AAAAAgC+IX0AAAACA+MVfQAAAAGD/x19AAAAAQGbIX0AAAADgGgRgQAAAAGBOBGBAAAAAwO4PYEAAAABAHCJgQAAAACA2JGBAAAAAwGkkYEAAAACAUURgQAAAAACFRGBAAAAAoGxkYEAAAACgiYRgQAAAAIBfoWBAAAAAgLLcYEAAAABAbAJhQAAAAAAaomFAAAAA4MXjYUAAAACgCIZiQAAAAGDDY2NAAAAA4EKkY0AAAAAg98JjQAAAAKDCw2NAAAAAAHDiY0AAAACAQeRjQAAAAGDzomVAAAAAYMqkZUAAAACA5MRlQAAAACDP42VAAAAA4P/kZUAAAACAHAVmQAAAAODxoWZAAAAAYJvjZkAAAAAgHCRnQAAAAODLQmdAAAAAQOREZ0AAAACg/GRnQAAAACAdhWdAAAAAACAjaEAAAADgHKRoQAAAAAB1BGlAAAAAYFCDakAAAAAgyqJqQAAAAKDOImtAAAAA4E+EbEAAAACA+aRsQAAAAOBTpWxAAAAAYHLFbEAAAACAIeNsQAAAACCR5WxAAAAA4HkEbUAAAACgt0ZtQAAAACAMxG5AAAAAQMsFb0AAAACAy8RvQAAAACBlEnBAAAAA4HoxcEAAAACAJTNwQAAAAGC54XBAAAAAQB8BcUAAAAAA9CFxQAAAACC70XFAAAAAYOPScUAAAADAEPNxQAAAAMCp83FAAAAAoAATckAAAAAARSFyQAAAAMDsInJAAAAAYMCyckAAAADAe8JyQAAAAICQ83JAAAAAQJ0Dc0AAAABgUJNzQAAAAKDmk3NAAAAAgFyjc0AAAAAgEaJ0QAAAAGBxo3RAAAAA4L2jdEAAAAAAfLN0QAAAAICKw3RAAAAAYI7TdEAAAABgAwN1QAAAAEAMdHVAAAAAYGODdUAAAABAipJ1QAAAAACUk3VAAAAAQEJTdkAAAABg/nN2QAAAAODdkXZAAAAAICDjdkAAAACgR/12QAAAAKDPQndAAAAA4H1Ed0AAAABAaYR3QAAAAOB3lHdAAAAAgIDjd0AAAAAgdhJ4QAAAAGAnJHhAAAAAAHTDeEAAAAAgfMt4QAAAAODA5HhAAAAAQJD3eEAAAACgagR5QAAAACB4FHlAAAAAAPPbeUAAAACg6QR6QAAAAMD0FHpAAAAAoJ+LekAAAADgqZN6QAAAAACrpHpAAAAAgNDUekAAAABAJDJ7QAAAAGBks3tAAAAAgBi1e0AAAABg1MR7QAAAAKDi1HtAAAAAgPDke0AAAADAAfV7QAAAAGAPBXxAAAAAACAVfEAAAABAzVN8QAAAAADUW3xAAAAAoMOUfEAAAADAyqN8QAAAAOAEtHxAAAAAYBS8fEAAAABg/cJ8QAAAAGBPBH1AAAAA4H51fkAAAACAG4x+QAAAAOBS7H5AAAAAoGD0fkAAAADgjhKAQAAAAIC3GoBAAAAAQL4igEAAAAAAzTKAQAAAAMDSOoBAAAAAIClegEAAAACArIKAQAAAAAA29oBAAAAAYEz8gEAAAAAg3QmBQAAAAMBfYoFAAAAA4Kd6gUAAAADAttqBQAAAAACWhoJAAAAAAN7CgkAAAADg7DqDQAAAAEAYQ4NAAAAA4CJLg0AAAABAd36DQAAAAIDjooNAAAAAIOeqg0AAAADA6rKDQAAAAOAjw4NAAAAAQCrLg0AAAAAgKdODQAAAACAOI4RAAAAAoA4rhEAAAAAgFzOEQAAAAGBBq4RAAAAAoPqChUAAAAAA/YqFQAAAAOA50IVAAAAAoDtLhkAAAACAP1OGQAAAAOBEW4ZAAAAAgJjDhkAAAAAgz0uHQAAAAIDRU4dAAAAAANVbh0AAAACgZX+HQAAAAKCWo4dAAAAAwGzbh0AAAACgVkOIQAAAACBXS4hAAAAAYFRTiEAAAADA
O5OIQAAAAOBgu4hAAAAAYF7DiEAAAADgr+uIQAAAAGCy84hAAAAAwJ8LiUAAAACASiOJQAAAAABRK4lAAAAAwEsziUAAAADA4tOJQAAAAKD224lAAAAAgJaDikAAAACAkIuKQAAAAMB31IpAAAAAwH7cikAAAAAAA8yLQAAAAID604tAAAAAQLRLjEAAAABgvFOMQAAAACBalIxAAAAAYParjEAAAADA/rOMQAAAAGD4u4xAAAAAwD3kjkAAAAAAR+yOQAAAAGBP9I5AAAAAAPlMj0AAAAAA/VSPQAAAAID0XI9AAAAAYPcVkEAAAADARDCQQAAAAGA3MpBAAAAA4FZOkEAAAACAUFCQQAAAAGBRUpBAAAAA4FdykEAAAADgXnSQQAAAAMBddpBAAAAA4Fx4kEAAAADALvaQQAAAAEB8LpFAAAAAoDw+kUAAAACAQEKRQAAAAEBGRpFAAAAAYJBykUAAAACAnnaRQAAAAKCPepFAAAAAQDqCkUAAAACAuo6SQAAAAMDAkpJAAAAA4Ht6k0AAAADgen6TQAAAACB1gpNAAAAAwMw+lUAAAAAgykKVQAAAAMDCRpVAAAAAwCjTlkAAAADAJ9eWQAAAACBLz5hAAAAAAIDzmUAAAABAS4CcQAAAAMAUcJ5AAAAAoDB8nkAAAABAK6CeQA== + + + + + + V7FGRTonKUX7UFtF0PNERbFaf0VBqDlFkw2NRjDaIEVYBVZF2BT6R1BSDEX1U3FHavM5SGLmXEiiBW9IfyDpRDJiYUVhMAhIq4HjR1qwLEh6LQJICXwGSMwjKkUE5lRF4uYMRbQ7OEehz0dFumpIRRp+9kQz+BVFodb8RJaY3EVpvghF9f4qRY95H0XnKSFFwyT0RjO3jEbReGVGyESxRqBd7kboKghGqLceRSLFEEXA1BtF1pHERqqYB0VvdORGfmsbRZpJgEVI2gtFX1ENRco8dEV88gNFUFQKRvVBWUVQCdxFleCUSKbEVEXBnepGoHcMRbg0EUXplOlFM81URhlVdEXS3T9F5yA8Rt4zQ0aeIiJFAdhlRTLM90WLuxpF+lhDReCDX0c+PRRF2pkHRU3VGUV1HApFr/wORnbLFUVVVI1HgEcKRgZ8Dkc/zy9FDZw+RXnISEav30FFliPfRQccQEjnv+NGyuoNRcPH7kTU0z5FBU9YRUIBCUaE+QRF4Dc0Rb1/40UNIARFg/X3RFCZ60Rhs2pF+jFKRu+qC0hblOBGYhwkRQHI8EXXyE9FGJReRjtfTUXgLF9F1dEHRcyUiUYNbABFVccVRt4gSkfDrH9FdivCRtKBYEUxNblGwqpwRZPAHUVX62hF6EsmRqV6AUj+wf1GvcklRfuGwUc3AZJGh5spRdUf50Xfn2VFJpHYRdBpAEX8cDhF6/MWRez3U0UA2hNF2hCiRgM1gUVweE1Fl74XRVlEMEXkfLhHWt/2RtsobUfoI3VGgzYgRe/nHEWKWiNFdC4PReMlFEV9HhVF9TcgRYvPE0bLVyBFaV0mRdyET0Y/pdxGN2rlRX9UDkXyDkBHABJCRhpnXEUPyrFHl8XKRreKjEWNQslH+WYCR6QYH0Y+CxFGE7IpRmpQbkUzpTZFb+QAR2G4LEbPtoZFT5hiRfq3Q0dzrLhGyRsxRblaE0VpOiBFvfECRjfiF0fu9kNGGgETRTT/CEWVTwlG95cwRa4hvEYjzSBG798YRsjvi0cGfPZGGRshRvz8AkbGaMVFlm94RV0cM0U0fKdG7IcoRkL1QkafcVxFt74vRjjVg0WfYStFznEVR1gpXUa2RYVF2qPRRm6neUZoA2RFXjYOR/evmUawRRBGeNBHRQrkQUWmhB1F6s10RXQ+60VVPAdGokqFRv9GukbgzatGAur7RVs4YUWVQDtFaPOER8r1MUePU5NG+BBKRmR68UVZbiRFHdUrRW0HfEboRlRGOyPYRukCi0ZOARBGfpeIRsYkMEbiMWBFlGJnRTxaGkW
Ls0BFF90jRRs3EkWyFUJF3YNMRd8fFUU= + + + + + + + + + 3759 + 11862 + + +20562 +0 + \ No newline at end of file diff --git a/pypgatk/testdata/test_validate_psms.tsv b/pypgatk/testdata/test_validate_psms.tsv new file mode 100644 index 0000000..403b7ba --- /dev/null +++ b/pypgatk/testdata/test_validate_psms.tsv @@ -0,0 +1,3 @@ +SpecFile Biological.set Retention.time.min. Ion.injection.time.ms. SpecID ScanNum FragMethod Precursor IsotopeError PrecursorError(ppm) Charge Peptide Protein DeNovoScore MSGFScore SpecEValue EValue percolator.svm.score PSM.q.value peptide.q.value tmt10plex_126 tmt10plex_127N tmt10plex_127C tmt10plex_128N tmt10plex_128C tmt10plex_129N tmt10plex_129C tmt10plex_130N tmt10plex_130C tmt10plex_131 position Variant Peptide +test_validate.mzML Set1 59.558592 83.3344385 controllerType=0 controllerNumber=1 scan=19937 19937 HCD 1052.5897 0 -0.115971394 2 +229.163TIAEC+57.021LAEELINAAK+229.163 "=_18600958@4.20978040680119@fr10:1378089(pre=-,post=-)" 188 171 5.79E-18 6.76E-10 1.897 0 0 128042 61780 190414 226202 244759 139458 116483 176833 133302 137712 8 TIAECLAEELINAAK +test_validate.mzML Set1 27.764549 150.000006 controllerType=0 controllerNumber=1 scan=8461 8461 HCD 1068.8729 2 -7.502769 3 +229.163K+229.163AAAPTPEEEMDEC+57.021EQALAAEPK+229.163 "=_21935565@4.06037609192942@fr8:1746571(pre=-,post=-)" 192 55 7.83E-12 0.001002411 0.854 0.007575758 0.005524862 28336.2 6073.63 30612.8 22688 30643.6 24194.2 11743.9 21621.2 15252.3 15450.5 6 KAAAPTPEEEMDECEQALAAEPK diff --git a/pypgatk/tests/pypgatk_tests.py b/pypgatk/tests/pypgatk_tests.py index 0f885f2..bf37fea 100644 --- a/pypgatk/tests/pypgatk_tests.py +++ b/pypgatk/tests/pypgatk_tests.py @@ -1,5 +1,4 @@ import unittest - from click.testing import CliRunner from pypgatk.pypgatk_cli import cli @@ -283,6 +282,33 @@ def test_check_ensembl_database(self): 'testdata/proteindb_from_ENSEMBL_VCF-clean.fa', '--add_stop_codons', '--num_aa', '6']) self.assertEqual(result.exit_code, 0) + # @pytest.mark.skip(reason="Not 
working with pytest pooling") + # def test_validate_peptides_msgf(self): + # runner = CliRunner() + # result = runner.invoke(cli, + # ['validate_peptides', '--mzml_path', 'testdata', + # '--infile_name', 'testdata/test_validate_psms.tsv', '--outfile_name', + # 'testdata/test_validate_psms_out.tsv', '--msgf']) + # print("ERROR IN RESULT: + " + str(result.exception) + " + " + result.output) + # self.assertEqual(result.exit_code, 0) + + def test_blast(self): + runner = CliRunner() + result = runner.invoke(cli, + ['blast_get_position', '--input_psm_to_blast', 'testdata/test_blast_psms.tsv', + '--output_psm', 'testdata/test_blast_psms_out.tsv', '--input_reference_database', + 'testdata/test_blast_reference_database.fa']) + self.assertEqual(result.exit_code, 0) + + # @pytest.mark.skip(reason="Not working with pytest pooling") + # def test_blast_out_validate(self): + # runner = CliRunner() + # result = runner.invoke(cli, + # ['validate_peptides', '--mzml_files', 'testdata/test_blast_validate.mzML', + # '--infile_name', 'testdata/test_blast_validate_psms.tsv', '--outfile_name', + # 'testdata/test_blast_validate_psms_out.tsv']) + # self.assertEqual(result.exit_code, 0) + if __name__ == '__main__': unittest.main() diff --git a/requirements.txt b/requirements.txt index 3a06ea9..2bc3b1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,17 @@ -biopython==1.73 -Click==7.0 -gffutils==0.10.1 +biopython +Click +gffutils numpy -PyYAML==5.1.2 -requests==2.21.0 -simplejson==3.16.0 -ratelimit==2.2.1 -pyteomics==4.4.2 +PyYAML +requests +simplejson +ratelimit +pyteomics +pathos pybedtools pandas pyopenms +matplotlib +pytest +tqdm +pyahocorasick diff --git a/setup.py b/setup.py index 1005544..9565dfc 100644 --- a/setup.py +++ b/setup.py @@ -19,18 +19,22 @@ def readme(): license='LICENSE.txt', include_package_data=True, install_requires=[ - 'biopython==1.73', - 'Click==7.0', - 'gffutils==0.10.1', + 'biopython', + 'Click', + 'gffutils', 'numpy', 'pandas', - 'PyYAML==5.1.2', - 
'requests==2.21.0', - 'simplejson==3.16.0', - 'ratelimit==2.2.1', - 'pyteomics==4.4.2', + 'PyYAML', + 'requests', + 'simplejson', + 'ratelimit', + 'pyteomics', + 'pathos', 'pybedtools', - 'pyopenms' + 'pyopenms', + 'matplotlib', + 'tqdm', + 'pyahocorasick' ], python_requires=">=3.6", scripts=['pypgatk/pypgatk_cli.py'],