Merge pull request #79 from DongdongdongW/spectrumAI

update new blast and spectrumAI
bigbio · Apr 24, 2024 · 96341a4 · 96341a4
2 parents a0193ec + 114de48
commit 96341a4
Show file tree

Hide file tree

Showing 8 changed files with 135 additions and 584 deletions.
diff --git a/pypgatk/commands/validate_peptides.py b/pypgatk/commands/validate_peptides.py
@@ -21,12 +21,12 @@
 @click.option('-n', '--number_of_processes', help='Used to specify the number of processes. Default is 40.')
 @click.option('-r', '--relative', help='When using ppm as ions_tolerance (not Da), it needs to be turned on',
               is_flag=True)
-@click.option('-msgf', '--msgf',
-              help='If it is the standard format of MSGF output, please turn on this switch, otherwise it defaults to mzTab format',
+@click.option('-m', '--mztab',
+              help='If the tsv was obtained from mzTab, please enable this option. Default to tsv obtained from parquet',
               is_flag=True)
 @click.pass_context
 def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outfile_name, ions_tolerance,
-                      number_of_processes, relative, msgf):
+                      number_of_processes, relative, mztab):
     config_data = None
     if config_file is not None:
         config_data = read_yaml_from_file(config_file)
@@ -47,8 +47,8 @@ def validate_peptides(ctx, config_file, mzml_path, mzml_files, infile_name, outf
         pipeline_arguments[ValidatePeptidesService.CONFIG_NUMBER_OF_PROCESSES] = number_of_processes
     if relative is not None:
         pipeline_arguments[ValidatePeptidesService.CONFIG_RELATIVE] = relative
-    if msgf is not None:
-        pipeline_arguments[ValidatePeptidesService.CONFIG_MSGF] = msgf
+    if mztab is not None:
+        pipeline_arguments[ValidatePeptidesService.CONFIG_MZTAB] = mztab
 
     validate_peptides_service = ValidatePeptidesService(config_data, pipeline_arguments)
     if validate_flag:

diff --git a/pypgatk/proteogenomics/blast_get_position.py b/pypgatk/proteogenomics/blast_get_position.py
@@ -8,61 +8,69 @@
 
 from pypgatk.toolbox.general import ParameterConfiguration
 
+def get_details(fasta, peptide):
+    res = []
+    i = 0
+    j = 0
+    for AA1, AA2 in zip(fasta, peptide):
+        i += 1
+        j += 1
+        if AA1 == AA2:
+            continue
+        else:
+            res.append(str(i) + "|" + AA1 + ">" + AA2)
+    return res
 
-def _blast_set(fasta_set, peptide):
+def peptide_blast_protein(fasta, peptide):
     length = len(peptide)
-    position_set = set()
-    for fasta in fasta_set:
-        if len(fasta) >= length:
-            alignments_score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0, score_only=True)
-            if alignments_score == length:
-                return "canonical"
-            elif alignments_score == length - 1:
-                alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=0, open=-1, extend=0)
-                for alignment in alignments_local:
-                    # insertion e.g., ABCDMEFGH<----ABCDEFGH
-                    if alignment.end - alignment.start == length + 1:
-                        s = fasta[alignment.start:alignment.end]
-                        for i in range(length):
-                            if peptide[i] != s[i]:
-                                position_set.add(i + 1)
-                                break
-                    # substitution e.g., ABCDMFGH<----ABCDEFGH
-                    elif alignment.end - alignment.start == length:
-                        s = fasta[alignment.start:alignment.end]
-                        for i in range(length):
-                            if peptide[i] != s[i]:
-                                position_set.add(i + 1)
-                                break
-                    # substitution e.g., ABCDEFGM<----ABCDEFGH
-                    elif alignment.end - alignment.start == length - 1:
-                        s = fasta[alignment.start:alignment.end]
-                        if peptide[0] != s[0]:
-                            position_set.add(1)
-                        elif peptide[-1] != s[-1]:
-                            position_set.add(length)
-            elif alignments_score == length - 2:
-                alignments_local = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, match=1, mismatch=-1,
-                                                           open=-1, extend=0)
-                for alignment in alignments_local:
-                    # deletion e.g., ABCEFGH<----ABCDEFGH
-                    if alignment.end - alignment.start == length and alignment.score == length - 2:
-                        s = fasta[alignment.start:alignment.end - 1]
-                        if pairwise2.align.localms(sequenceA=s, sequenceB=peptide, match=1, mismatch=0, open=0,
-                                                   extend=0, score_only=True) == length - 1:
-                            for i in range(length - 1):
-                                if peptide[i] != s[i]:
-                                    position_set.add(i + 1)
-                                    break
-    if position_set:
-        return position_set
+    mismatch = []
+    if len(fasta) >= length:    
+        score = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, 
+                                                       match=1, mismatch=0, open=-2, extend=-2, score_only=True)
+        if score == length-1:
+            alignment = pairwise2.align.localms(sequenceA=fasta, sequenceB=peptide, 
+                                                       match=1, mismatch=0, open=-2, extend=-2)[0]
+            if alignment.end - alignment.start == length:
+                mismatch = get_details(alignment.seqA[alignment.start:alignment.end], alignment.seqB[alignment.start:alignment.end])
+            elif alignment.end - alignment.start == length-1:
+                res = get_details(alignment.seqA[alignment.start:alignment.end+1], alignment.seqB[alignment.start:alignment.end+1])
+                if len(res) == 1:
+                    if res[0].split(">")[1]!="-":
+                        mismatch = res
+                    else:
+                        mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
+                elif len(res) == 0:
+                    mismatch = get_details(alignment.seqA[alignment.start-1:alignment.end], alignment.seqB[alignment.start-1:alignment.end])
+                else:
+                    print("Number of mismatch Error")
+    return mismatch
+
+def _blast_set(fasta_dict, peptide):
+    positions = dict()
+    for fasta in fasta_dict.keys():
+        mismatch = peptide_blast_protein(fasta, peptide)
+        if len(mismatch) == 1:
+            if positions.get(mismatch[0]):
+                positions[mismatch[0]].update(fasta_dict[fasta])
+            else:
+                positions[mismatch[0]] = fasta_dict[fasta]
+        elif len(mismatch) > 1:
+            print("Number of mismatch > 1")
+            print(peptide)
+            print(fasta)
+            print(mismatch)
+    if positions:
+        res = []
+        for key,value in positions.items():
+            splits = key.split("|")
+            splits.append(",".join(value))
+            res.append(splits) 
+        return res
     else:
         return "non-canonical"
 
-
 class BlastGetPositionService(ParameterConfiguration):
     CONFIG_KEY_BlastGetPosition = 'blast_get_position'
-    # CONFIG_CANONICAL_PEPTIDE_PREFIX = 'canonical_peptide_prefix'
     CONFIG_INPUT_REFERENCE_DATABASE = 'input_reference_database'
     CONFIG_NUMBER_OF_PROCESSES = 'number_of_processes'
 
@@ -79,9 +87,14 @@ def __init__(self, config_data, pipeline_arguments):
         self._number_of_processes = self.get_blast_parameters(variable=self.CONFIG_NUMBER_OF_PROCESSES,
                                                               default_value='40')
 
-        self.fa_set = set()
+
+        self.fasta_dict = dict()
         for j in SeqIO.parse(self._input_reference_database, "fasta"):
-            self.fa_set.add(str(j.seq))
+            if self.fasta_dict.get(str(j.seq)):
+                self.fasta_dict[str(j.seq)].add(j.id)
+            else:
+                self.fasta_dict[str(j.seq)] = {j.id}
+
         self.blast_dict = Manager().dict()
 
     def get_blast_parameters(self, variable: str, default_value):
@@ -92,7 +105,7 @@ def get_blast_parameters(self, variable: str, default_value):
                 variable in self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition]:
             value_return = self.get_default_parameters()[self.CONFIG_KEY_BlastGetPosition][variable]
         return value_return
-
+        
     def _blast_canonical(self, df):
         seq_set = set(df["sequence"].to_list())
 
@@ -104,7 +117,7 @@ def _blast_canonical(self, df):
 
         auto.make_automaton()
 
-        for protein_seq in self.fa_set:
+        for protein_seq in self.fasta_dict.keys():
             for end_ind, found in auto.iter(protein_seq):
                 seq_dict[found] = "canonical"
                 print("Found", found, "at position", end_ind, "in protein sequence")
@@ -113,7 +126,7 @@ def _blast_canonical(self, df):
         return df
 
     def _result(self, sequence):
-        self.blast_dict[sequence] = _blast_set(self.fa_set, sequence)
+        self.blast_dict[sequence] = _blast_set(self.fasta_dict, sequence)
 
     def blast(self, input_psm_to_blast, output_psm):
         """
@@ -137,7 +150,7 @@ def blast(self, input_psm_to_blast, output_psm):
         else:
             raise ValueError("The input file format is not supported.")
 
-        psm = psm.head(4)
+
         psm = self._blast_canonical(psm)
 
         first_filter = psm[psm.position == "canonical"]
@@ -163,25 +176,24 @@ def blast(self, input_psm_to_blast, output_psm):
         psm_to_findpos = psm_to_findpos[psm_to_findpos.position != "non-canonical"]
 
         if len(psm_to_findpos) > 0:
-            psm_to_findpos["var_num"] = psm_to_findpos.apply(lambda x: len(x["position"]), axis=1)
-            psm_to_findpos = psm_to_findpos.loc[psm_to_findpos.index.repeat(psm_to_findpos["var_num"])]
-            psm_to_findpos["var_num"].iloc[0] = 0
-            psm_id = psm_to_findpos["usi"].iloc[0]
-            for i in range(1, psm_to_findpos.shape[0]):
-                if psm_to_findpos["usi"].iloc[i] == psm_id:
-                    psm_to_findpos["var_num"].iloc[i] = psm_to_findpos["var_num"].iloc[i - 1] + 1
-                else:
-                    psm_id = psm_to_findpos["usi"].iloc[i]
-                    psm_to_findpos["var_num"].iloc[i] = 0
-            psm_to_findpos["position"] = psm_to_findpos.apply(
-                lambda x: str(x["position"])[1:-1].split(",")[x["var_num"]],
-                axis=1)
-            psm_to_findpos.drop(columns="var_num", axis=1, inplace=True)
-            psm_to_findpos["position"] = psm_to_findpos.apply(lambda x: x["position"].replace(' ', ''), axis=1)
+            psm_to_findpos = psm_to_findpos.explode("position", ignore_index=True)
+            psm_to_findpos["variant"] = psm_to_findpos["position"].apply(lambda x : x[1]) 
+            psm_to_findpos["protein"] = psm_to_findpos["position"].apply(lambda x : x[2]) 
+            psm_to_findpos["position"] = psm_to_findpos["position"].apply(lambda x : x[0]) 
 
         all_psm_out = pd.concat([first_filter, second_filter, non_filter, psm_to_findpos], axis=0, join='outer')
         all_psm_out = all_psm_out.sort_values("usi")
-        all_psm_out.to_csv(output_psm, header=1, sep=",", index=None)
+
+        if output_psm.endswith(".csv.gz"):
+            all_psm_out.to_csv(output_psm, header=True, sep=",", index=None, compression="gzip")
+        elif output_psm.endswith(".csv"):
+            all_psm_out.to_csv(output_psm, header=True, sep=",", index=None)
+        elif output_psm.endswith(".tsv.gz"):
+            all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None, compression="gzip")
+        elif output_psm.endswith(".tsv"):
+            all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
+        else:
+            all_psm_out.to_csv(output_psm, header=True, sep="\t", index=None)
 
         end_time = datetime.datetime.now()
         print("End time :", end_time)