Skip to content

Commit

Permalink
Merge pull request #95 from daichengxin/master
Browse files Browse the repository at this point in the history
Fixed bugs
  • Loading branch information
ypriverol authored Jul 13, 2021
2 parents 390b56d + fae47cd commit 1e28afa
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 31 deletions.
66 changes: 37 additions & 29 deletions sdrf_pipelines/openms/openms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ class FileToColumnEntries:
file2diss = dict()
file2enzyme = dict()
file2source = dict()
# TODO not sure why this is needed
file2label = dict()
# TODO the following lines will be very difficult with labels. Usually a combination of file&label defines the factor
# I saw that you try to keep lists or combined strings here, but maybe hashing a tuple would be easier here.
Expand All @@ -33,8 +32,8 @@ def __init__(self) -> None:
self._unimod_database = UnimodDatabase()
self.tmt16plex = {'TMT126': 1, 'TMT127N': 2, 'TMT127C': 3, 'TMT128N': 4, 'TMT128C': 5,
'TMT129N': 6, 'TMT129C': 7, 'TMT130N': 8, 'TMT130C': 9, 'TMT131N': 10,
'TMT131C': 11, 'TMT132N': 12, 'TMT132C': 13, 'TMT133N': 14, 'TMT134N': 15,
'TMT134C': 16}
'TMT131C': 11, 'TMT132N': 12, 'TMT132C': 13, 'TMT133N': 14, 'TMT133C': 15,
'TMT134N': 16}
self.tmt11plex = {'TMT126': 1, 'TMT127N': 2, 'TMT127C': 3, 'TMT128N': 4, 'TMT128C': 5,
'TMT129N': 6, 'TMT129C': 7, 'TMT130N': 8, 'TMT130C': 9, 'TMT131N': 10,
'TMT131C': 11}
Expand All @@ -43,19 +42,18 @@ def __init__(self) -> None:
self.tmt6plex = {'TMT126': 1, 'TMT127': 2, 'TMT128': 3,
'TMT129': 4, 'TMT130': 5, 'TMT131': 6}
# Hardcode enzymes from OpenMS
self.enzymes = {"Glutamyl endopeptidase":"glutamyl endopeptidase",
"Trypsin/p":"Trypsin/P",
"Lys-c":"Lys-C","Lys-n":"Lys-N","Arg-c":"Arg-C","Arg-c/p":"Arg-C/P",
"Asp-n":"Asp-N","Asp-n/b":"Asp-N/B","Asp-n_ambic":"Asp-N_ambic",
"Chymotrypsin/p":"Chymotrypsin/P","Cnbr":"CNBr",
"V8-de":"V8-DE", "V8-e":"V8-E",
"Elastase-trypsin-chymotrypsin":"elastase-trypsin-chymotrypsin",
"Pepsina":"PepsinA",
"Unspecific cleavage":"unspecific cleavage", "No cleavage":"no cleavage"}
self.enzymes = {"Glutamyl endopeptidase": "glutamyl endopeptidase",
"Trypsin/p": "Trypsin/P",
"Lys-c": "Lys-C", "Lys-n": "Lys-N", "Arg-c": "Arg-C", "Arg-c/p": "Arg-C/P",
"Asp-n": "Asp-N", "Asp-n/b": "Asp-N/B", "Asp-n_ambic": "Asp-N_ambic",
"Chymotrypsin/p": "Chymotrypsin/P", "Cnbr": "CNBr",
"V8-de": "V8-DE", "V8-e": "V8-E",
"Elastase-trypsin-chymotrypsin": "elastase-trypsin-chymotrypsin",
"Pepsina": "PepsinA",
"Unspecific cleavage": "unspecific cleavage", "No cleavage": "no cleavage"}

# TODO What about iTRAQ?

# TODO How does this work? In OpenMS there are no such modifications. You can have different "isotope mods"
# for light, medium and heavy. E.g. Label:13C(2)15N(2) (K) as light or Dimethyl:2H(2)13C (K) as light
self.silac3 = {'silac light': 1, 'silac medium': 2, 'silac heavy': 3}
self.silac2 = {'silac light': 1, 'silac heavy': 2}
Expand Down Expand Up @@ -259,8 +257,7 @@ def openms_convert(self, sdrf_file: str = None, keep_raw: bool = False, one_tabl
enzyme = enzyme.capitalize()
# Check whether the enzyme is in the OpenMS enzyme map; if so, use the OpenMS spelling
if enzyme in self.enzymes:
enzyme = self.enzymes[enzyme]

enzyme = self.enzymes[enzyme]

f2c.file2enzyme[raw] = enzyme

Expand All @@ -273,7 +270,6 @@ def openms_convert(self, sdrf_file: str = None, keep_raw: bool = False, one_tabl
else:
f2c.file2fraction[raw] = "1"

## TODO try to avoid try catch here. Can't you just check the number of captured groups?
if re.search("NT=(.+?)(;|$)", row['comment[label]']) is not None:
label = re.search("NT=(.+?)(;|$)", row['comment[label]']).group(1)
f2c.file2label[raw] = [label]
Expand Down Expand Up @@ -390,6 +386,7 @@ def writeTwoTableExperimentalDesign(self, output_filename, sdrf, file2technical_
Fraction_group = {}
sample_id_map = {}
sample_id = 1
pre_frac_group = 1
for _0, row in sdrf.iterrows():
raw = row["comment[data file]"]
source_name = row["source name"]
Expand All @@ -401,12 +398,18 @@ def writeTwoTableExperimentalDesign(self, output_filename, sdrf, file2technical_
for i in range(source_name_index):
offset = offset + int(source_name2n_reps[source_name_list[i]])

fraction_group = str(offset + int(replicate))
fraction_group = offset + int(replicate)
if raw in Fraction_group.keys():
if fraction_group < Fraction_group[raw]:
Fraction_group[raw] = fraction_group
else:
Fraction_group[raw] = fraction_group

# make fraction group consecutive
if Fraction_group[raw] > pre_frac_group + 1:
Fraction_group[raw] = pre_frac_group + 1
pre_frac_group = Fraction_group[raw]

if re.search(sample_identifier_re, source_name) is not None:
sample = re.search(sample_identifier_re, source_name).group(1)
else:
Expand Down Expand Up @@ -449,7 +452,7 @@ def writeTwoTableExperimentalDesign(self, output_filename, sdrf, file2technical_
out = raw

f.write(
Fraction_group[raw] + "\t" + file2fraction[raw] + "\t" + out + "\t" + label + "\t" + str(sample) + "\n")
str(Fraction_group[raw]) + "\t" + file2fraction[raw] + "\t" + out + "\t" + label + "\t" + str(sample) + "\n")

# sample table
f.write("\n")
Expand Down Expand Up @@ -549,6 +552,7 @@ def writeOneTableExperimentalDesign(self, output_filename, legacy, sdrf, file2te
BioReplicate = []
sample_id_map = {}
sample_id = 1
pre_frac_group = 1
for _0, row in sdrf.iterrows():
raw = row["comment[data file]"]
source_name = row["source name"]
Expand All @@ -560,14 +564,19 @@ def writeOneTableExperimentalDesign(self, output_filename, legacy, sdrf, file2te
for i in range(source_name_index):
offset = offset + int(source_name2n_reps[source_name_list[i]])

fraction_group = str(offset + int(replicate))
fraction_group = offset + int(replicate)

if raw in Fraction_group.keys():
if fraction_group < Fraction_group[raw]:
Fraction_group[raw] = fraction_group
else:
Fraction_group[raw] = fraction_group

# make fraction group consecutive
if Fraction_group[raw] > pre_frac_group + 1:
Fraction_group[raw] = pre_frac_group + 1
pre_frac_group = Fraction_group[raw]

if re.search(sample_identifier_re, source_name) is not None:
sample = re.search(sample_identifier_re, source_name).group(1)

Expand All @@ -582,7 +591,6 @@ def writeOneTableExperimentalDesign(self, output_filename, legacy, sdrf, file2te

# Solve non-sample id expression models
if source_name in sample_id_map.keys():
# TODO why do you sometimes build the dicts based on filename and sometimes based on source??
sample = sample_id_map[source_name]
else:
sample_id_map[source_name] = sample_id
Expand Down Expand Up @@ -614,8 +622,7 @@ def writeOneTableExperimentalDesign(self, output_filename, legacy, sdrf, file2te
else:
choice = self.tmt6plex
label = str(choice[label[label_index[raw]]])
# TODO if at all, this only works if the labels for the same file are consecutive in the SDRF and always
# in the same order as specified in the initial labels dictionary. Very Dangerous!

# This can be avoided if the dicts are built with file & label as the key.
label_index[raw] = label_index[raw] + 1
elif 'SILAC' in ','.join(file2label[raw]):
Expand Down Expand Up @@ -644,20 +651,20 @@ def writeOneTableExperimentalDesign(self, output_filename, legacy, sdrf, file2te
mix_id = mixture_raw_tag[raw]

if legacy:
f.write(Fraction_group[raw] + "\t" + file2fraction[
f.write(str(Fraction_group[raw]) + "\t" + file2fraction[
raw] + "\t" + out + "\t" + label + "\t" + str(sample) + "\t" + condition
+ "\t" + MSstatsBioReplicate + "\t" + str(mix_id) + "\n")
else:
f.write(Fraction_group[raw] + "\t" + file2fraction[
f.write(str(Fraction_group[raw]) + "\t" + file2fraction[
raw] + "\t" + out + "\t" + label + "\t" + condition + "\t"
+ MSstatsBioReplicate + "\t" + str(mix_id) + "\n")
else:
if legacy:
f.write(Fraction_group[raw] + "\t" + file2fraction[
f.write(str(Fraction_group[raw]) + "\t" + file2fraction[
raw] + "\t" + out + "\t" + label + "\t" + str(sample) + "\t" + condition + "\t" +
MSstatsBioReplicate + "\n")
else:
f.write(Fraction_group[raw] + "\t" + file2fraction[
f.write(str(Fraction_group[raw]) + "\t" + file2fraction[
raw] + "\t" + out + "\t" + label + "\t" + condition + "\t" +
MSstatsBioReplicate + "\n")
f.close()
Expand Down Expand Up @@ -695,16 +702,17 @@ def save_search_settings_to_file(self, output_filename, sdrf, f2c):
if 'TMT' not in f2c.file2mods[raw][0] and 'TMT' not in f2c.file2mods[raw][1]:
tmt_fix_mod = TMT_mod[label]
if f2c.file2mods[raw][0]:
f2c.file2mods[raw] = (','.join(f2c.file2mods[raw][0].split(',').extend(tmt_fix_mod)),
f2c.file2mods[raw][1])
FixedMod = ','.join(f2c.file2mods[raw][0].split(',') + tmt_fix_mod)
f2c.file2mods[raw] = (FixedMod, f2c.file2mods[raw][1])
else:
f2c.file2mods[raw] = (','.join(tmt_fix_mod), f2c.file2mods[raw][1])
elif "label free sample" in labels:
label = "label free sample"
elif "silac" in labels:
label = "SILAC"
else:
pass # TODO For else
raise Exception("Failed to find any supported labels. Supported labels are 'silac', 'label free "
"sample', and tmt labels in the format 'TMT131C'")

f.write(
URI + "\t" + raw + "\t" + f2c.file2mods[raw][0] + "\t" + f2c.file2mods[raw][1] + "\t" + label + "\t" +
Expand Down
5 changes: 4 additions & 1 deletion sdrf_pipelines/parse_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ def openms_from_sdrf(ctx, sdrf: str, raw: bool, onetable: bool, legacy: bool, ve
conditionsfromcolumns: str):
if sdrf is None:
help()
OpenMS().openms_convert(sdrf, raw, onetable, legacy, verbose, conditionsfromcolumns)
try:
OpenMS().openms_convert(sdrf, raw, onetable, legacy, verbose, conditionsfromcolumns)
except Exception as e:
print("Error: " + str(e))


@click.command('convert-maxquant',
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="sdrf-pipelines",
version="0.0.16",
version="0.0.17",
author="BigBio Team",
author_email="[email protected]",
description="Translate, convert SDRF to configuration pipelines",
Expand Down

0 comments on commit 1e28afa

Please sign in to comment.