Skip to content

Commit

Permalink
fix ft peptide index bug
Browse files Browse the repository at this point in the history
  • Loading branch information
caleb-easterly committed Mar 13, 2019
1 parent 3733e57 commit 17d84c2
Show file tree
Hide file tree
Showing 6 changed files with 325 additions and 7 deletions.
101 changes: 101 additions & 0 deletions metaquantome/data/test/ft_func.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
peptide go
VGDDIAELDNR GO:0004828
VGDDIAELDNR GO:0004828
VGDDIAELDNR GO:0005524
VGDDIAELDNR GO:0005524
VGDDIAELDNR GO:0006434
VGDDIAELDNR GO:0006434
VGDDIAELDNR GO:0097056
VGDDIAELDNR GO:0097056
VGDDIAELDNR GO:0016260
VGDDIAELDNR GO:0016260
VGDDIAELDNR GO:0005737
VGDDIAELDNR GO:0005737
FEAYGWQVIR GO:0004802
FEAYGWQVIR GO:0004802
FEAYGWQVIR GO:0046872
FEAYGWQVIR GO:0046872
FEAYGWQVIR GO:0008152
FEAYGWQVIR GO:0008152
FPIVNANVVK GO:0000166
FPIVNANVVK GO:0000166
FPIVNANVVK GO:0008663
FPIVNANVVK GO:0008663
FPIVNANVVK GO:0046872
FPIVNANVVK GO:0046872
FPIVNANVVK GO:0008254
FPIVNANVVK GO:0008254
FPIVNANVVK GO:0009166
FPIVNANVVK GO:0009166
EAIEDSGLTEDQVSNDR GO:0008152
EAIEDSGLTEDQVSNDR GO:0008152
EAIEDSGLTEDQVSNDR GO:0016740
EAIEDSGLTEDQVSNDR GO:0016740
EAIEDSGLTEDQVSNDR GO:0004315
EAIEDSGLTEDQVSNDR GO:0004315
VETNGADGIK GO:0005351
VETNGADGIK GO:0005351
VETNGADGIK GO:0022877
VETNGADGIK GO:0022877
VETNGADGIK GO:0103111
VETNGADGIK GO:0103111
VETNGADGIK GO:0016021
VETNGADGIK GO:0016021
VETNGADGIK GO:0005886
VETNGADGIK GO:0005886
VETNGADGIK GO:0009401
VETNGADGIK GO:0009401
IDENKDYGK GO:0005524
IDENKDYGK GO:0005524
IDENKDYGK GO:0003677
IDENKDYGK GO:0003677
IDENKDYGK GO:0003899
IDENKDYGK GO:0003899
IDENKDYGK GO:0046983
IDENKDYGK GO:0046983
IDENKDYGK GO:0006351
IDENKDYGK GO:0006351
IDENKDYGK GO:0016021
IDENKDYGK GO:0016021
IDENKDYGK GO:0005737
IDENKDYGK GO:0005737
HTSQMFITGPAVIK GO:0016874
HTSQMFITGPAVIK GO:0016874
HTSQMFITGPAVIK GO:0016740
HTSQMFITGPAVIK GO:0016740
HTSQMFITGPAVIK GO:0004492
HTSQMFITGPAVIK GO:0004492
TEPTNPYGESK GO:0050662
TEPTNPYGESK GO:0050662
TEPTNPYGESK GO:0003978
TEPTNPYGESK GO:0003978
TEPTNPYGESK GO:0006012
TEPTNPYGESK GO:0006012
SQAIPAEADK GO:0016021
SQAIPAEADK GO:0016021
SQAIPAEADK GO:0009276
SQAIPAEADK GO:0009276
SQAIPAEADK GO:0005886
SQAIPAEADK GO:0005886
SQAIPAEADK GO:0016655
SQAIPAEADK GO:0016655
SQAIPAEADK GO:0010181
SQAIPAEADK GO:0010181
SQAIPAEADK GO:0006814
SQAIPAEADK GO:0006814
YSEEVENKK GO:0016740
YSEEVENKK GO:0016740
GDVENGTTVSDFDKEEIR GO:0003746
GDVENGTTVSDFDKEEIR GO:0003746
GDVENGTTVSDFDKEEIR GO:0003924
GDVENGTTVSDFDKEEIR GO:0003924
GDVENGTTVSDFDKEEIR GO:0005525
GDVENGTTVSDFDKEEIR GO:0005525
GDVENGTTVSDFDKEEIR GO:0005622
GDVENGTTVSDFDKEEIR GO:0005622
VYDEQILEEER GO:0016021
VYDEQILEEER GO:0016021
VYDEQILEEER GO:0005886
VYDEQILEEER GO:0005886
VYDEQILEEER GO:0022869
VYDEQILEEER GO:0022869
101 changes: 101 additions & 0 deletions metaquantome/data/test/ft_int.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
Sequence int
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
VGDDIAELDNR 128828.704101563
FEAYGWQVIR 127300.75
FEAYGWQVIR 127300.75
FEAYGWQVIR 127300.75
FEAYGWQVIR 127300.75
FEAYGWQVIR 127300.75
FEAYGWQVIR 127300.75
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
FPIVNANVVK 165529.185546875
EAIEDSGLTEDQVSNDR 789450.40625
EAIEDSGLTEDQVSNDR 789450.40625
EAIEDSGLTEDQVSNDR 789450.40625
EAIEDSGLTEDQVSNDR 789450.40625
EAIEDSGLTEDQVSNDR 789450.40625
EAIEDSGLTEDQVSNDR 789450.40625
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
VETNGADGIK 1825392.1875
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
IDENKDYGK 664577.94140625
HTSQMFITGPAVIK 508794.919921875
HTSQMFITGPAVIK 508794.919921875
HTSQMFITGPAVIK 508794.919921875
HTSQMFITGPAVIK 508794.919921875
HTSQMFITGPAVIK 508794.919921875
HTSQMFITGPAVIK 508794.919921875
TEPTNPYGESK 630307.7890625
TEPTNPYGESK 630307.7890625
TEPTNPYGESK 630307.7890625
TEPTNPYGESK 630307.7890625
TEPTNPYGESK 630307.7890625
TEPTNPYGESK 630307.7890625
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
SQAIPAEADK 2338804.921875
YSEEVENKK 6907897.3125
YSEEVENKK 6907897.3125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
GDVENGTTVSDFDKEEIR 176286.673828125
VYDEQILEEER 141822.306640625
VYDEQILEEER 141822.306640625
VYDEQILEEER 141822.306640625
VYDEQILEEER 141822.306640625
VYDEQILEEER 141822.306640625
VYDEQILEEER 141822.306640625
101 changes: 101 additions & 0 deletions metaquantome/data/test/ft_tax.tab
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
peptide lca
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
VGDDIAELDNR 29465
FEAYGWQVIR 2
FEAYGWQVIR 2
FEAYGWQVIR 2
FEAYGWQVIR 2
FEAYGWQVIR 2
FEAYGWQVIR 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
FPIVNANVVK 2
EAIEDSGLTEDQVSNDR 712
EAIEDSGLTEDQVSNDR 712
EAIEDSGLTEDQVSNDR 712
EAIEDSGLTEDQVSNDR 712
EAIEDSGLTEDQVSNDR 712
EAIEDSGLTEDQVSNDR 712
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
VETNGADGIK 848
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
IDENKDYGK 1
HTSQMFITGPAVIK 39778
HTSQMFITGPAVIK 39778
HTSQMFITGPAVIK 39778
HTSQMFITGPAVIK 39778
HTSQMFITGPAVIK 39778
HTSQMFITGPAVIK 39778
TEPTNPYGESK 203492
TEPTNPYGESK 203492
TEPTNPYGESK 203492
TEPTNPYGESK 203492
TEPTNPYGESK 203492
TEPTNPYGESK 203492
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
SQAIPAEADK 729
YSEEVENKK 848
YSEEVENKK 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
GDVENGTTVSDFDKEEIR 848
VYDEQILEEER 2
VYDEQILEEER 2
VYDEQILEEER 2
VYDEQILEEER 2
VYDEQILEEER 2
VYDEQILEEER 2
17 changes: 10 additions & 7 deletions metaquantome/modules/function_taxonomy_interaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,13 @@ def function_taxonomy_analysis(df, func_colname, pep_colname, ontology, slim_dow
godb, norm_df = fa.clean_function_df(data_dir, df, func_colname, ontology, slim_down)
if slim_down:
norm_df = fa.slim_down_df(godb, norm_df, func_colname)
# remove peptide/go-term duplicates
# remove peptide/go-term duplicates (in the case that different GO term annotations
# for the same peptide are mapped to the same slim GO term)
# index is named peptide
dedup_df = norm_df.\
reset_index().\
drop_duplicates(subset=[pep_colname, func_colname], keep='first').\
set_index(pep_colname)
drop_duplicates(subset=['peptide', func_colname], keep='first').\
set_index('peptide')
# ---- get rank of lca ----- #
# resolve data dir
if not data_dir:
Expand All @@ -63,7 +65,7 @@ def function_taxonomy_analysis(df, func_colname, pep_colname, ontology, slim_dow
dedup_df = dedup_df.loc[is_not_nan & is_in_db]

dedup_df['des_rank'] = dedup_df[tax_colname].apply(lambda x: des_rank_mapper(ft_tar_rank, x, ncbi))
# filter out peptides that are less specific than query rank
# filter out peptides that are less specific than query rank (which have a taxid of 0)
dedup_df = dedup_df[dedup_df['des_rank'] > 0]

# ---- group by go and new des_rank column, then sum intensity ---- #
Expand All @@ -75,7 +77,8 @@ def function_taxonomy_analysis(df, func_colname, pep_colname, ontology, slim_dow
# group by both the function term and des_rank, then sum intensities
grouped = df_int.groupby(by=[func_colname, 'des_rank']).sum(axis=1)
# get groupwise counts (i.e., unique peptides)
counts = df_counts.groupby(by=[func_colname, 'des_rank']).sum(axis=1)
# multiply by 1 to convert any single booleans (True) to 1
counts = df_counts.groupby(by=[func_colname, 'des_rank']).sum(axis=1) * 1
ints_and_counts = grouped.join(counts, rsuffix='_n_peptide')

# ---- output prep ---- #
Expand All @@ -97,6 +100,8 @@ def function_taxonomy_analysis(df, func_colname, pep_colname, ontology, slim_dow
results['tax_id'] = taxids
results['rank'] = [ncbi.get_rank(int(elem)) for elem in taxids]
results['taxon_name'] = ncbi.convert_taxid_to_name(taxids)
# drop des_rank column
results.drop('des_rank', axis=1, inplace=True)
return results


Expand All @@ -114,5 +119,3 @@ def des_rank_mapper(des_rank, taxid, ncbi):
return dict_mapper[des_rank]
else:
return 0


1 change: 1 addition & 0 deletions metaquantome/util/expand_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def read_and_join_files(mode, pep_colname_int, pep_colname_func, pep_colname_tax
dfs.append(func)
# join all
dfs_joined = join_on_peptide(dfs)
dfs_joined.index.name = 'peptide'
return dfs_joined


Expand Down
11 changes: 11 additions & 0 deletions tests/travis/testExpand.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,5 +179,16 @@ def testParentIntensityHigher(self):
self.assertGreaterEqual(ints[1301], ints[1305])


class TestFunctionTaxonomyAnalysis(unittest.TestCase):
    """Tests for the function-taxonomy ('ft') mode of expand."""

    def testDifferentNames(self):
        """Run 'ft' mode when the peptide column name differs between input files.

        The intensity file is read with the peptide column 'Sequence', while the
        function and taxonomy files are read with the peptide column 'peptide'.
        The test passes if the result contains an 'A_mean' entry.
        """
        tax_file = testfile('ft_tax.tab')
        func_file = testfile('ft_func.tab')
        # Named int_file (not `int`) so the builtin `int` is not shadowed.
        int_file = testfile('ft_int.tab')
        ft = expand.expand('ft', sinfo='{"A": ["int"]}', int_file=int_file, pep_colname_int='Sequence',
                           pep_colname_func='peptide', pep_colname_tax='peptide', data_dir=TEST_DIR,
                           tax_file=tax_file, tax_colname='lca', func_file=func_file, func_colname="go")
        # list(ft) yields the names exposed by iterating the result
        # (presumably DataFrame column labels -- confirm against expand.expand).
        self.assertIn("A_mean", list(ft))


# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 17d84c2

Please sign in to comment.