From 0d5ac1b1718cf6087c7598e497b78eacc92962eb Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Sun, 16 Jun 2024 17:11:28 -0400 Subject: [PATCH 1/6] Improve SCOMP handling --- indra/sources/bel/processor.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/indra/sources/bel/processor.py b/indra/sources/bel/processor.py index 49eed505eb..c136769ca8 100644 --- a/indra/sources/bel/processor.py +++ b/indra/sources/bel/processor.py @@ -546,28 +546,17 @@ def get_db_refs_by_name(ns, name, node_data): if up_id: db_refs = {'UP': up_id} # Map Selventa families and complexes to FamPlex - elif ns == 'SFAM': + elif ns in {'SFAM', 'SCOMP'}: try: - sfam_id, xrefs = selventa_lookup[('SFAM', name)] - db_refs = {"SFAM": sfam_id} + selventa_id, xrefs = selventa_lookup[(ns, name)] + db_refs = {ns: selventa_id} indra_name = bel_to_indra.get(name) except KeyError: indra_name = None db_refs = None - - if indra_name is None: - logger.info('Could not find mapping for BEL/SFAM family: ' - '%s (%s)' % (name, node_data)) - else: - db_refs['FPLX'] = indra_name - name = indra_name - elif ns == 'SCOMP': - scomp_id, xrefs = selventa_lookup[('SCOMP', name)] - db_refs = {'SCOMP': scomp_id} - indra_name = bel_to_indra.get(name) if indra_name is None: - logger.info('Could not find mapping for BEL/SCOMP complex: ' - '%s (%s)' % (name, node_data)) + logger.info('Could not find mapping for BEL/%s family: ' + '%s (%s)' % (ns, name, node_data)) else: db_refs['FPLX'] = indra_name name = indra_name From 601791b3eb92326072cada35ea3079d8e9ac9517 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 18 Jun 2024 16:37:34 -0400 Subject: [PATCH 2/6] Allow choosing preferred mappings --- indra/databases/hgnc_client.py | 13 ++++++++++++- indra/ontology/bio/ontology.py | 12 ++++++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/indra/databases/hgnc_client.py b/indra/databases/hgnc_client.py index 64ac8f0bc4..daae2b29f4 100644 --- a/indra/databases/hgnc_client.py +++ b/indra/databases/hgnc_client.py @@ -422,6 +422,10 @@ def get_hgnc_name_from_mgi_name(mgi_name: str) -> Union[str, None]: def _read_hgnc_maps(): hgnc_file = get_resource_path("hgnc_entries.tsv") csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8') + hgnc_uniprot_preferred = get_resource_path("hgnc_uniprot_preferred.csv") + csv_rows_uniprot_preferred = \ + read_unicode_csv(hgnc_uniprot_preferred, delimiter=',', + encoding='utf-8') hgnc_names = {} hgnc_ids = {} hgnc_withdrawn = [] @@ -515,11 +519,18 @@ def _read_hgnc_maps(): for old_id, new_id in hgnc_withdrawn_new_ids.items(): hgnc_names[old_id] = hgnc_names[new_id] + uniprot_ids_preferred = {} + for row in csv_rows_uniprot_preferred: + hgnc_id = row[0] + uniprot_id = row[1] + uniprot_ids_preferred[hgnc_id] = uniprot_id + return ( hgnc_names, hgnc_ids, hgnc_withdrawn, uniprot_ids, entrez_ids, entrez_ids_reverse, mouse_map, rat_map, prev_sym_map, ensembl_ids, ensembl_ids_reverse, gene_types, dict(hgnc_to_enzymes), dict(enzyme_to_hgncs), + uniprot_ids_preferred ) @@ -527,7 +538,7 @@ def _read_hgnc_maps(): hgnc_names, hgnc_ids, hgnc_withdrawn, uniprot_ids, entrez_ids, entrez_ids_reverse, mouse_map, rat_map, prev_sym_map, ensembl_ids, ensembl_ids_reverse, gene_type, - hgnc_to_enzymes, enzyme_to_hgncs, + hgnc_to_enzymes, enzyme_to_hgncs, uniprot_ids_preferred ) = _read_hgnc_maps() diff --git a/indra/ontology/bio/ontology.py b/indra/ontology/bio/ontology.py index e053bf6b09..b775eca5d9 100644 --- a/indra/ontology/bio/ontology.py +++ b/indra/ontology/bio/ontology.py @@ -26,7 +26,7 @@ class BioOntology(IndraOntology): # should be incremented to "force" rebuilding the ontology to be consistent # with the underlying resource files. name = 'bio' - version = '1.33' + version = '1.34' ontology_namespaces = [ 'go', 'efo', 'hp', 'doid', 'chebi', 'ido', 'mondo', 'eccode', ] @@ -147,11 +147,15 @@ def add_hgnc_uniprot_entrez_xrefs(self): from indra.databases import hgnc_client from indra.databases import uniprot_client edges = [] - for hid, uid in hgnc_client.uniprot_ids.items(): - uids = uid.split(', ') + for hid, upid in hgnc_client.uniprot_ids.items(): + uids = upid.split(', ') + preferred = hgnc_client.uniprot_ids_preferred.get(hid) + if preferred: + uids = [preferred] for uid in uids: + edge_data = {'type': 'xref', 'source': 'hgnc'} edges.append((self.label('HGNC', hid), self.label('UP', uid), - {'type': 'xref', 'source': 'hgnc'})) + edge_data)) self.add_edges_from(edges) edges = [(self.label('UP', uid), self.label('HGNC', hid), From 97e734dbf09915488d6a9941be63422723ceec65 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 18 Jun 2024 19:43:05 -0400 Subject: [PATCH 3/6] Limit to numpy < 2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 14183ece7c..d193d126dd 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def main(): 'requests>=2.11', 'lxml', 'ipython', 'future', 'networkx>=3', 'pandas>=2', 'ndex2==2.0.1', 'jinja2', 'protmapper>=0.0.29', 'obonet', - 'tqdm', 'pybiopax>=0.0.5'] + 'tqdm', 'pybiopax>=0.0.5', 'numpy<2'] extras_require = { # Inputs and outputs From 14822a1a85d0ee264927891548e7da5b1f998718 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 18 Jun 2024 19:43:44 -0400 Subject: [PATCH 4/6] Add HGNC-UniProt preferred mappings file --- indra/resources/hgnc_uniprot_preferred.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 indra/resources/hgnc_uniprot_preferred.csv diff --git a/indra/resources/hgnc_uniprot_preferred.csv b/indra/resources/hgnc_uniprot_preferred.csv new file mode 100644 index 0000000000..489c4ef4a2 --- /dev/null +++ b/indra/resources/hgnc_uniprot_preferred.csv @@ -0,0 +1,3 @@ +hgnc_id,uniprot_id +17868,Q9BXH1 +30377,Q14160 \ No newline at end of file From 4b121c503afcef068c2df16dfa4f2ac117e5f55f Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 18 Jun 2024 19:48:11 -0400 Subject: [PATCH 5/6] Replace uses of longfloat with longdouble --- indra/assemblers/indranet/net.py | 14 +++++++------- indra/tests/test_indranet_assembler.py | 8 ++++---- indra/tests/test_pathfinding.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/indra/assemblers/indranet/net.py b/indra/assemblers/indranet/net.py index ae91c76cfe..3f34659494 100644 --- a/indra/assemblers/indranet/net.py +++ b/indra/assemblers/indranet/net.py @@ -327,8 +327,8 @@ def _simple_scorer_update(G, edge): # Catch underflow except FloatingPointError as err: # Numpy precision - NP_PRECISION = 10 ** -np.finfo(np.longfloat).precision - logger.warning('%s: Resetting ag_belief to 10*np.longfloat precision ' + NP_PRECISION = 10 ** -np.finfo(np.longdouble).precision + logger.warning('%s: Resetting ag_belief to 10*np.longdouble precision ' '(%.0e)' % (err, Decimal(NP_PRECISION * 10))) ag_belief = NP_PRECISION * 10 return ag_belief @@ -337,14 +337,14 @@ def _simple_scorer_update(G, edge): def _complementary_belief(G, edge): # Aggregate belief score: 1-prod(1-belief_i) np.seterr(all='raise') - NP_PRECISION = 10 ** -np.finfo(np.longfloat).precision # Numpy precision + NP_PRECISION = 10 ** -np.finfo(np.longdouble).precision # Numpy precision belief_list = [s['belief'] for s in G.edges[edge]['statements']] try: - ag_belief = np.longfloat(1.0) - np.prod(np.fromiter( - map(lambda belief: np.longfloat(1.0) - belief, belief_list), - dtype=np.longfloat)) + ag_belief = np.longdouble(1.0) - np.prod(np.fromiter( + map(lambda belief: np.longdouble(1.0) - belief, belief_list), + dtype=np.longdouble)) except FloatingPointError as err: - logger.warning('%s: Resetting ag_belief to 10*np.longfloat precision ' + logger.warning('%s: Resetting ag_belief to 10*np.longdouble precision ' '(%.0e)' % (err, Decimal(NP_PRECISION * 10))) ag_belief = NP_PRECISION * 10 return ag_belief diff --git a/indra/tests/test_indranet_assembler.py b/indra/tests/test_indranet_assembler.py index 05429614a2..ffb57b4328 100644 --- a/indra/tests/test_indranet_assembler.py +++ b/indra/tests/test_indranet_assembler.py @@ -175,10 +175,10 @@ def test_to_digraph(): 'Activation', 'Phosphorylation', 'Inhibition', 'IncreaseAmount'} assert all(digraph.edges[e].get('belief', False) for e in digraph.edges) assert all(isinstance(digraph.edges[e]['belief'], - (float, np.longfloat)) for e in digraph.edges) + (float, np.longdouble)) for e in digraph.edges) assert all(digraph.edges[e].get('weight', False) for e in digraph.edges) assert all(isinstance(digraph.edges[e]['weight'], - (float, np.longfloat)) for e in digraph.edges) + (float, np.longdouble)) for e in digraph.edges) digraph_from_df = IndraNet.digraph_from_df(df) assert nx.is_isomorphic(digraph, digraph_from_df) @@ -206,11 +206,11 @@ def test_to_signed_graph(): assert all(signed_graph.edges[e].get('belief', False) for e in signed_graph.edges) assert all(isinstance(signed_graph.edges[e]['belief'], - (float, np.longfloat)) for e in signed_graph.edges) + (float, np.longdouble)) for e in signed_graph.edges) assert all(signed_graph.edges[e].get('weight', False) for e in signed_graph.edges) assert all(isinstance(signed_graph.edges[e]['weight'], - (float, np.longfloat)) for e in signed_graph.edges) + (float, np.longdouble)) for e in signed_graph.edges) def _weight_mapping(G): diff --git a/indra/tests/test_pathfinding.py b/indra/tests/test_pathfinding.py index 0cdde959fc..621cd1424a 100644 --- a/indra/tests/test_pathfinding.py +++ b/indra/tests/test_pathfinding.py @@ -71,7 +71,7 @@ def _setup_unsigned_graph(): # Add belief for e in dg.edges: dg.edges[e]['belief'] = edge_beliefs[e] - dg.edges[e]['weight'] = -np.log(edge_beliefs[e], dtype=np.longfloat) + dg.edges[e]['weight'] = -np.log(edge_beliefs[e], dtype=np.longdouble) # Add edge_by_hash dg.graph['hashes'] = hashes From d36b07960ff19864c96f34a04024360bc87a28b8 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 18 Jun 2024 19:51:46 -0400 Subject: [PATCH 6/6] Remove numpy version constraint --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d193d126dd..14183ece7c 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def main(): 'requests>=2.11', 'lxml', 'ipython', 'future', 'networkx>=3', 'pandas>=2', 'ndex2==2.0.1', 'jinja2', 'protmapper>=0.0.29', 'obonet', - 'tqdm', 'pybiopax>=0.0.5', 'numpy<2'] + 'tqdm', 'pybiopax>=0.0.5'] extras_require = { # Inputs and outputs