Skip to content

Commit

Permalink
Merge pull request #117 from cthoyt/move-dump-terms
Browse files (browse the repository at this point in the history)
Move dump_terms()
  • Loading branch information
bgyori authored Jun 28, 2023
2 parents 95938a2 + b14de7d commit 76daa5b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 14 deletions.
14 changes: 1 addition & 13 deletions gilda/generate_terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@
import re
import os
import csv
import gzip
import json
import logging
import requests
import indra
from indra.databases import hgnc_client, uniprot_client, chebi_client, \
go_client, mesh_client, doid_client
from indra.statements.resources import amino_acids
from .term import Term, filter_out_duplicates
from .term import Term, dump_terms, filter_out_duplicates
from .process import normalize
from .resources import resource_dir, popular_organisms

Expand Down Expand Up @@ -704,17 +703,6 @@ def get_all_terms():
return terms


def dump_terms(terms, fname):
    """Dump a list of terms to a tsv.gz file.

    Parameters
    ----------
    terms :
        Iterable of objects exposing a ``to_list()`` method; each call
        supplies the cells of one output row.
    fname :
        Path of the gzip-compressed, tab-separated file to write.
    """
    # Pass the file name as a logging argument so that the message is only
    # formatted when the record is actually emitted.
    logger.info('Dumping into %s', fname)
    header = ['norm_text', 'text', 'db', 'id', 'entry_name', 'status',
              'source', 'organism', 'source_db', 'source_id']
    # newline='' leaves line endings entirely to the csv writer, as the csv
    # module documentation requires; without it, platforms whose line
    # separator is \r\n would turn the writer's \r\n into \r\r\n.
    with gzip.open(fname, 'wt', encoding='utf-8', newline='') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(header)
        writer.writerows(t.to_list() for t in terms)


def main():
from .resources import GROUNDING_TERMS_PATH as fname
terms = get_all_terms()
Expand Down
18 changes: 17 additions & 1 deletion gilda/term.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import csv
import gzip
import itertools
import logging
from typing import Optional, Set, Tuple
from typing import Iterable, Optional, Set, Tuple

__all__ = [
"Term",
"get_identifiers_curie",
"get_identifiers_url",
"filter_out_duplicates",
"dump_terms",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -179,3 +182,16 @@ def filter_out_duplicates(terms):
new_terms = sorted(new_terms, key=lambda x: (x.text, x.db, x.id))
logger.info('Got %d unique terms...' % len(new_terms))
return new_terms


# Column names of the header row that dump_terms() writes, in output order.
TERMS_HEADER = [
    'norm_text',
    'text',
    'db',
    'id',
    'entry_name',
    'status',
    'source',
    'organism',
    'source_db',
    'source_id',
]


def dump_terms(terms: Iterable[Term], fname) -> None:
    """Dump a list of terms to a tsv.gz file.

    Parameters
    ----------
    terms :
        The terms to write; each one contributes a single row, obtained
        from its ``to_list()`` method.
    fname :
        Path of the gzip-compressed, tab-separated file to write. The
        first row written is ``TERMS_HEADER``.
    """
    logger.info('Dumping into %s', fname)
    # newline='' leaves line endings entirely to the csv writer, as the csv
    # module documentation requires; without it, platforms whose line
    # separator is \r\n would turn the writer's \r\n into \r\r\n.
    with gzip.open(fname, 'wt', encoding='utf-8', newline='') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(TERMS_HEADER)
        writer.writerows(t.to_list() for t in terms)

0 comments on commit 76daa5b

Please sign in to comment.