diff --git a/README.md b/README.md index 064144b603..fe940eacc9 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Reading systems: | Sparser | [`indra.sources.sparser`](https://indra.readthedocs.io/en/latest/modules/sources/sparser/index.html#) | https://github.com/ddmcdonald/sparser | | Eidos | [`indra.sources.eidos`](https://indra.readthedocs.io/en/latest/modules/sources/eidos/index.html#) | https://github.com/clulab/eidos | | TEES | [`indra.sources.tees`](https://indra.readthedocs.io/en/latest/modules/sources/tees/index.html) | https://github.com/jbjorne/TEES | +| EVEX | [`indra.sources.evex`](https://indra.readthedocs.io/en/latest/modules/sources/evex/index.html) | http://evexdb.org/ | | MedScan | [`indra.sources.medscan`](https://indra.readthedocs.io/en/latest/modules/sources/medscan/index.html) | https://doi.org/10.1093/bioinformatics/btg207 | | RLIMS-P | [`indra.sources.rlimsp`](https://indra.readthedocs.io/en/latest/modules/sources/rlimsp/index.html) | https://research.bioinformatics.udel.edu/rlimsp | | ISI/AMR | [`indra.sources.isi`](https://indra.readthedocs.io/en/latest/modules/sources/isi/index.html) | https://github.com/sgarg87/big_mech_isi_gg | diff --git a/doc/modules/sources/evex.rst b/doc/modules/sources/evex.rst new file mode 100644 index 0000000000..4e7ccd51f5 --- /dev/null +++ b/doc/modules/sources/evex.rst @@ -0,0 +1,17 @@ +EVEX (:py:mod:`indra.sources.evex`) +=================================== + +.. automodule:: indra.sources.evex + :members: + +EVEX API (:py:mod:`indra.sources.evex.api`) +------------------------------------------- + +.. automodule:: indra.sources.evex.api + :members: + +EVEX Processor (:py:mod:`indra.sources.evex.processor`) +------------------------------------------------------- + +.. automodule:: indra.sources.evex.processor + :members: diff --git a/doc/modules/sources/index.rst b/doc/modules/sources/index.rst index 186becb0be..fdb7b9dbc1 100644 --- a/doc/modules/sources/index.rst +++ b/doc/modules/sources/index.rst @@ -16,6 +16,7 @@ Reading Systems sparser/index medscan/index tees/index + evex isi/index geneways/index rlimsp/index diff --git a/indra/resources/default_belief_probs.json b/indra/resources/default_belief_probs.json index bc498bc634..27a42901fc 100644 --- a/indra/resources/default_belief_probs.json +++ b/indra/resources/default_belief_probs.json @@ -34,7 +34,8 @@ "creeds": 0.01, "ubibrowser": 0.01, "acsn": 0.01, - "semrep": 0.05 + "semrep": 0.05, + "evex": 0.05 }, "rand": { "eidos": 0.3, @@ -71,6 +72,7 @@ "creeds": 0.1, "ubibrowser": 0.1, "acsn": 0.1, - "semrep": 0.3 + "semrep": 0.3, + "evex": 0.3 } } diff --git a/indra/resources/source_info.json b/indra/resources/source_info.json index 1935932b27..40bc7e4b24 100644 --- a/indra/resources/source_info.json +++ b/indra/resources/source_info.json @@ -299,6 +299,16 @@ "background-color": "#6600cc" } }, + "evex": { + "name": "EVEX", + "link": "http://evexdb.org/", + "type": "reader", + "domain": "biology", + "default_style": { + "color": "white", + "background-color": "#295c8d" + } + }, "creeds": { "name": "CREEDS", "link": "https://maayanlab.cloud/CREEDS/", diff --git a/indra/sources/evex/__init__.py b/indra/sources/evex/__init__.py new file mode 100644 index 0000000000..53a72ba706 --- /dev/null +++ b/indra/sources/evex/__init__.py @@ -0,0 +1,2 @@ +from .api import download_evex, process_human_events +from .processor import EvexProcessor, EvexStandoff \ No newline at end of file diff --git a/indra/sources/evex/api.py b/indra/sources/evex/api.py new file mode 100644 index 
0000000000..c9a4b54483 --- /dev/null +++ b/indra/sources/evex/api.py @@ -0,0 +1,144 @@ +import os +import glob +import logging +import pickle +import tarfile +from urllib.request import urlretrieve +import requests +import pandas +import tqdm + +from .processor import EvexProcessor + +logger = logging.getLogger(__name__) + +human_network = 'http://evexdb.org/download/network-format/Metazoa/' \ 'Homo_sapiens.tar.gz' +standoff_root = 'http://evexdb.org/download/standoff-annotation/version-0.1/' + + +def process_human_events(base_folder=None): + """Process all human events available in EVEX. + + Note that unless the standoff files have already been downloaded using the + `download_evex` function, the Statements produced by this function + will not carry evidence text, agent text, or various other metadata + for which the standoff files are required. + + Parameters + ---------- + base_folder : Optional[str] + If provided, the given base folder is used to download the human + network file from EVEX. Otherwise, the `pystow` package is used + to create an `evex` folder within the pystow base path, + typically ~/.data/evex. + + Returns + ------- + EvexProcessor + An EvexProcessor instance with the extracted INDRA Statements + as its statements attribute. + """ + if not base_folder: + import pystow + base_folder = pystow.join('evex').as_posix() + standoff_index = build_standoff_index() + network_file = os.path.join(base_folder, 'Homo_sapiens.tar.gz') + if not os.path.exists(network_file): + urlretrieve(human_network, network_file) + with tarfile.open(network_file, 'r:gz') as fh: + relations_file = fh.extractfile('EVEX_relations_9606.tab') + articles_file = fh.extractfile('EVEX_articles_9606.tab') + relations_df = pandas.read_csv(relations_file, sep='\t') + articles_df = pandas.read_csv(articles_file, sep='\t') + ep = EvexProcessor(relations_df, articles_df, standoff_index) + ep.process_statements() + return ep + + +def build_standoff_index(cached=True, base_folder=None): + """Build an index of publications in standoff bulk archive files. + + This index is necessary to figure out which standoff archive the annotations + for a given article are in. + + Parameters + ---------- + cached : Optional[bool] + If True, the standoff index is cached in the base folder and is + reloaded rather than regenerated when this function is called again. + This is useful since generating the full standoff file index + can take a long time. Default: True + base_folder : Optional[str] + If provided, the given base folder is used to find the downloaded + standoff archives and to store the index cache. Otherwise, the + `pystow` package is used to create an `evex` folder within the + pystow base path, typically ~/.data/evex. + + Returns + ------- + dict + A dict mapping (source, article ID) tuples to the path of the + standoff archive containing annotations for the given article.
+ """ + if not base_folder: + import pystow + base_folder = pystow.join('evex').as_posix() + cache_file = os.path.join(base_folder, 'standoff_index.pkl') + if cached and os.path.exists(cache_file): + logger.info('Loading standoff index from %s' % cache_file) + with open(cache_file, 'rb') as fh: + return pickle.load(fh) + index = {} + for fname in tqdm.tqdm(glob.glob(os.path.join(base_folder, 'batch*')), + desc='Building standoff index'): + try: + with tarfile.open(fname, 'r:gz') as fh: + names = fh.getnames() + except tarfile.ReadError: + logger.error('Could not read tarfile %s' % fname) + continue + ids = {tuple(os.path.splitext(name)[0].split('_')[:2]) + for name in names if name.endswith('ann')} + for paper_id in ids: + index[paper_id] = fname + if cached: + with open(cache_file, 'wb') as fh: + pickle.dump(index, fh) + return index + + +def download_evex(base_folder=None): + """Download EVEX human network and standoff output files. + + This function downloads the human network file as well as a large number + of standoff output files. These files are necessary to find evidence text, + agent text and agent coordinates to be used in INDRA. Note that there + are over 4 thousand such files, and the overall size is around 6 GB. + + Parameters + ---------- + base_folder : Optional[str] + If provided, the given base folder is used to download the human + network file from EVEX. Otherwise, the `pystow` package is used + to create an `evex` folder within the pystow base path, + typically ~/.data/evex. + """ + from bs4 import BeautifulSoup + if not base_folder: + import pystow + base_folder = pystow.join('evex').as_posix() + # Download human network first + fname = os.path.join(base_folder, 'Homo_sapiens.tar.gz') + if not os.path.exists(fname): + urlretrieve(human_network, fname) + # Now download all the standoff files + res = requests.get(standoff_root) + soup = BeautifulSoup(res.text, 'html.parser') + children = [standoff_root + node.get('href') + for node in soup.find_all('a') + if node.get('href').startswith('files')] + for child in tqdm.tqdm(children): + res = requests.get(child) + soup = BeautifulSoup(res.text, 'html.parser') + downloadables = [child + node.get('href') + for node in soup.find_all('a') + if node.get('href').startswith('batch')] + for downloadable in downloadables: + fname = os.path.join(base_folder, downloadable.split('/')[-1]) + if not os.path.exists(fname): + urlretrieve(downloadable, fname) diff --git a/indra/sources/evex/processor.py b/indra/sources/evex/processor.py new file mode 100644 index 0000000000..62b1ae6640 --- /dev/null +++ b/indra/sources/evex/processor.py @@ -0,0 +1,690 @@ +import copy +import csv +from collections import defaultdict +from dataclasses import dataclass, field +from io import TextIOWrapper +import itertools +import logging +import tarfile +from typing import Any, Dict +import networkx +import tqdm +from indra.ontology.standardize import get_standard_agent +from indra.statements.validate import validate_statement +from indra.statements import * + +logger = logging.getLogger(__name__) + + +class EvexProcessor: + """A processor to extract INDRA Statements from EVEX relations.""" + def __init__(self, relations_table, articles_table, standoff_index): + self.relations_table = relations_table + self.articles_table = articles_table + # Build an index of + self.article_lookup = self.build_article_lookup() + self.standoff_index = standoff_index + self.statements = [] + self.standoff_cache = {} + + def process_statements(self): + """Process rows of the 
diff --git a/indra/sources/evex/processor.py b/indra/sources/evex/processor.py new file mode 100644 index 0000000000..62b1ae6640 --- /dev/null +++ b/indra/sources/evex/processor.py @@ -0,0 +1,690 @@ +import copy +import csv +from collections import defaultdict +from dataclasses import dataclass, field +from io import TextIOWrapper +import itertools +import logging +import tarfile +from typing import Any, Dict +import networkx +import tqdm +from indra.ontology.standardize import get_standard_agent +from indra.statements.validate import validate_statement +from indra.statements import * + +logger = logging.getLogger(__name__) + + +class EvexProcessor: + """A processor to extract INDRA Statements from EVEX relations.""" + def __init__(self, relations_table, articles_table, standoff_index): + self.relations_table = relations_table + self.articles_table = articles_table + # Build an index of articles by general event ID + self.article_lookup = self.build_article_lookup() + self.standoff_index = standoff_index + self.statements = [] + self.standoff_cache = {} + + def process_statements(self): + """Process rows of the EVEX relations table into INDRA Statements.""" + for row in tqdm.tqdm(self.relations_table.itertuples(), + total=len(self.relations_table), + desc='Processing Evex relations'): + self.statements += self.process_row(row) + + def process_row(self, row): + """Process a row in the relations table into INDRA Statements.""" + + # First, we determine the statement type and create the subject/object + # agents. + pol_idx = 1 if row.refined_polarity == 'Negative' else 0 + stmt_types = type_indra_mappings.get(row.refined_type) + if not stmt_types: + return [] + stmt_type = stmt_types[pol_idx] + source_id = str(row.source_entrezgene_id) + target_id = str(row.target_entrezgene_id) + subj_agent = get_standard_agent('EGID:%s' % source_id, + db_refs={'EGID': source_id}) + obj_agent = get_standard_agent('EGID:%s' % target_id, + db_refs={'EGID': target_id}) + + # We now figure out which articles provide evidence for this relation + article_keys = self.article_lookup.get(row.general_event_id) + stmts = [] + for article_prefix, article_id in article_keys: + # These text refs are known based on info we have independent of + # standoff availability + text_refs = {article_prefix: article_id} + pmid = article_id if article_prefix == 'PMID' else None + + # We now find the standoff for the given relation and gather + # evidence info for it if possible. + standoff = self.get_standoff_for_event(article_prefix, article_id) + if not standoff: + evidence_info = [{}] + else: + evidence_info = find_evidence_info(standoff, source_id, + target_id, row.refined_type, + row.refined_polarity) + # For each article, it's possible that multiple evidences are + # available for the relation, so we create separate Statements + # (each with a single Evidence) here. + for ev_info in evidence_info: + annotations = { + 'evex_relation_type': row.refined_type, + 'evex_polarity': row.refined_polarity, + 'evex_general_event_id': str(row.general_event_id), + 'evex_standoff_regulation_id': + ev_info.get('regulation_uid'), + 'evex_confidence': ev_info.get('confidence') + } + # These are propagated to allow filtering later + epistemics = {} + if ev_info.get('negation'): + epistemics['negated'] = True + if ev_info.get('speculation'): + epistemics['is_hypothesis'] = True + + if ev_info.get('subj_coords'): + annotations['agents'] = \ {'coords': [ev_info['subj_coords'], + ev_info['obj_coords']]} + ev = Evidence(source_api='evex', + pmid=pmid, + text_refs=text_refs, + text=ev_info.get('text'), + annotations=annotations, + epistemics=epistemics) + + # We can set the raw Agent text, which is specific to this + # given evidence.
+ subj = copy.deepcopy(subj_agent) + obj = copy.deepcopy(obj_agent) + if ev_info.get('subj_text'): + subj.db_refs['TEXT'] = ev_info.get('subj_text') + if ev_info.get('obj_text'): + obj.db_refs['TEXT'] = ev_info.get('obj_text') + + # Finally, create the Statement object + if stmt_type == Complex: + stmt = Complex([subj, obj], evidence=[ev]) + else: + stmt = stmt_type(subj, obj, evidence=[ev]) + validate_statement(stmt) + stmts.append(stmt) + return stmts + + def get_standoff_for_event(self, article_prefix, article_id): + """Based on article info, return a standoff object of annotations.""" + key = ( + 'pmc' if article_prefix == 'PMCID' else 'pubmed', + article_id[3:] if article_prefix == 'PMCID' else article_id + ) + if key in self.standoff_cache: + return self.standoff_cache[key] + standoff_file = self.standoff_index.get(key) + if not standoff_file: + return None + standoff = EvexStandoff(standoff_file, key) + self.standoff_cache[key] = standoff + return standoff + + def build_article_lookup(self): + """Build a lookup for articles corresponding to event IDs.""" + article_lookup = defaultdict(list) + for row in self.articles_table.itertuples(): + prefix, article_id = row.article_id.split(': ') + if prefix == 'PMCID': + if not article_id.startswith('PMC'): + article_id = 'PMC' + article_id + article_lookup[row.general_event_id].append( + ('PMCID', article_id)) + elif prefix == 'PMID': + article_lookup[row.general_event_id].append( + ('PMID', article_id)) + else: + raise ValueError('Unexpected article type: %s' % prefix) + return dict(article_lookup) + + +def find_evidence_info(standoff, source_id, target_id, event_type, + polarity): + """Given a standoff, find all regulations matching a relation row + and return corresponding evidence info.""" + potential_regs = standoff.find_potential_regulations(source_id, + target_id) + matching_reg_info = [] + for reg in potential_regs: + source_paths = reg.paths_to_entrez_id(source_id) + source_annotated_paths = [standoff.annotate_path(source_path) + for source_path in source_paths] + target_paths = reg.paths_to_entrez_id(target_id) + target_annotated_paths = [standoff.annotate_path(target_path) + for target_path in target_paths] + + if event_type == 'Binding': + for source_path, target_path in \ itertools.product(source_annotated_paths, + target_annotated_paths): + if 'Binding' in source_path and 'Binding' in target_path: + source_reg_idx = source_path.index('Binding') + target_reg_idx = target_path.index('Binding') + if source_path[source_reg_idx + 1] == 'Theme' and \ target_path[target_reg_idx + 1] == 'Theme': + matching_reg_info.append( + get_regulation_info(standoff, reg, + source_path[-1], + target_path[-1])) + else: + pos_event_type, neg_event_type = \ type_standoff_mappings[event_type] + polarity_is_positive = (polarity in {'Positive', 'Unspecified'}) + constraints = [ + { + 'source': {'Positive_regulation', 'Regulation', + 'Catalysis'}, + 'target': (pos_event_type if polarity_is_positive + else neg_event_type) + }, + { + 'source': {'Negative_regulation'}, + 'target': (neg_event_type if polarity_is_positive + else pos_event_type) + } + ] + for source_path, target_path in \ itertools.product(source_annotated_paths, + target_annotated_paths): + for constraint in constraints: + if source_path[0] in constraint['source'] \ and source_path[1] == 'Cause' \ and target_path[1] == 'Theme' \ and constraint['target'] in target_path: + matching_reg_info.append( + get_regulation_info(standoff, reg, + source_path[-1], + target_path[-1])) + + if not matching_reg_info: + if len(potential_regs) == 1: + txt = standoff.get_sentence_for_offset( + potential_regs[0].event.start) + matching_reg_info = [{'text': txt, + 'regulation_uid': potential_regs[0].uid}] + else: + data = {'source_id': source_id, + 'target_id': target_id, + 'event_type': event_type, + 'polarity': polarity} + label = '\n'.join(['%s: %s' % (k, v) for k, v in data.items()]) + standoff.save_potential_regulations(source_id, target_id, + key=standoff.key, + label=label) + matching_reg_info = [{}] + + return matching_reg_info
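Illustrative only: hypothetical annotated paths of the shape produced by EvexStandoff.annotate_path and matched by the constraints above (the UIDs T1/T7 are made up for this sketch):

# Root event type, then alternating edge labels and nested event types,
# ending in the UID of a leaf entity node.
source_path = ['Positive_regulation', 'Cause', 'T1']
target_path = ['Positive_regulation', 'Theme', 'Phosphorylation', 'Theme', 'T7']
# For a 'Catalysis of phosphorylation' relation with Positive polarity, the
# first constraint matches: the source entity hangs off the Cause branch of
# a Positive_regulation, and a Phosphorylation event lies on the Theme
# branch leading to the target entity, so get_regulation_info is called
# with source_path[-1] and target_path[-1].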
+ + +def get_regulation_info(standoff, regulation, source_uid, target_uid): + """Gather specific evidence info from a regulation in a standoff.""" + text = standoff.get_sentence_for_offset(regulation.event.start) + subj = standoff.elements[source_uid] + subj_text = subj.text + subj_coord = [standoff.get_sentence_relative_offset(subj.start), + standoff.get_sentence_relative_offset(subj.end)] + obj = standoff.elements[target_uid] + obj_text = obj.text + obj_coord = [standoff.get_sentence_relative_offset(obj.start), + standoff.get_sentence_relative_offset(obj.end)] + return {'text': text, + 'subj_text': subj_text, + 'subj_coords': subj_coord, + 'obj_text': obj_text, + 'obj_coords': obj_coord, + 'regulation_uid': regulation.uid, + 'confidence': regulation.confidence_val, + 'negation': True if regulation.negation else False, + 'speculation': True if regulation.speculation else False} + + +def get_sentence_for_offset(text_lines, line_offsets, offset): + """Return a text line for a given offset based on line offsets.""" + for idx in range(len(line_offsets) - 1): + if line_offsets[idx + 1] > offset: + return text_lines[idx].strip() + return text_lines[-1].strip() + + +def get_sentence_relative_offset(line_offsets, offset): + """Return an offset relative to the sentence it is in.""" + for idx in range(len(line_offsets) - 1): + if line_offsets[idx + 1] > offset: + return offset - line_offsets[idx] + return offset - line_offsets[-1] + + +class EvexStandoff: + """Represent an EVEX standoff file's contents as a set of objects.""" + def __init__(self, standoff_file, key): + self.key = key + # We need to get the content of the text lines corresponding to + # the standoff annotations, and then process the annotations from + # the annotation file.
+ with tarfile.open(standoff_file, 'r:gz') as fh: + ann_file = TextIOWrapper(fh.extractfile('%s_%s.ann' % key), + encoding='utf-8') + txt_file = TextIOWrapper(fh.extractfile('%s_%s.txt' % key), + encoding='utf-8') + self.text_lines = txt_file.readlines() + self.elements = process_annotations(ann_file) + # To be able to linearly index into sentences broken up into separate + # lines, we build an index of line offsets + self.line_offsets = [0] + for idx, line in enumerate(self.text_lines[:-1]): + self.line_offsets.append(self.line_offsets[idx] + len(line)) + + def get_sentence_for_offset(self, offset): + """Return the sentence for a given offset in the standoff annotation.""" + return get_sentence_for_offset(self.text_lines, self.line_offsets, + offset) + + def get_sentence_relative_offset(self, offset): + """Return an offset relative to the sentence it is in.""" + return get_sentence_relative_offset(self.line_offsets, offset) + + def find_exact_regulations(self, cause_entrez_id, theme_entrez_id): + """Find regulations that only contain the given entrez IDs.""" + regs = [] + for uid, element in self.elements.items(): + if isinstance(element, Regulation): + if {cause_entrez_id, theme_entrez_id} == element.entrez_ids: + regs.append(element) + return regs + + def find_potential_regulations(self, cause_entrez_id, theme_entrez_id): + """Find regulations that contain the given entrez IDs.""" + regs = [] + for uid, element in self.elements.items(): + if isinstance(element, Regulation): + if {cause_entrez_id, theme_entrez_id} <= element.entrez_ids: + regs.append(element) + return regs + + def save_potential_regulations(self, cause_entrez_id, theme_entrez_id, key, + label): + """Save potential regulation graphs for review/debugging.""" + import pystow + file_key = '_'.join(list(key) + [cause_entrez_id, theme_entrez_id]) + fname = pystow.join('evex', 'debug', name='%s.pdf' % file_key) + regs = self.find_potential_regulations(cause_entrez_id, theme_entrez_id) + if not regs: + return [] + graph = networkx.compose_all([reg.graph for reg in regs]) + ag = networkx.nx_agraph.to_agraph(graph) + ag.graph_attr['label'] = label + ag.draw(fname, prog='dot') + return regs + + def annotate_path(self, path_nodes): + """Given a raw path of node IDs, create an annotated path. + + The annotated path contains event types for regulation nodes, + relation types, and IDs of leaf entity nodes. 
+ """ + root = self.elements[path_nodes[0]] + path_info = [root.event.get_type()] + for source, target in zip(path_nodes[:-1], path_nodes[1:]): + path_info.append(root.graph.edges[(source, target)]['label']) + if isinstance(self.elements[target], Regulation): + path_info.append(self.elements[target].event.event_type) + elif isinstance(self.elements[target], Entity): + path_info.append(target) + return path_info + + +def process_annotations(ann_file): + """Iterate over the rows of an annotations file and build up objects.""" + elements = {} + reader = csv.reader(ann_file, delimiter='\t', quotechar=None) + for row in reader: + # The first element is always the UID + uid = row[0] + assert len(row) == 2 or len(row) == 3 + # If the row has 3 elements, then the last one is a value + value = row[2] if len(row) == 3 else None + # The second element can have multiple space-separated parts + parts = row[1].split() + # If this is an entity of some type + if parts[0] in {'GGP', 'Entity'}: + entity = Entity(uid, parts[0], int(parts[1]), int(parts[2]), value) + elements[uid] = entity + # These represent entity references like Entrez IDs + elif parts[0] == 'Reference': + ref_ns, ref_id = parts[2].split(':', maxsplit=1) + elements[parts[1]].references[ref_ns] = ref_id + # These are various event types, we enumerate them explicitly in + # the standoff_event_types variable to make sure it's not some + # other type of row. + elif parts[0] in standoff_event_types: + event = Event(uid, parts[0], int(parts[1]), int(parts[2]), value) + elements[uid] = event + # These are confidence values associated with regulations but also + # other things like Negation. An additional complication is that it + # can either represent a numerical of a qualitative confidence level. + elif parts[0] == 'Confidence': + # Negation confidence + if isinstance(parts[1], Negation): + elements[parts[1]].confidence = float(value) + # Regulation confidence value + elif len(row) == 3: + elements[parts[1]].confidence_val = float(value) + # Regulation confidence level + else: + elements[parts[1]].confidence_level = parts[2] + # Represents a negation for a regulation + elif parts[0] == 'Negation': + elements[uid] = Negation(uid) + elements[parts[1]].negation = elements[uid] + # Represents a speculation for a regulation + elif parts[0] == 'Speculation': + elements[uid] = Speculation(uid) + elements[parts[1]].speculation = elements[uid] + # The remainder of cases are regulations. These are either basic + # regulations or special cases like subunit-complex relations. + elif len(row) == 2: + if ':' in parts[0]: + event_type, parent_id = parts[0].split(':') + event = elements[parent_id] + assert event_type == event.event_type + # These events don't have actual objects associated with them so + # we create placeholder events just to propagate the type + elif parts[0] in {'Subunit-Complex', 'Protein-Component'}: + event_type = parts[0] + event = PlaceholderEvent(event_type) + else: + assert False, row + + # The row contains a series of arguments for the regulation that + # need to be parsed out in parts + arguments = {} + for element in parts[1:]: + role, arg_uid = element.split(':') + + # Some regulations are defined out of order, we need a + # placeholder for these elements that can be resolved later + element_obj = elements.get(arg_uid, Unresolved(arg_uid)) + + # There are argument types that there are more than one of, + # e.g., Theme for Binding so we need to sometimes turn + # these into lists. 
+ if role in arguments: + if not isinstance(arguments[role], list): + arguments[role] = [arguments[role]] + arguments[role].append(element_obj) + else: + arguments[role] = element_obj + regulation = Regulation(uid, event, arguments) + elements[uid] = regulation + else: + logger.error('Could not process standoff file row: %s' % row) + break + + # We now need to resolve Unresolved regulation references. At this point + # it's enough to take them from the elements dict since they have now + # been resolved at that level. + for uid, element in elements.items(): + if isinstance(element, Regulation): + if isinstance(element.event, Unresolved): + element.event = elements[element.event.uid] + for k, v in element.arguments.items(): + if isinstance(v, Unresolved): + element.arguments[k] = elements[v.uid] + + # Now that everything is resolved, we can initialize the regulations + for uid, element in elements.items(): + if isinstance(element, Regulation): + element.initialize() + + return elements + + +# Below we define dataclasses to represent elements of Standoff annotations + +@dataclass +class Negation: + uid: str + confidence: float = None + + +@dataclass +class Speculation: + uid: str + confidence: float = None + + +@dataclass +class Entity: + uid: str + entity_type: str + start: int + end: int + text: str + references: Dict[str, str] = field(default_factory=dict) + + def get_type(self): + return self.entity_type + + +@dataclass +class PlaceholderEvent: + event_type: str + + def get_type(self): + return self.event_type + + +@dataclass +class Event: + uid: str + event_type: str + start: int + end: int + text: str + + def get_type(self): + return self.event_type + + +@dataclass +class Regulation: + uid: str + event: Event + arguments: Dict[str, Any] + confidence_val: float = None + confidence_level: str = None + negation: Negation = None + speculation: Speculation = None + # Dynamically created attributes + entrez_ids = None + entrez_uid_mappings = None + graph = None + + def initialize(self): + # Note that this can't simply be done in __post_init__ because child + # objects may still be unresolved upon initialization + self.entrez_ids = self.find_entrez_ids() + self.entrez_uid_mappings = self.get_entrez_uid_mappings() + self.graph = self.to_graph() + + def to_graph(self): + g = networkx.DiGraph() + add_subgraph(g, self) + return g + + def draw(self, fname): + ag = networkx.nx_agraph.to_agraph(self.graph) + ag.draw(fname, prog='dot') + + def find_entrez_ids(self): + """Return all Entrez IDs under this regulation.""" + entrez_ids = set() + for k, v in self.arguments.items(): + v = [v] if not isinstance(v, list) else v + for vv in v: + if isinstance(vv, Regulation): + entrez_ids |= vv.find_entrez_ids() + elif isinstance(vv, Entity): + entrez_id = vv.references.get('EG') + if entrez_id: + entrez_ids.add(entrez_id) + return entrez_ids + + def get_entrez_uid_mappings(self): + """Return mappings from Entrez IDs to the UIDs of the nodes where they + appear.""" + uid_mappings = defaultdict(list) + for arg_type, arg in self.arguments.items(): + for single_arg in (arg if isinstance(arg, list) else [arg]): + if isinstance(single_arg, Regulation): + for child_entrez_id, child_uids in \ single_arg.get_entrez_uid_mappings().items(): + for child_uid in child_uids: + uid_mappings[child_entrez_id].append(child_uid) + elif isinstance(single_arg, Entity): + entrez_id = single_arg.references.get('EG') + if entrez_id: + uid_mappings[entrez_id].append(single_arg.uid) + return dict(uid_mappings) + + def paths_to_entrez_id(self, entrez_id): +
"""Find a path from the root to a given Entrez ID.""" + uids = self.entrez_uid_mappings.get(entrez_id) + paths = [] + for uid in uids: + path_nodes = networkx.shortest_path(self.graph, self.uid, uid) + paths.append(path_nodes) + return paths + + +@dataclass +class Unresolved: + uid: str + + +def add_subgraph(g, obj): + """Recursively build up a graph of standoff objects.""" + label = '{ID | %s} | {event_type | %s}' % (obj.uid, obj.event.get_type()) + if obj.negation: + label += '| {negated | %s}' % True + g.add_node(obj.uid, type='Regulation', + shape='record', + label=label) + for k, v in obj.arguments.items(): + for vv in (v if isinstance(v, list) else [v]): + if isinstance(vv, Regulation): + add_subgraph(g, vv) + else: + label = '{ID | %s} | {type | %s} | {text | %s}' % \ + (vv.uid, vv.get_type(), vv.text) + if isinstance(vv, Entity): + egid = vv.references.get('EG') + if egid: + label += '| {entrez_id | %s}' % egid + g.add_node(vv.uid, + shape='record', + label=label) + g.add_edge(obj.uid, vv.uid, label=k) + + +# The set of event types used in the standoff format +standoff_event_types = { + 'Binding', + 'Acetylation', + 'Deacetylation', + 'Phosphorylation', + 'Dephosphorylation', + 'DNA_methylation', + 'DNA_demethylation', + 'Glycosylation', + 'Deglycosylation', + 'Hydroxylation', + 'Dehydroxylation', + 'Methylation', + 'Demethylation', + 'Ubiquitination', + 'Deubiquitination', + 'Regulation', + 'Positive_regulation', + 'Negative_regulation', + 'Gene_expression', + 'Catalysis', + 'Transcription', + 'Localization', + 'Protein_catabolism', +} + +# Mapping network relation types to regulation types used in the standoff files +# as well as the one with opposite polarity. +type_standoff_mappings = { + 'Binding': ('Binding', 'Binding'), + 'Catalysis of DNA methylation': ('Methylation', 'Demethylation'), + 'Catalysis of acetylation': ('Acetylation', 'Deacethylation'), + 'Catalysis of glycosylation': ('Glycosylation', 'Deglycosylation'), + 'Catalysis of hydroxylation': ('Hydroxylation', 'Dehydroxylation'), + 'Catalysis of methylation': ('DNA_methylation', 'DNA_demethylation'), + 'Catalysis of phosphorylation': ('Phosphorylation', 'Dephosphorylation'), + 'Catalysis of ubiquitination': ('Ubiquitination', 'Deubiquitination'), + 'Indirect_catalysis of acetylation': ('Acetylation', 'Deacethylation'), + 'Indirect_catalysis of methylation': ('Methylation', 'Demethylation'), + 'Indirect_catalysis of ubiquitination': ('Ubiquitination', + 'Deubiquitination'), + 'Indirect_regulation': (None, None), + 'Indirect_regulation of binding': ('Binding', 'Binding'), + 'Indirect_regulation of catabolism': ('Protein_catabolism', + 'Protein_catabolism'), + 'Indirect_regulation of expression': ('Gene_expression', 'Gene_expression'), + 'Indirect_regulation of localization': ('Localization', 'Localization'), + 'Indirect_regulation of phosphorylation': ('Phosphorylation', + 'Dephosphorylation'), + 'Indirect_regulation of transcription': ('Transcription', 'Transcription'), + 'Regulation': (None, None), + 'Regulation of binding': ('Binding', 'Binding'), + 'Regulation of catabolism': ('Protein_catabolism', 'Protein_catabolism'), + 'Regulation of expression': ('Gene_expression', 'Gene_expression'), + 'Regulation of localization': ('Localization', 'Localization'), + 'Regulation of phosphorylation': ('Phosphorylation', 'Dephosphorylation'), + 'Regulation of transcription': ('Transcription', 'Transcription'), +} + +# Network relation type mappings to INDRA Statement types +type_indra_mappings = { + 'Binding': (Complex, 
diff --git a/indra/tests/make_mock_ontology.py b/indra/tests/make_mock_ontology.py index 1b77ce1d39..597e6b1364 100644 --- a/indra/tests/make_mock_ontology.py +++ b/indra/tests/make_mock_ontology.py @@ -48,7 +48,8 @@ 'EGID:673', 'EGID:5594', 'EGID:5595', 'IDO:0000514', 'LSPCI:18', 'EGID:109880', 'PUBCHEM:56649450', 'MESH:D000086382', 'UP:C7U1M6', 'GO:GO:0008553', "ECCODE:1.1.1.1", "ECCODE:1.1.1", "ECCODE:1.1", - "ECCODE:1", + "ECCODE:1", "HGNC:2367", "HGNC:5", "HGNC:6018", "HGNC:7", "HGNC:7645", + "HGNC:7646", "HGNC:9071", } always_include_ns = {'FPLX', 'INDRA_ACTIVITIES', 'INDRA_MODS'} diff --git a/indra/tests/test_sources/__init__.py b/indra/tests/test_sources/__init__.py index 58533cf73c..ea058fc58c 100644 --- a/indra/tests/test_sources/__init__.py +++ b/indra/tests/test_sources/__init__.py @@ -1,3 +1,11 @@ # -*- coding: utf-8 -*- """A submodule for organizing tests for sources.""" +import os + +RESOURCE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'resources') + + +def get_resource_file(fname): + return os.path.join(RESOURCE_PATH, fname) \ No newline at end of file diff --git a/indra/tests/test_sources/resources/evex_articles.tsv b/indra/tests/test_sources/resources/evex_articles.tsv new file mode 100644 index 0000000000..7fafc2d88c --- /dev/null +++ b/indra/tests/test_sources/resources/evex_articles.tsv @@ -0,0 +1,11 @@ +general_event_id article_id +963594 PMID: 15784508 +963594 PMID: 18006927 +963594 PMID: 8225481 +963594 PMID: 9321764 +963594 PMCID: PMC1903350 +963594 PMCID: PMC2999590 +51221413 PMID: 15010462 +33192948 PMCID: PMC3034019 +4793648 PMID: 18327405 +5486188 PMID: 9502078 diff --git a/indra/tests/test_sources/resources/evex_binding_standoff.pkl b/indra/tests/test_sources/resources/evex_binding_standoff.pkl new file mode 100644 index 0000000000..a1d394910f Binary files /dev/null and b/indra/tests/test_sources/resources/evex_binding_standoff.pkl differ diff --git a/indra/tests/test_sources/resources/evex_rels.tsv b/indra/tests/test_sources/resources/evex_rels.tsv new file mode 100644 index 0000000000..98aa7bc622 --- /dev/null +++
b/indra/tests/test_sources/resources/evex_rels.tsv @@ -0,0 +1,6 @@ +general_event_id source_entrezgene_id target_entrezgene_id confidence negation speculation coarse_type coarse_polarity refined_type refined_polarity +963594 9 10 0.0316748 0 0 Binding Neutral Binding Neutral +51221413 3569 1 -0.488791 0 0 Regulation Unspecified Catalysis of phosphorylation Unspecified +33192948 23 1 -1.45498 0 0 Regulation Unspecified Regulation Unspecified +4793648 1401 2 -1.28857 0 0 Regulation Negative Regulation of expression Negative +5486188 5340 2 -1.56376 0 0 Regulation Positive Regulation of expression Positive diff --git a/indra/tests/test_sources/resources/evex_test_annots.tar.gz b/indra/tests/test_sources/resources/evex_test_annots.tar.gz new file mode 100644 index 0000000000..ccf37a3cdd Binary files /dev/null and b/indra/tests/test_sources/resources/evex_test_annots.tar.gz differ diff --git a/indra/tests/test_sources/test_evex.py b/indra/tests/test_sources/test_evex.py new file mode 100644 index 0000000000..88cd0d9e58 --- /dev/null +++ b/indra/tests/test_sources/test_evex.py @@ -0,0 +1,71 @@ +import pickle +import pandas +from indra.sources.evex.processor import get_sentence_for_offset, \ + EvexProcessor, get_sentence_relative_offset +from indra.statements.validate import assert_valid_statements +from . import get_resource_file + + +def test_process_relations(): + standoff_tar_gz = get_resource_file('evex_test_annots.tar.gz') + relations_df = pandas.read_csv(get_resource_file('evex_rels.tsv'), sep='\t') + articles_df = pandas.read_csv(get_resource_file('evex_articles.tsv'), + sep='\t') + standoff_index = {} + for aid in articles_df.article_id: + paper_prefix, paper_id = aid.split(': ') + key = ( + 'pubmed' if paper_prefix == 'PMID' else 'pmc', + paper_id if paper_prefix == 'PMID' else paper_id.replace('PMC', '') + ) + standoff_index[key] = standoff_tar_gz + + ep = EvexProcessor(relations_df, articles_df, standoff_index) + ep.process_statements() + assert_valid_statements(ep.statements) + assert len(ep.statements) == 12 + for stmt in ep.statements: + assert len(stmt.evidence) == 1 + ev = stmt.evidence[0] + assert ev.text + assert ev.text_refs + for agent in stmt.agent_list(): + assert 'EGID' in agent.db_refs + assert 'TEXT' in agent.db_refs + # Make sure we got good coordinates + assert ev.annotations['agents']['coords'][0][1] > 0 + # Make sure we don't have redundant evidences + assert len({stmt.get_hash(shallow=False) for stmt in ep.statements}) == \ + len(ep.statements) + return ep + + +def test_get_sentence_offset(): + text_lines = [('Interferon-gamma regulates alpha 2-macroglobulin and ' + 'alpha 1-antichymotrypsin expression on the ' + 'pretranslational level in HepG2 cells.')] + line_offsets = [0] + + assert get_sentence_for_offset(text_lines, line_offsets, 17) == \ + text_lines[0] + + text_lines = ['a', 'b', 'c', 'd', 'e', 'f'] + line_offsets = [0, 188, 376, 627, 823, 1129] + assert get_sentence_for_offset(text_lines, line_offsets, 535) == 'c' + + +def test_relative_offset(): + line_offsets = [0, 10, 20] + assert get_sentence_relative_offset(line_offsets, 5) == 5 + assert get_sentence_relative_offset(line_offsets, 15) == 5 + assert get_sentence_relative_offset(line_offsets, 25) == 5 + + +def test_binding_standoff(): + with open(get_resource_file('evex_binding_standoff.pkl'), 'rb') as fh: + standoff = pickle.load(fh) + + source_id = '19' + target_id = '20' + regs = standoff.find_potential_regulations(source_id, target_id) + assert regs diff --git a/indra/util/statement_presentation.py 
b/indra/util/statement_presentation.py index e46cfafad4..fba4c79424 100644 --- a/indra/util/statement_presentation.py +++ b/indra/util/statement_presentation.py @@ -124,7 +124,7 @@ class to define a `StmtStat`. 'ubibrowser', 'acsn'] """Database source names as they appear in the DB""" -reader_sources = ['geneways', 'tees', 'gnbr', 'semrep', 'isi', 'trips', +reader_sources = ['geneways', 'tees', 'gnbr', 'semrep', 'evex', 'isi', 'trips', 'rlimsp', 'medscan', 'eidos', 'sparser', 'reach'] """Reader source names as they appear in the DB""" diff --git a/setup.py b/setup.py index 211af33177..5c6f492a9d 100644 --- a/setup.py +++ b/setup.py @@ -87,6 +87,7 @@ def main(): 'indra.sources.crog', 'indra.sources.ctd', 'indra.sources.dgi', 'indra.sources.drugbank', 'indra.sources.eidos', + 'indra.sources.evex', 'indra.sources.geneways', 'indra.sources.gnbr', 'indra.sources.hprd', 'indra.sources.hypothesis', 'indra.sources.index_cards',
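For reference, a minimal sketch of the standoff parsing in indra.sources.evex.processor, using hypothetical annotation rows (all UIDs, offsets, and texts are made up for illustration; only the row shapes follow what process_annotations expects):

from io import StringIO
from indra.sources.evex.processor import process_annotations

# Tab-separated rows: UID, space-separated type/offsets or arguments,
# and an optional value column.
rows = (
    'T1\tGGP 0 4\tESR1\n'                             # an entity
    'N1\tReference T1 EG:2099\n'                      # its Entrez reference
    'T2\tGGP 15 18\tJUN\n'
    'N2\tReference T2 EG:3725\n'
    'T3\tPositive_regulation 5 14\tincreases\n'       # an event
    'E1\tPositive_regulation:T3 Cause:T1 Theme:T2\n'  # a regulation
)
elements = process_annotations(StringIO(rows))
# The regulation resolves its arguments and collects Entrez IDs
assert elements['E1'].entrez_ids == {'2099', '3725'}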