Skip to content

Commit efa7837

Browse files
authored
Merge pull request #17 from BiomedSciAI/reactome_using_utils
Reactome using utils
2 parents 4622a12 + 79b5977 commit efa7837

File tree

2 files changed

+170
-54
lines changed

2 files changed

+170
-54
lines changed

scripts/tasks_retrival/Reactome_tasks_creation.py

Lines changed: 134 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,75 @@
1-
from os import makedirs
2-
from pathlib import Path
3-
41
import click
52
import pandas as pd
63
import requests
4+
from task_retrieval import verify_source_of_data
5+
6+
from gene_benchmark.tasks import dump_task_definitions
7+
from scripts.tasks_retrival.task_retrieval import list_form_to_onehot_form
8+
9+
TOP_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"
10+
11+
12+
def get_token_link_for_symbols(symbols: list[str]) -> str:
    """
    Builds the Reactome analysis service download link for a list of symbols.

    Args:
    ----
        symbols (list[str]): symbols for which a pathways result file is requested

    Returns:
    -------
        str: the link to the csv file holding the pathways found for the symbols

    """
    # The analysis service addresses a submitted identifier list through a token,
    # so the token is requested first and then embedded in the download address.
    download_template = (
        "https://reactome.org/AnalysisService/download/{}/pathways/TOTAL/result.csv"
    )
    return download_template.format(get_token(symbols))
727

828

929
def get_symbol_list(
    url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
    timeout: float = 60.0,
) -> list[str]:
    """
    Retrieves the symbol list from a HGNC json like file.

    Args:
    ----
        url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".
        timeout (float, optional): seconds to wait for the server to connect/send data
            before giving up. Defaults to 60.0.

    Returns:
    -------
        list[str]: list of symbols

    Raises:
    ------
        requests.HTTPError: if the server answers with an error status
        requests.Timeout: if the server does not respond within ``timeout`` seconds

    """
    # Without an explicit timeout requests waits forever on a stalled server.
    with requests.get(url, timeout=timeout) as response:
        response.raise_for_status()
        hgnc_payload = response.json()
    # The symbols are read from the "symbol" field of every entry under response/docs.
    return [doc["symbol"] for doc in hgnc_payload["response"]["docs"]]
1648

1749

1850
def get_token(
19-
token_list,
20-
projection_url="https://reactome.org/AnalysisService/identifiers/projection",
21-
):
51+
identifiers: list[str],
52+
projection_url: str = "https://reactome.org/AnalysisService/identifiers/projection",
53+
) -> str:
54+
"""
55+
Data retrieval from Reactome API requires the use of token that represent a list of identifiers,
56+
the method use the AnalysisService API to get the token for a given identifiers list.
57+
58+
Args:
59+
----
60+
identifiers (list[str]): List of identifiers
61+
projection_url (str, optional): Analysis service link. Defaults to "https://reactome.org/AnalysisService/identifiers/projection".
62+
63+
Returns:
64+
-------
65+
str: A Reactome Analysis service token
66+
67+
"""
2268
headers = {
2369
"Accept": "application/json",
2470
"Content-Type": "text/plain",
2571
}
26-
symbols = "\n".join(token_list)
72+
symbols = "\n".join(identifiers)
2773
response = requests.post(
2874
projection_url,
2975
headers=headers,
@@ -32,37 +78,54 @@ def get_token(
3278
return response.json()["summary"]["token"]
3379

3480

35-
def get_top_level_pathway(
36-
url="https://reactome.org/download/current/ReactomePathwaysRelation.txt",
37-
):
38-
hierarchies_df = pd.read_csv(
39-
url, delimiter="\t", header=0, names=["parent", "child"]
40-
)
81+
def get_top_level_pathway(hierarchies_df: pd.DataFrame) -> set[str]:
    """
    Finds the pathways that sit at the top of the pathways hierarchy.

    A pathway is considered top level when it appears as a parent but never
    appears as a child of another pathway.

    Args:
    ----
        hierarchies_df (pd.DataFrame): A data frame with "parent" and "child" columns

    Returns:
    -------
        set[str]: the pathways that are parents only

    """
    listed_as_child = set(hierarchies_df["child"].values)
    return {
        pathway
        for pathway in hierarchies_df["parent"].values
        if pathway not in listed_as_child
    }
4599

46100

47-
def pathway_to_onehot(pathway_df):
48-
any_pathway_genes = list(
49-
set(";".join(pathway_df["Submitted entities found"].values).split(";"))
50-
)
51-
outcomes = pd.DataFrame(
52-
index=any_pathway_genes, columns=pathway_df["Pathway name"], data=False
53-
)
54-
for pathway_idx in pathway_df.index:
55-
path_genes = pathway_df.loc[pathway_idx, "Submitted entities found"].split(";")
56-
pathway_name = pathway_df.loc[pathway_idx, "Pathway name"]
57-
outcomes.loc[path_genes, pathway_name] = True
58-
return outcomes
59-
60-
61-
def dump_to_task(task_dir, outcomes_df):
62-
entities_path = task_dir / "entities.csv"
63-
outcomes_path = task_dir / "outcomes.csv"
64-
pd.Series(outcomes_df.index, name="symbol").to_csv(entities_path, index=False)
65-
outcomes_df.to_csv(outcomes_path, index=False)
101+
def create_top_level_task(
    hierarchies_df: pd.DataFrame,
    df_path: pd.DataFrame,
    entities_name: str = "symbol",
    pathway_names: str = "Pathway name",
) -> tuple[pd.Series, pd.DataFrame]:
    """
    Creates a top level pathways task.

    Args:
    ----
        hierarchies_df (pd.DataFrame): The pathways hierarchies table used to find the top pathways
        df_path (pd.DataFrame): The pathways themselves, indexed by pathway identifier, used to extract the gene list.
        entities_name (str, optional): name of the entities. Defaults to 'symbol'.
        pathway_names (str, optional): name of the column holding the pathways names (converted from identifiers). Defaults to "Pathway name".

    Returns:
    -------
        tuple[pd.Series,pd.DataFrame]: the entities (the index of the outcomes, named
            `entities_name`) and the outcomes table with one column per top level pathway

    """
    top_level = get_top_level_pathway(hierarchies_df)
    # Sorted rather than list(set): iteration order of a set of strings changes
    # between interpreter runs, which made the column order irreproducible.
    top_in_file_paths = sorted(top_level.intersection(df_path.index))
    df_path_top = df_path.loc[top_in_file_paths, :]
    # Re-index by pathway name so the outcome columns carry names, not identifiers.
    df_path_top.index = df_path_top[pathway_names]
    outcomes = list_form_to_onehot_form(df_path_top)
    symbols = pd.Series(outcomes.index, name=entities_name)
    return symbols, outcomes
66129

67130

68131
@click.command()
@@ -78,7 +141,7 @@ def dump_to_task(task_dir, outcomes_df):
78141
"-n",
79142
type=click.STRING,
80143
help="name for the specific task",
81-
default="Pathways",
144+
default="Pathways HGNC",
82145
)
83146
@click.option(
84147
"--allow-downloads",
@@ -90,33 +153,50 @@ def dump_to_task(task_dir, outcomes_df):
90153
"--pathways-file",
91154
type=click.STRING,
92155
help="Path to the pathways files from reactome available using the analysis GUI",
93-
default="",
156+
default=None,
94157
)
95158
@click.option(
96-
"--top-pathways-file",
159+
"--pathways-relation-file",
97160
type=click.STRING,
98161
help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data",
99-
default="",
162+
default=None,
163+
)
164+
@click.option(
165+
"--verbose/--quite",
166+
"-v/-q",
167+
is_flag=True,
168+
default=True,
100169
)
101170
def main(
    main_task_directory,
    task_name,
    allow_downloads,
    pathways_file,
    pathways_relation_file,
    verbose,
):
    # Documented with comments on purpose: click would surface a docstring as
    # the command's --help text.
    #
    # Builds the top level pathways task: loads the pathways result table and the
    # pathways relation table (local files or, when allowed, downloads), converts
    # the top level pathways to a one-hot table and dumps the task definition.

    # The analysis link needs a token created from the full symbol list, which
    # costs network calls, hence it is only built when downloads are allowed.
    reactom_url = (
        get_token_link_for_symbols(get_symbol_list()) if allow_downloads else ""
    )

    pathways_file = verify_source_of_data(
        pathways_file, url=reactom_url, allow_downloads=allow_downloads
    )
    pathways_relation_file = verify_source_of_data(
        pathways_relation_file, url=TOP_PATHWAYS_URL, allow_downloads=allow_downloads
    )
    df_path = pd.read_csv(pathways_file, index_col="Pathway identifier")

    # ReactomePathwaysRelation.txt is a plain two column table without a header
    # line; header=0 combined with names= would discard its first relation.
    hierarchies_df = pd.read_csv(
        pathways_relation_file, delimiter="\t", header=None, names=["parent", "child"]
    )
    symbols, outcomes = create_top_level_task(hierarchies_df, df_path)
    dump_task_definitions(symbols, outcomes, main_task_directory, task_name)
    if verbose:
        print(
            f"{task_name} was created at {main_task_directory} shaped {outcomes.shape}"
        )
121201

122202

scripts/tasks_retrival/task_retrieval.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,3 +172,39 @@ def get_id_to_symbol_df(list_of_gene_metadata):
172172
gene_metadata_df = gene_metadata_df.drop_duplicates(subset="query")
173173
gene_metadata_df.index = gene_metadata_df["query"]
174174
return gene_metadata_df
175+
176+
177+
def list_form_to_onehot_form(
    list_df: pd.DataFrame,
    participant_col_name: str = "Submitted entities found",
    delimiter: str = ";",
) -> pd.DataFrame:
    """
    Given a data frame in which every row (e.g. a pathway) holds a delimited
    list of participants (e.g. genes), the method creates a data frame where each
    row is a participant and each column is a row of the input; the cells are
    true when the participant appears in that row's list.

    Args:
    ----
        list_df (pd.DataFrame): A data frame whose index labels become the output columns
            and that holds the delimited participants in one of its columns
        participant_col_name (str): The name of the column holding the delimited
            participants. Defaults to "Submitted entities found".
        delimiter (str): The string separating the participants in a cell. Defaults to ";".

    Returns:
    -------
        pd.DataFrame: A one hot dataframe where rows are the (sorted, unique) participants
            and columns are the index labels of `list_df`

    """
    # Tokens are stripped once, up front, so the row labels and the per-row
    # lookups always agree; empty tokens (e.g. from a trailing delimiter) are dropped.
    members_per_row = [
        [token.strip() for token in cell.split(delimiter) if token.strip()]
        for cell in list_df[participant_col_name]
    ]
    # Sorted so the row order does not depend on set iteration order.
    unique_identifier_list = sorted(
        {member for members in members_per_row for member in members}
    )
    onehot_df = pd.DataFrame(
        index=unique_identifier_list, columns=list_df.index, data=False
    )
    # Positional assignment keeps every input row tied to its own column even
    # when the index labels of list_df repeat.
    for column_position, members in enumerate(members_per_row):
        if members:
            row_positions = onehot_df.index.get_indexer(members)
            onehot_df.iloc[row_positions, column_position] = True
    return onehot_df

0 commit comments

Comments
 (0)