Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reactome using utils #17

Merged
merged 13 commits into from
Jun 25, 2024
188 changes: 134 additions & 54 deletions scripts/tasks_retrival/Reactome_tasks_creation.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,75 @@
from os import makedirs
from pathlib import Path

import click
import pandas as pd
import requests
from task_retrieval import verify_source_of_data

from gene_benchmark.tasks import dump_task_definitions
from scripts.tasks_retrival.task_retrieval import list_form_to_onehot_form

TOP_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"


def get_token_link_for_symbols(symbols: list[str]) -> str:
"""
Creates an analysis service pathways link for a given symbol list.

Args:
----
symbols (list[str]): list of symbols to create a pathways data file for

Returns:
-------
str: the to the csv file with the pathways for the symbols

"""
token = get_token(symbols)
return f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv"


def get_symbol_list(
url="https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
):
url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
) -> list[str]:
"""
Retrieves the symbol list from a HGNC json like file.

Args:
----
url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".

Returns:
-------
list[str]: list of symbols

"""
with requests.get(url) as response:
response.raise_for_status()
reactome_res = response.json()
return [v["symbol"] for v in reactome_res["response"]["docs"]]


def get_token(
token_list,
projection_url="https://reactome.org/AnalysisService/identifiers/projection",
):
identifiers: list[str],
projection_url: str = "https://reactome.org/AnalysisService/identifiers/projection",
) -> str:
"""
Data retrieval from Reactome API requires the use of token that represent a list of identifiers,
the method use the AnalysisService API to get the token for a given identifiers list.

Args:
----
identifiers (list[str]): List of identifiers
projection_url (str, optional): Analysis service link. Defaults to "https://reactome.org/AnalysisService/identifiers/projection".

Returns:
-------
str: A Reactome Analysis service token

"""
headers = {
"Accept": "application/json",
"Content-Type": "text/plain",
}
symbols = "\n".join(token_list)
symbols = "\n".join(identifiers)
response = requests.post(
projection_url,
headers=headers,
Expand All @@ -32,37 +78,54 @@ def get_token(
return response.json()["summary"]["token"]


def get_top_level_pathway(
url="https://reactome.org/download/current/ReactomePathwaysRelation.txt",
):
hierarchies_df = pd.read_csv(
url, delimiter="\t", header=0, names=["parent", "child"]
)
def get_top_level_pathway(hierarchies_df: pd.DataFrame) -> set[str]:
"""
Returns the top level pathways from the table of pathways hierarchies.
top level are defined as pathways without a parent.

Args:
----
hierarchies_df (pd.DataFrame): A data frame with a parent and child headers

Returns:
-------
set[str]: a set of top level pathways

"""
pathway_that_are_parents = set(hierarchies_df["parent"].values)
pathway_that_are_children = set(hierarchies_df["child"].values)
pathway_who_are_just_parents = pathway_that_are_parents - pathway_that_are_children
return pathway_who_are_just_parents


def pathway_to_onehot(pathway_df):
any_pathway_genes = list(
set(";".join(pathway_df["Submitted entities found"].values).split(";"))
)
outcomes = pd.DataFrame(
index=any_pathway_genes, columns=pathway_df["Pathway name"], data=False
)
for pathway_idx in pathway_df.index:
path_genes = pathway_df.loc[pathway_idx, "Submitted entities found"].split(";")
pathway_name = pathway_df.loc[pathway_idx, "Pathway name"]
outcomes.loc[path_genes, pathway_name] = True
return outcomes


def dump_to_task(task_dir, outcomes_df):
entities_path = task_dir / "entities.csv"
outcomes_path = task_dir / "outcomes.csv"
pd.Series(outcomes_df.index, name="symbol").to_csv(entities_path, index=False)
outcomes_df.to_csv(outcomes_path, index=False)
def create_top_level_task(
hierarchies_df: pd.DataFrame,
df_path: pd.DataFrame,
entities_name: str = "symbol",
pathway_names: str = "Pathway name",
) -> tuple[pd.Series, pd.DataFrame]:
"""
Creates a top level tasks.

Args:
----
hierarchies_df (pd.DataFrame): The pathways hierarchies table used to find the top pathways
df_path (pd.DataFrame): The pathways themselves, used to extract the gene list.
entities_name (str, optional): name of the entities. Defaults to 'symbol'.
pathway_names (str, optional): names of the pathways (converted from identifiers). Defaults to "Pathway name".

Returns:
-------
tuple[pd.Series,pd.DataFrame]: _description_

"""
top_level = get_top_level_pathway(hierarchies_df)
top_in_file_paths = top_level.intersection(set(df_path.index))
df_path_top = df_path.loc[list(top_in_file_paths), :]
df_path_top.index = df_path_top[pathway_names]
outcomes = list_form_to_onehot_form(df_path_top)
symbols = pd.Series(outcomes.index, name=entities_name)
return symbols, outcomes


@click.command()
Expand All @@ -78,7 +141,7 @@ def dump_to_task(task_dir, outcomes_df):
"-n",
type=click.STRING,
help="name for the specific task",
default="Pathways",
default="Pathways HGNC",
)
@click.option(
"--allow-downloads",
Expand All @@ -90,33 +153,50 @@ def dump_to_task(task_dir, outcomes_df):
"--pathways-file",
type=click.STRING,
help="Path to the pathways files from reactome available using the analysis GUI",
default="",
default=None,
)
@click.option(
"--top-pathways-file",
"--pathways-relation-file",
type=click.STRING,
help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data",
default="",
default=None,
)
@click.option(
"--verbose/--quite",
"-v/-q",
is_flag=True,
default=True,
)
def main(
main_task_directory, task_name, allow_downloads, pathways_file, top_pathways_file
main_task_directory,
task_name,
allow_downloads,
pathways_file,
pathways_relation_file,
verbose,
):
if allow_downloads:
symb_list = get_symbol_list()
token = get_token(symb_list)
url = f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv"
df_path = pd.read_csv(url, index_col="Pathway identifier")
top_level = get_top_level_pathway()
else:
df_path = pd.read_csv(pathways_file)
top_level = pd.read_csv(top_pathways_file)

top_in_file_paths = top_level.intersection(set(df_path.index))
df_path_top = df_path.loc[list(top_in_file_paths), :]
outcomes = pathway_to_onehot(df_path_top)
task_dir = Path(main_task_directory) / f"{task_name}"
makedirs(task_dir, exist_ok=True)
dump_to_task(task_dir, outcomes)
reactom_url = (
get_token_link_for_symbols(get_symbol_list()) if allow_downloads else ""
)

pathways_file = verify_source_of_data(
pathways_file, url=reactom_url, allow_downloads=allow_downloads
)
pathways_relation_file = verify_source_of_data(
pathways_relation_file, url=TOP_PATHWAYS_URL, allow_downloads=allow_downloads
)
df_path = pd.read_csv(pathways_file, index_col="Pathway identifier")

hierarchies_df = pd.read_csv(
edenjenzohar marked this conversation as resolved.
Show resolved Hide resolved
pathways_relation_file, delimiter="\t", header=0, names=["parent", "child"]
)
symbols, outcomes = create_top_level_task(hierarchies_df, df_path)
dump_task_definitions(symbols, outcomes, main_task_directory, task_name)
if verbose:
print(
f"{task_name} was created at {main_task_directory} shaped {outcomes.shape}"
)
return


Expand Down
36 changes: 36 additions & 0 deletions scripts/tasks_retrival/task_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,39 @@ def get_id_to_symbol_df(list_of_gene_metadata):
gene_metadata_df = gene_metadata_df.drop_duplicates(subset="query")
gene_metadata_df.index = gene_metadata_df["query"]
return gene_metadata_df


def list_form_to_onehot_form(
list_df: pd.DataFrame,
participant_col_name: str = "Submitted entities found",
delimiter: str = ";",
) -> pd.DataFrame:
"""
Give a pathway data frame that has each pathway as a row with
a list of included genes the method creates a data frame where each
row is a gene and each column is a pathway the cells are true when
the gene is participating in the pathways.

Args:
----
pathway_df (pd.DataFrame): A data frame with pathways as rows and a gene in one of the cells
pathway_name (str): The name of the pathways name columns
included_genes (str): The name of the included genes in a pathway
Submitted entities found with the participating genes

Returns:
-------
pd.DataFrame: A one hot dataframe where rows are genes and columns are pathways

"""
full_identifier_list = delimiter.join(list_df[participant_col_name].values).split(
delimiter
)
unique_identifier_list = {x.strip() for x in full_identifier_list}
onehot_df = pd.DataFrame(
index=list(unique_identifier_list), columns=list_df.index, data=False
)
for pathway_idx in list_df.index:
path_genes = list_df.loc[pathway_idx, participant_col_name].split(delimiter)
onehot_df.loc[path_genes, pathway_idx] = True
return onehot_df
Loading