diff --git a/scripts/tasks_retrival/Reactome_tasks_creation.py b/scripts/tasks_retrival/Reactome_tasks_creation.py index 5706562..7a63054 100644 --- a/scripts/tasks_retrival/Reactome_tasks_creation.py +++ b/scripts/tasks_retrival/Reactome_tasks_creation.py @@ -1,14 +1,46 @@ -from os import makedirs -from pathlib import Path - import click import pandas as pd import requests +from task_retrieval import verify_source_of_data + +from gene_benchmark.tasks import dump_task_definitions +from scripts.tasks_retrival.task_retrieval import list_form_to_onehot_form + +TOP_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt" + + +def get_token_link_for_symbols(symbols: list[str]) -> str: + """ + Creates an analysis service pathways link for a given symbol list. + + Args: + ---- + symbols (list[str]): list of symbols to create a pathways data file for + + Returns: + ------- + str: the to the csv file with the pathways for the symbols + + """ + token = get_token(symbols) + return f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv" def get_symbol_list( - url="https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json", -): + url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json", +) -> list[str]: + """ + Retrieves the symbol list from a HGNC json like file. + + Args: + ---- + url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json". + + Returns: + ------- + list[str]: list of symbols + + """ with requests.get(url) as response: response.raise_for_status() reactome_res = response.json() @@ -16,14 +48,28 @@ def get_symbol_list( def get_token( - token_list, - projection_url="https://reactome.org/AnalysisService/identifiers/projection", -): + identifiers: list[str], + projection_url: str = "https://reactome.org/AnalysisService/identifiers/projection", +) -> str: + """ + Data retrieval from Reactome API requires the use of token that represent a list of identifiers, + the method use the AnalysisService API to get the token for a given identifiers list. + + Args: + ---- + identifiers (list[str]): List of identifiers + projection_url (str, optional): Analysis service link. Defaults to "https://reactome.org/AnalysisService/identifiers/projection". + + Returns: + ------- + str: A Reactome Analysis service token + + """ headers = { "Accept": "application/json", "Content-Type": "text/plain", } - symbols = "\n".join(token_list) + symbols = "\n".join(identifiers) response = requests.post( projection_url, headers=headers, @@ -32,37 +78,54 @@ def get_token( return response.json()["summary"]["token"] -def get_top_level_pathway( - url="https://reactome.org/download/current/ReactomePathwaysRelation.txt", -): - hierarchies_df = pd.read_csv( - url, delimiter="\t", header=0, names=["parent", "child"] - ) +def get_top_level_pathway(hierarchies_df: pd.DataFrame) -> set[str]: + """ + Returns the top level pathways from the table of pathways hierarchies. + top level are defined as pathways without a parent. + + Args: + ---- + hierarchies_df (pd.DataFrame): A data frame with a parent and child headers + + Returns: + ------- + set[str]: a set of top level pathways + + """ pathway_that_are_parents = set(hierarchies_df["parent"].values) pathway_that_are_children = set(hierarchies_df["child"].values) pathway_who_are_just_parents = pathway_that_are_parents - pathway_that_are_children return pathway_who_are_just_parents -def pathway_to_onehot(pathway_df): - any_pathway_genes = list( - set(";".join(pathway_df["Submitted entities found"].values).split(";")) - ) - outcomes = pd.DataFrame( - index=any_pathway_genes, columns=pathway_df["Pathway name"], data=False - ) - for pathway_idx in pathway_df.index: - path_genes = pathway_df.loc[pathway_idx, "Submitted entities found"].split(";") - pathway_name = pathway_df.loc[pathway_idx, "Pathway name"] - outcomes.loc[path_genes, pathway_name] = True - return outcomes - - -def dump_to_task(task_dir, outcomes_df): - entities_path = task_dir / "entities.csv" - outcomes_path = task_dir / "outcomes.csv" - pd.Series(outcomes_df.index, name="symbol").to_csv(entities_path, index=False) - outcomes_df.to_csv(outcomes_path, index=False) +def create_top_level_task( + hierarchies_df: pd.DataFrame, + df_path: pd.DataFrame, + entities_name: str = "symbol", + pathway_names: str = "Pathway name", +) -> tuple[pd.Series, pd.DataFrame]: + """ + Creates a top level tasks. + + Args: + ---- + hierarchies_df (pd.DataFrame): The pathways hierarchies table used to find the top pathways + df_path (pd.DataFrame): The pathways themselves, used to extract the gene list. + entities_name (str, optional): name of the entities. Defaults to 'symbol'. + pathway_names (str, optional): names of the pathways (converted from identifiers). Defaults to "Pathway name". + + Returns: + ------- + tuple[pd.Series,pd.DataFrame]: _description_ + + """ + top_level = get_top_level_pathway(hierarchies_df) + top_in_file_paths = top_level.intersection(set(df_path.index)) + df_path_top = df_path.loc[list(top_in_file_paths), :] + df_path_top.index = df_path_top[pathway_names] + outcomes = list_form_to_onehot_form(df_path_top) + symbols = pd.Series(outcomes.index, name=entities_name) + return symbols, outcomes @click.command() @@ -78,7 +141,7 @@ def dump_to_task(task_dir, outcomes_df): "-n", type=click.STRING, help="name for the specific task", - default="Pathways", + default="Pathways HGNC", ) @click.option( "--allow-downloads", @@ -90,33 +153,50 @@ def dump_to_task(task_dir, outcomes_df): "--pathways-file", type=click.STRING, help="Path to the pathways files from reactome available using the analysis GUI", - default="", + default=None, ) @click.option( - "--top-pathways-file", + "--pathways-relation-file", type=click.STRING, help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data", - default="", + default=None, +) +@click.option( + "--verbose/--quite", + "-v/-q", + is_flag=True, + default=True, ) def main( - main_task_directory, task_name, allow_downloads, pathways_file, top_pathways_file + main_task_directory, + task_name, + allow_downloads, + pathways_file, + pathways_relation_file, + verbose, ): - if allow_downloads: - symb_list = get_symbol_list() - token = get_token(symb_list) - url = f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv" - df_path = pd.read_csv(url, index_col="Pathway identifier") - top_level = get_top_level_pathway() - else: - df_path = pd.read_csv(pathways_file) - top_level = pd.read_csv(top_pathways_file) - top_in_file_paths = top_level.intersection(set(df_path.index)) - df_path_top = df_path.loc[list(top_in_file_paths), :] - outcomes = pathway_to_onehot(df_path_top) - task_dir = Path(main_task_directory) / f"{task_name}" - makedirs(task_dir, exist_ok=True) - dump_to_task(task_dir, outcomes) + reactom_url = ( + get_token_link_for_symbols(get_symbol_list()) if allow_downloads else "" + ) + + pathways_file = verify_source_of_data( + pathways_file, url=reactom_url, allow_downloads=allow_downloads + ) + pathways_relation_file = verify_source_of_data( + pathways_relation_file, url=TOP_PATHWAYS_URL, allow_downloads=allow_downloads + ) + df_path = pd.read_csv(pathways_file, index_col="Pathway identifier") + + hierarchies_df = pd.read_csv( + pathways_relation_file, delimiter="\t", header=0, names=["parent", "child"] + ) + symbols, outcomes = create_top_level_task(hierarchies_df, df_path) + dump_task_definitions(symbols, outcomes, main_task_directory, task_name) + if verbose: + print( + f"{task_name} was created at {main_task_directory} shaped {outcomes.shape}" + ) return diff --git a/scripts/tasks_retrival/task_retrieval.py b/scripts/tasks_retrival/task_retrieval.py index 8bc20ff..3b4d7b4 100644 --- a/scripts/tasks_retrival/task_retrieval.py +++ b/scripts/tasks_retrival/task_retrieval.py @@ -172,3 +172,39 @@ def get_id_to_symbol_df(list_of_gene_metadata): gene_metadata_df = gene_metadata_df.drop_duplicates(subset="query") gene_metadata_df.index = gene_metadata_df["query"] return gene_metadata_df + + +def list_form_to_onehot_form( + list_df: pd.DataFrame, + participant_col_name: str = "Submitted entities found", + delimiter: str = ";", +) -> pd.DataFrame: + """ + Give a pathway data frame that has each pathway as a row with + a list of included genes the method creates a data frame where each + row is a gene and each column is a pathway the cells are true when + the gene is participating in the pathways. + + Args: + ---- + pathway_df (pd.DataFrame): A data frame with pathways as rows and a gene in one of the cells + pathway_name (str): The name of the pathways name columns + included_genes (str): The name of the included genes in a pathway + Submitted entities found with the participating genes + + Returns: + ------- + pd.DataFrame: A one hot dataframe where rows are genes and columns are pathways + + """ + full_identifier_list = delimiter.join(list_df[participant_col_name].values).split( + delimiter + ) + unique_identifier_list = {x.strip() for x in full_identifier_list} + onehot_df = pd.DataFrame( + index=list(unique_identifier_list), columns=list_df.index, data=False + ) + for pathway_idx in list_df.index: + path_genes = list_df.loc[pathway_idx, participant_col_name].split(delimiter) + onehot_df.loc[path_genes, pathway_idx] = True + return onehot_df