Merge pull request #17 from BiomedSciAI/reactome_using_utils

yoavkt · web-flow · commit efa7837a7357 · 2024-06-25T18:35:10.000+03:00
Reactome using utils
diff --git a/scripts/tasks_retrival/Reactome_tasks_creation.py b/scripts/tasks_retrival/Reactome_tasks_creation.py
@@ -1,29 +1,75 @@
-from os import makedirs
-from pathlib import Path
-
 import click
 import pandas as pd
 import requests
+from task_retrieval import verify_source_of_data
+
+from gene_benchmark.tasks import dump_task_definitions
+from scripts.tasks_retrival.task_retrieval import list_form_to_onehot_form
+
+TOP_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"
+
+
+def get_token_link_for_symbols(symbols: list[str]) -> str:
+    """
+    Creates an analysis service pathways link for a given symbol list.
+
+    Args:
+    ----
+        symbols (list[str]): list of symbols to create a pathways data file for
+
+    Returns:
+    -------
+        str: the link to the csv file with the pathways for the symbols
+
+    """
+    token = get_token(symbols)
+    return f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv"
 
 
 def get_symbol_list(
-    url="https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
-):
+    url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
+) -> list[str]:
+    """
+    Retrieves the symbol list from an HGNC JSON-like file.
+
+    Args:
+    ----
+        url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".
+
+    Returns:
+    -------
+        list[str]: list of symbols
+
+    """
     with requests.get(url) as response:
         response.raise_for_status()
         reactome_res = response.json()
     return [v["symbol"] for v in reactome_res["response"]["docs"]]
 
 
 def get_token(
-    token_list,
-    projection_url="https://reactome.org/AnalysisService/identifiers/projection",
-):
+    identifiers: list[str],
+    projection_url: str = "https://reactome.org/AnalysisService/identifiers/projection",
+) -> str:
+    """
+    Data retrieval from the Reactome API requires a token that represents a list of identifiers;
+       the method uses the AnalysisService API to get the token for a given identifiers list.
+
+    Args:
+    ----
+        identifiers (list[str]): List of identifiers
+        projection_url (str, optional): Analysis service link. Defaults to "https://reactome.org/AnalysisService/identifiers/projection".
+
+    Returns:
+    -------
+        str: A Reactome Analysis service token
+
+    """
     headers = {
         "Accept": "application/json",
         "Content-Type": "text/plain",
     }
-    symbols = "\n".join(token_list)
+    symbols = "\n".join(identifiers)
     response = requests.post(
         projection_url,
         headers=headers,
@@ -32,37 +78,54 @@ def get_token(
     return response.json()["summary"]["token"]
 
 
-def get_top_level_pathway(
-    url="https://reactome.org/download/current/ReactomePathwaysRelation.txt",
-):
-    hierarchies_df = pd.read_csv(
-        url, delimiter="\t", header=0, names=["parent", "child"]
-    )
+def get_top_level_pathway(hierarchies_df: pd.DataFrame) -> set[str]:
+    """
+    Returns the top level pathways from the table of pathway hierarchies.
+        Top level pathways are defined as pathways that appear as a parent but never as a child.
+
+    Args:
+    ----
+        hierarchies_df (pd.DataFrame): A data frame with "parent" and "child" columns
+
+    Returns:
+    -------
+        set[str]: a set of top level pathways
+
+    """
     pathway_that_are_parents = set(hierarchies_df["parent"].values)
     pathway_that_are_children = set(hierarchies_df["child"].values)
     pathway_who_are_just_parents = pathway_that_are_parents - pathway_that_are_children
     return pathway_who_are_just_parents
 
 
-def pathway_to_onehot(pathway_df):
-    any_pathway_genes = list(
-        set(";".join(pathway_df["Submitted entities found"].values).split(";"))
-    )
-    outcomes = pd.DataFrame(
-        index=any_pathway_genes, columns=pathway_df["Pathway name"], data=False
-    )
-    for pathway_idx in pathway_df.index:
-        path_genes = pathway_df.loc[pathway_idx, "Submitted entities found"].split(";")
-        pathway_name = pathway_df.loc[pathway_idx, "Pathway name"]
-        outcomes.loc[path_genes, pathway_name] = True
-    return outcomes
-
-
-def dump_to_task(task_dir, outcomes_df):
-    entities_path = task_dir / "entities.csv"
-    outcomes_path = task_dir / "outcomes.csv"
-    pd.Series(outcomes_df.index, name="symbol").to_csv(entities_path, index=False)
-    outcomes_df.to_csv(outcomes_path, index=False)
+def create_top_level_task(
+    hierarchies_df: pd.DataFrame,
+    df_path: pd.DataFrame,
+    entities_name: str = "symbol",
+    pathway_names: str = "Pathway name",
+) -> tuple[pd.Series, pd.DataFrame]:
+    """
+    Creates a top level pathways task.
+
+    Args:
+    ----
+        hierarchies_df (pd.DataFrame): The pathways hierarchies table used to find the top pathways
+        df_path (pd.DataFrame): The pathways themselves, used to extract the gene list.
+        entities_name (str, optional): name of the entities. Defaults to 'symbol'.
+        pathway_names (str, optional): names of the pathways (converted from identifiers). Defaults to "Pathway name".
+
+    Returns:
+    -------
+        tuple[pd.Series, pd.DataFrame]: the entities (the index of the one-hot table) and the one-hot outcomes table
+
+    """
+    top_level = get_top_level_pathway(hierarchies_df)
+    top_in_file_paths = top_level.intersection(set(df_path.index))
+    df_path_top = df_path.loc[list(top_in_file_paths), :]
+    df_path_top.index = df_path_top[pathway_names]
+    outcomes = list_form_to_onehot_form(df_path_top)
+    symbols = pd.Series(outcomes.index, name=entities_name)
+    return symbols, outcomes
 
 
 @click.command()
@@ -78,7 +141,7 @@ def dump_to_task(task_dir, outcomes_df):
     "-n",
     type=click.STRING,
     help="name for the specific task",
-    default="Pathways",
+    default="Pathways HGNC",
 )
 @click.option(
     "--allow-downloads",
@@ -90,33 +153,50 @@ def dump_to_task(task_dir, outcomes_df):
     "--pathways-file",
     type=click.STRING,
     help="Path to the pathways files from reactome available using the analysis GUI",
-    default="",
+    default=None,
 )
 @click.option(
-    "--top-pathways-file",
+    "--pathways-relation-file",
     type=click.STRING,
     help="The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data",
-    default="",
+    default=None,
+)
+@click.option(
+    "--verbose/--quite",
+    "-v/-q",
+    is_flag=True,
+    default=True,
 )
 def main(
-    main_task_directory, task_name, allow_downloads, pathways_file, top_pathways_file
+    main_task_directory,
+    task_name,
+    allow_downloads,
+    pathways_file,
+    pathways_relation_file,
+    verbose,
 ):
-    if allow_downloads:
-        symb_list = get_symbol_list()
-        token = get_token(symb_list)
-        url = f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv"
-        df_path = pd.read_csv(url, index_col="Pathway identifier")
-        top_level = get_top_level_pathway()
-    else:
-        df_path = pd.read_csv(pathways_file)
-        top_level = pd.read_csv(top_pathways_file)
 
-    top_in_file_paths = top_level.intersection(set(df_path.index))
-    df_path_top = df_path.loc[list(top_in_file_paths), :]
-    outcomes = pathway_to_onehot(df_path_top)
-    task_dir = Path(main_task_directory) / f"{task_name}"
-    makedirs(task_dir, exist_ok=True)
-    dump_to_task(task_dir, outcomes)
+    reactom_url = (
+        get_token_link_for_symbols(get_symbol_list()) if allow_downloads else ""
+    )
+
+    pathways_file = verify_source_of_data(
+        pathways_file, url=reactom_url, allow_downloads=allow_downloads
+    )
+    pathways_relation_file = verify_source_of_data(
+        pathways_relation_file, url=TOP_PATHWAYS_URL, allow_downloads=allow_downloads
+    )
+    df_path = pd.read_csv(pathways_file, index_col="Pathway identifier")
+
+    hierarchies_df = pd.read_csv(
+        pathways_relation_file, delimiter="\t", header=0, names=["parent", "child"]
+    )
+    symbols, outcomes = create_top_level_task(hierarchies_df, df_path)
+    dump_task_definitions(symbols, outcomes, main_task_directory, task_name)
+    if verbose:
+        print(
+            f"{task_name} was created at {main_task_directory} shaped {outcomes.shape}"
+        )
     return
 
 
diff --git a/scripts/tasks_retrival/task_retrieval.py b/scripts/tasks_retrival/task_retrieval.py
@@ -172,3 +172,39 @@ def get_id_to_symbol_df(list_of_gene_metadata):
     gene_metadata_df = gene_metadata_df.drop_duplicates(subset="query")
     gene_metadata_df.index = gene_metadata_df["query"]
     return gene_metadata_df
+
+
+def list_form_to_onehot_form(
+    list_df: pd.DataFrame,
+    participant_col_name: str = "Submitted entities found",
+    delimiter: str = ";",
+) -> pd.DataFrame:
+    """
+    Given a pathway data frame that has each pathway as a row with
+       a list of included genes, the method creates a data frame where each
+       row is a gene and each column is a pathway; the cells are true when
+       the gene is participating in the pathway.
+
+    Args:
+    ----
+        list_df (pd.DataFrame): A data frame with pathways as rows (index) and a delimited list of participants in one of the columns
+        participant_col_name (str): The name of the column holding the delimited participants list.
+            Defaults to "Submitted entities found".
+        delimiter (str): The delimiter separating the participants in the list. Defaults to ";".
+
+    Returns:
+    -------
+        pd.DataFrame: A one hot dataframe where rows are genes and columns are pathways
+
+    """
+    full_identifier_list = delimiter.join(list_df[participant_col_name].values).split(
+        delimiter
+    )
+    unique_identifier_list = {x.strip() for x in full_identifier_list}
+    onehot_df = pd.DataFrame(
+        index=list(unique_identifier_list), columns=list_df.index, data=False
+    )
+    for pathway_idx in list_df.index:
+        path_genes = list_df.loc[pathway_idx, participant_col_name].split(delimiter)
+        onehot_df.loc[path_genes, pathway_idx] = True
+    return onehot_df