1
- from os import makedirs
2
- from pathlib import Path
3
-
4
1
import click
5
2
import pandas as pd
6
3
import requests
4
+ from task_retrieval import verify_source_of_data
5
+
6
+ from gene_benchmark .tasks import dump_task_definitions
7
+ from scripts .tasks_retrival .task_retrieval import list_form_to_onehot_form
8
+
9
+ TOP_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt"
10
+
11
+
12
def get_token_link_for_symbols(symbols: list[str]) -> str:
    """
    Creates an analysis service pathways link for a given symbol list.

    Args:
    ----
        symbols (list[str]): list of symbols to create a pathways data file for

    Returns:
    -------
        str: the link to the csv file with the pathways for the symbols

    """
    # The analysis service refers to a submitted identifier list by a token,
    # which is embedded (without any surrounding whitespace) in the download path.
    token = get_token(symbols)
    return f"https://reactome.org/AnalysisService/download/{token}/pathways/TOTAL/result.csv"
7
27
8
28
9
29
def get_symbol_list(
    url: str = "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json",
    timeout: float = 120.0,
) -> list[str]:
    """
    Retrieves the symbol list from a HGNC json like file.

    Args:
    ----
        url (str, optional): url for the json file download. Defaults to "https://g-a8b222.dd271.03c0.data.globus.org/pub/databases/genenames/hgnc/json/hgnc_complete_set.json".
        timeout (float, optional): seconds to wait for the server before giving up,
            passed to ``requests.get``. Defaults to 120.0.

    Returns:
    -------
        list[str]: list of symbols, one per entry of ``response.docs`` in the json

    Raises:
    ------
        requests.HTTPError: if the server answers with an error status
        KeyError: if the json lacks ``response``/``docs`` or an entry lacks ``symbol``

    """
    # Without a timeout a stalled connection would block the script forever.
    with requests.get(url, timeout=timeout) as response:
        response.raise_for_status()
        hgnc_res = response.json()
    return [doc["symbol"] for doc in hgnc_res["response"]["docs"]]
16
48
17
49
18
50
def get_token (
19
- token_list ,
20
- projection_url = "https://reactome.org/AnalysisService/identifiers/projection" ,
21
- ):
51
+ identifiers : list [str ],
52
+ projection_url : str = "https://reactome.org/AnalysisService/identifiers/projection" ,
53
+ ) -> str :
54
+ """
55
+ Data retrieval from the Reactome API requires a token that represents a list of identifiers;
56
+ this method uses the AnalysisService API to get the token for a given list of identifiers.
57
+
58
+ Args:
59
+ ----
60
+ identifiers (list[str]): List of identifiers, sent to the service joined into a single text body
61
+ projection_url (str, optional): Analysis service link. Defaults to "https://reactome.org/AnalysisService/identifiers/projection".
62
+
63
+ Returns:
64
+ -------
65
+ str: A Reactome Analysis service token, read from the "summary" -> "token" field of the json response
66
+
67
+ """
22
68
headers = {
23
69
"Accept" : "application/json" ,
24
70
"Content-Type" : "text/plain" ,
25
71
}
26
- symbols = "\n " .join (token_list )
72
+ symbols = "\n " .join (identifiers )
27
73
response = requests .post (
28
74
projection_url ,
29
75
headers = headers ,
@@ -32,37 +78,54 @@ def get_token(
32
78
return response .json ()["summary" ]["token" ]
33
79
34
80
35
def get_top_level_pathway(hierarchies_df: pd.DataFrame) -> set[str]:
    """
    Returns the top level pathways from the table of pathways hierarchies.
    Top level pathways are defined as pathways without a parent, i.e. identifiers
    that appear in the parent column but never in the child column.

    Args:
    ----
        hierarchies_df (pd.DataFrame): A data frame with "parent" and "child" columns

    Returns:
    -------
        set[str]: a set of top level pathways (empty if the table is empty)

    Raises:
    ------
        KeyError: if the "parent" or "child" column is missing

    """
    pathway_that_are_parents = set(hierarchies_df["parent"])
    pathway_that_are_children = set(hierarchies_df["child"])
    # A pathway listed as somebody's child has a parent, so it is not top level.
    return pathway_that_are_parents - pathway_that_are_children
45
99
46
100
47
- def pathway_to_onehot (pathway_df ):
48
- any_pathway_genes = list (
49
- set (";" .join (pathway_df ["Submitted entities found" ].values ).split (";" ))
50
- )
51
- outcomes = pd .DataFrame (
52
- index = any_pathway_genes , columns = pathway_df ["Pathway name" ], data = False
53
- )
54
- for pathway_idx in pathway_df .index :
55
- path_genes = pathway_df .loc [pathway_idx , "Submitted entities found" ].split (";" )
56
- pathway_name = pathway_df .loc [pathway_idx , "Pathway name" ]
57
- outcomes .loc [path_genes , pathway_name ] = True
58
- return outcomes
59
-
60
-
61
- def dump_to_task (task_dir , outcomes_df ):
62
- entities_path = task_dir / "entities.csv"
63
- outcomes_path = task_dir / "outcomes.csv"
64
- pd .Series (outcomes_df .index , name = "symbol" ).to_csv (entities_path , index = False )
65
- outcomes_df .to_csv (outcomes_path , index = False )
101
def create_top_level_task(
    hierarchies_df: pd.DataFrame,
    df_path: pd.DataFrame,
    entities_name: str = "symbol",
    pathway_names: str = "Pathway name",
) -> tuple[pd.Series, pd.DataFrame]:
    """
    Creates a top level pathways task.

    Args:
    ----
        hierarchies_df (pd.DataFrame): The pathways hierarchies table used to find the top pathways
        df_path (pd.DataFrame): The pathways themselves, indexed by pathway identifier, used to extract the gene list.
        entities_name (str, optional): name given to the returned entities series. Defaults to 'symbol'.
        pathway_names (str, optional): column of df_path holding the pathway names, used to
            re-index the selected pathways by name instead of identifier. Defaults to "Pathway name".

    Returns:
    -------
        tuple[pd.Series,pd.DataFrame]: the entities (the index of the outcomes table, as a
            series named entities_name) and the outcomes table produced by
            list_form_to_onehot_form for the top level pathways.
            NOTE(review): the outcomes are presumably a one-hot entity-by-pathway table;
            confirm against list_form_to_onehot_form.

    """
    top_level = get_top_level_pathway(hierarchies_df)
    # Only top level pathways that actually appear in the pathways table can be used.
    top_in_file_paths = top_level.intersection(set(df_path.index))
    # Sorting makes the row order reproducible; iterating a set of strings
    # directly can give a different order on every run.
    df_path_top = df_path.loc[sorted(top_in_file_paths), :]
    df_path_top.index = df_path_top[pathway_names]
    outcomes = list_form_to_onehot_form(df_path_top)
    symbols = pd.Series(outcomes.index, name=entities_name)
    return symbols, outcomes
66
129
67
130
68
131
@click .command ()
@@ -78,7 +141,7 @@ def dump_to_task(task_dir, outcomes_df):
78
141
"-n" ,
79
142
type = click .STRING ,
80
143
help = "name for the specific task" ,
81
- default = "Pathways" ,
144
+ default = "Pathways HGNC " ,
82
145
)
83
146
@click .option (
84
147
"--allow-downloads" ,
@@ -90,33 +153,50 @@ def dump_to_task(task_dir, outcomes_df):
90
153
"--pathways-file" ,
91
154
type = click .STRING ,
92
155
help = "Path to the pathways files from reactome available using the analysis GUI" ,
93
- default = "" ,
156
+ default = None ,
94
157
)
95
158
@click .option (
96
- "--top- pathways-file" ,
159
+ "--pathways-relation -file" ,
97
160
type = click .STRING ,
98
161
help = "The location of the ReactomePathwaysRelation file available at https://reactome.org/download-data" ,
99
- default = "" ,
162
+ default = None ,
163
+ )
164
+ @click .option (
165
+ "--verbose/--quite" ,
166
+ "-v/-q" ,
167
+ is_flag = True ,
168
+ default = True ,
100
169
)
101
170
def main(
    main_task_directory,
    task_name,
    allow_downloads,
    pathways_file,
    pathways_relation_file,
    verbose,
):
    """
    Create the Reactome top level pathways task and save its definition files.
    \f

    The form feed above stops click from copying the rest of this text into --help.

    Args:
    ----
        main_task_directory: location handed to dump_task_definitions for the task
        task_name: name of the task, handed to dump_task_definitions
        allow_downloads: when true, a Reactome analysis link is requested for the
            HGNC symbols; the flag is also forwarded to verify_source_of_data
        pathways_file: local pathways file, or None
        pathways_relation_file: local ReactomePathwaysRelation file, or None
        verbose: when true, print a one line summary after the task is written

    """
    # NOTE(review): the symbol download and token request run whenever downloads
    # are allowed, even if a local pathways file was given; confirm against
    # verify_source_of_data whether they could be skipped in that case.
    reactome_url = (
        get_token_link_for_symbols(get_symbol_list()) if allow_downloads else ""
    )

    pathways_file = verify_source_of_data(
        pathways_file, url=reactome_url, allow_downloads=allow_downloads
    )
    pathways_relation_file = verify_source_of_data(
        pathways_relation_file, url=TOP_PATHWAYS_URL, allow_downloads=allow_downloads
    )
    df_path = pd.read_csv(pathways_file, index_col="Pathway identifier")

    # header=None: with header=0 plus names=..., pandas throws away the first line
    # of the file as a header to be replaced. The Reactome relation file is a plain
    # two column parent/child listing, so that would drop its first relation.
    # NOTE(review): confirm that locally supplied files are header-less as well.
    hierarchies_df = pd.read_csv(
        pathways_relation_file, delimiter="\t", header=None, names=["parent", "child"]
    )
    symbols, outcomes = create_top_level_task(hierarchies_df, df_path)
    dump_task_definitions(symbols, outcomes, main_task_directory, task_name)
    if verbose:
        print(
            f"{task_name} was created at {main_task_directory} shaped {outcomes.shape}"
        )
121
201
122
202
0 commit comments