diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 287db24..d32d5da 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -31,12 +31,12 @@ jobs: - name: Create tasks needed for testing run: | - python scripts/tasks_retrival/gene2gene_task_creation.py --allow-downloads True - python scripts/tasks_retrival/Genecorpus_tasks_creation.py --allow-downloads True - python scripts/tasks_retrival/HLA_task_creation.py --allow-downloads True - python scripts/tasks_retrival/HPA_tasks_creation.py --allow-downloads True - python scripts/tasks_retrival/humantfs_task_creation.py --allow-downloads True - python scripts/tasks_retrival/Reactome_tasks_creation.py --allow-downloads True + python scripts/tasks_retrieval/gene2gene_task_creation.py --allow-downloads True + python scripts/tasks_retrieval/Genecorpus_tasks_creation.py --allow-downloads True + python scripts/tasks_retrieval/HLA_task_creation.py --allow-downloads True + python scripts/tasks_retrieval/HPA_tasks_creation.py --allow-downloads True + python scripts/tasks_retrieval/humantfs_task_creation.py --allow-downloads True + python scripts/tasks_retrieval/Reactome_tasks_creation.py --allow-downloads True - name: Test with pytest run: | diff --git a/scripts/data_retrieval/gene_disease_association.py b/scripts/data_retrieval/gene_disease_association.py deleted file mode 100644 index 032da4c..0000000 --- a/scripts/data_retrieval/gene_disease_association.py +++ /dev/null @@ -1,88 +0,0 @@ -import click -import mygene -import pandas as pd - - -def get_symbols(gene_targetId_list): - """ - given s list of gene id's (names Like ENSG00000006468) this method - uses the MyGenInfo package to retrieve the gene symbol (name like PLAC4). - - Args: - ---- - gene_targetId_list (list): list of gene id's (names Like ENSG00000006468) - - Returns: - ------- - list: List of corresponding symbols - - """ - mg = mygene.MyGeneInfo() - list_of_gene_metadata = mg.querymany( - gene_targetId_list, species="human", fields="symbol" - ) - gene_metadata_df = get_id_to_symbol_df(list_of_gene_metadata) - return [gene_metadata_df.loc[x, "symbol"] for x in gene_targetId_list] - - -def get_id_to_symbol_df(list_of_gene_metadata): - """ - The method converts a list of gene metadata into a data frame, - each dictionary will contain the field symbol and the gene id as the query value - Args: - list_of_gene_metadata (list): list containing gene metadata. - - Returns - ------- - pd.DataFrame: a data frame with the gene id as index with the symbol as value - - """ - gene_metadata_df = pd.DataFrame(list_of_gene_metadata) - # some target id have multiple symbols - gene_metadata_df = gene_metadata_df.drop_duplicates(subset="query") - gene_metadata_df.index = gene_metadata_df["query"] - return gene_metadata_df - - -def download_gda_data(): - prq_location = "https://ftp.ebi.ac.uk/pub/databases/opentargets/platform/23.09/output/etl/parquet/" - res = [] - file_exist = True - part_ind = 0 - while file_exist: - try: - link_ars = f"associationByDatasourceDirect/part-{part_ind:05}-6866be1a-be5d-40cf-bdf6-627bef1d0410-c000.snappy.parquet" - gda_df = pd.read_parquet(prq_location + link_ars) - res.append(gda_df) - part_ind = part_ind + 1 - except: - file_exist = False - break - - return pd.concat(res) - - -@click.command() -@click.option( - "--output-file-name", - type=click.STRING, - help="The output file name", - default="gene_disease_association.csv", -) -@click.option( - "--association-type", - type=click.STRING, - help="The type of association to save", - default="genetic_association", -) -def main(output_file_name, association_type): - disease_ass_df = download_gda_data() - gda_df = disease_ass_df.loc[disease_ass_df["datatypeId"] == association_type, :] - print(f"Adding symbols for: {gda_df.shape[0]} associations") - sym = get_symbols(gda_df["targetId"]) - gda_df.loc[:, "symbol"] = sym - gda_df.to_csv(output_file_name) - - -if __name__ == "__main__": - main() diff --git a/scripts/encodings_retrival/extract_bag_of_words_encodings.py b/scripts/encodings_retrieval/extract_bag_of_words_encodings.py similarity index 100% rename from scripts/encodings_retrival/extract_bag_of_words_encodings.py rename to scripts/encodings_retrieval/extract_bag_of_words_encodings.py diff --git a/scripts/encodings_retrival/extract_cellPLM_encodings.py b/scripts/encodings_retrieval/extract_cellPLM_encodings.py similarity index 100% rename from scripts/encodings_retrival/extract_cellPLM_encodings.py rename to scripts/encodings_retrieval/extract_cellPLM_encodings.py diff --git a/scripts/encodings_retrival/extract_gene2vec_encodings.py b/scripts/encodings_retrieval/extract_gene2vec_encodings.py similarity index 100% rename from scripts/encodings_retrival/extract_gene2vec_encodings.py rename to scripts/encodings_retrieval/extract_gene2vec_encodings.py diff --git a/scripts/encodings_retrival/extract_geneformer_encodings.py b/scripts/encodings_retrieval/extract_geneformer_encodings.py similarity index 100% rename from scripts/encodings_retrival/extract_geneformer_encodings.py rename to scripts/encodings_retrieval/extract_geneformer_encodings.py diff --git a/scripts/encodings_retrival/extract_scGPT_encodings.py b/scripts/encodings_retrieval/extract_scGPT_encodings.py similarity index 100% rename from scripts/encodings_retrival/extract_scGPT_encodings.py rename to scripts/encodings_retrieval/extract_scGPT_encodings.py diff --git a/scripts/tasks_retrival/Genecorpus_tasks_creation.py b/scripts/tasks_retrieval/Genecorpus_tasks_creation.py similarity index 100% rename from scripts/tasks_retrival/Genecorpus_tasks_creation.py rename to scripts/tasks_retrieval/Genecorpus_tasks_creation.py diff --git a/scripts/tasks_retrival/HLA_task_creation.py b/scripts/tasks_retrieval/HLA_task_creation.py similarity index 100% rename from scripts/tasks_retrival/HLA_task_creation.py rename to scripts/tasks_retrieval/HLA_task_creation.py diff --git a/scripts/tasks_retrival/HPA_tasks_creation.py b/scripts/tasks_retrieval/HPA_tasks_creation.py similarity index 96% rename from scripts/tasks_retrival/HPA_tasks_creation.py rename to scripts/tasks_retrieval/HPA_tasks_creation.py index 5cbbee3..f3f9121 100644 --- a/scripts/tasks_retrival/HPA_tasks_creation.py +++ b/scripts/tasks_retrieval/HPA_tasks_creation.py @@ -2,7 +2,7 @@ import pandas as pd from gene_benchmark.tasks import dump_task_definitions -from scripts.tasks_retrival.task_retrieval import ( +from scripts.tasks_retrieval.task_retrieval import ( check_data_type, create_single_label_task, load_yaml_file, @@ -64,7 +64,7 @@ def create_tasks(data, main_task_directory, verbose=False): "--columns-to-use-yaml", type=click.STRING, help="A path to a yaml file containing the column names to be used as tasks", - default="scripts/tasks_retrival/hpa_column_names_for_tasks.yaml", + default="scripts/tasks_retrieval/hpa_column_names_for_tasks.yaml", ) @click.option( "--main-task-directory", diff --git a/scripts/tasks_retrival/Reactome_tasks_creation.py b/scripts/tasks_retrieval/Reactome_tasks_creation.py similarity index 98% rename from scripts/tasks_retrival/Reactome_tasks_creation.py rename to scripts/tasks_retrieval/Reactome_tasks_creation.py index 7a63054..bd57a17 100644 --- a/scripts/tasks_retrival/Reactome_tasks_creation.py +++ b/scripts/tasks_retrieval/Reactome_tasks_creation.py @@ -4,7 +4,7 @@ from task_retrieval import verify_source_of_data from gene_benchmark.tasks import dump_task_definitions -from scripts.tasks_retrival.task_retrieval import list_form_to_onehot_form +from scripts.tasks_retrieval.task_retrieval import list_form_to_onehot_form TOP_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathwaysRelation.txt" diff --git a/scripts/tasks_retrival/gene2gene_task_creation.py b/scripts/tasks_retrieval/gene2gene_task_creation.py similarity index 100% rename from scripts/tasks_retrival/gene2gene_task_creation.py rename to scripts/tasks_retrieval/gene2gene_task_creation.py diff --git a/scripts/tasks_retrival/gene_disease_association_task_creation.py b/scripts/tasks_retrieval/gene_disease_association_task_creation.py similarity index 98% rename from scripts/tasks_retrival/gene_disease_association_task_creation.py rename to scripts/tasks_retrieval/gene_disease_association_task_creation.py index 0b9d877..85c4aaf 100644 --- a/scripts/tasks_retrival/gene_disease_association_task_creation.py +++ b/scripts/tasks_retrieval/gene_disease_association_task_creation.py @@ -38,7 +38,7 @@ from task_retrieval import get_symbols, verify_source_of_data from gene_benchmark.tasks import dump_task_definitions -from scripts.tasks_retrival.task_retrieval import print_numerical_task_report +from scripts.tasks_retrieval.task_retrieval import print_numerical_task_report DATA_URL = ( "https://ftp.ebi.ac.uk/pub/databases/opentargets/platform/23.09/output/etl/parquet/" diff --git a/scripts/tasks_retrival/hpa_column_names_for_tasks.yaml b/scripts/tasks_retrieval/hpa_column_names_for_tasks.yaml similarity index 100% rename from scripts/tasks_retrival/hpa_column_names_for_tasks.yaml rename to scripts/tasks_retrieval/hpa_column_names_for_tasks.yaml diff --git a/scripts/tasks_retrival/humantfs_task_creation.py b/scripts/tasks_retrieval/humantfs_task_creation.py similarity index 100% rename from scripts/tasks_retrival/humantfs_task_creation.py rename to scripts/tasks_retrieval/humantfs_task_creation.py diff --git a/scripts/tasks_retrival/task_retrieval.py b/scripts/tasks_retrieval/task_retrieval.py similarity index 100% rename from scripts/tasks_retrival/task_retrieval.py rename to scripts/tasks_retrieval/task_retrieval.py diff --git a/tasks/README.MD b/tasks/README.MD index 333d828..ceffce3 100644 --- a/tasks/README.MD +++ b/tasks/README.MD @@ -1,5 +1,5 @@ # Task retrieval -Our package allows users to easily evaluate the gene encodings on hundreds of tasks from varying sources and types. The package does not include the task data itself to avoid redistributing data from various sources with various public licenses. However, we provide an easy [command line interface](../scripts/tasks_retrival/) to download and populate a folder with the tasks in the appropriate format. The list of tasks and their descriptions are available in an [excel file](task_descriptions.xlsx). +Our package allows users to easily evaluate the gene encodings on hundreds of tasks from varying sources and types. The package does not include the task data itself to avoid redistributing data from various sources with various public licenses. However, we provide an easy [command line interface](../scripts/tasks_retrieval/) to download and populate a folder with the tasks in the appropriate format. The list of tasks and their descriptions are available in an [excel file](task_descriptions.xlsx). Note that the interface allows to download data from various sources. To do so the user needs to explicitly allow this. Use this option only if you trust the URLs appearing in the scripts.