From 33265ce5a84dc7590a1132f3476cce2a94fe2f97 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Mon, 2 Dec 2024 16:13:32 +0100 Subject: [PATCH 1/7] allow custom datasets --- drevalpy/datasets/curvecurator.py | 2 +- drevalpy/utils.py | 35 +++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/drevalpy/datasets/curvecurator.py b/drevalpy/datasets/curvecurator.py index f6ed52e..9b4674e 100644 --- a/drevalpy/datasets/curvecurator.py +++ b/drevalpy/datasets/curvecurator.py @@ -169,7 +169,7 @@ def postprocess(output_folder: str | Path, dataset_name: str): This function reads the curves.txt file created by CurveCurator, which contains the fitted curve parameters and postprocesses it to be used by drevalpy. - :param output_folder: Path to the output folder of CurveCurator containin the curves.txt file. + :param output_folder: Path to the output folder of CurveCurator containing the curves.txt file. :param dataset_name: The name of the dataset, will be used to prepend the postprocessed .csv file """ output_folder = Path(output_folder) diff --git a/drevalpy/utils.py b/drevalpy/utils.py index 0962407..9587c73 100644 --- a/drevalpy/utils.py +++ b/drevalpy/utils.py @@ -1,7 +1,7 @@ """Utility functions for the evaluation pipeline.""" import argparse -import os +from pathlib import Path from typing import Optional from sklearn.base import TransformerMixin @@ -186,6 +186,7 @@ def check_arguments(args) -> None: :param args: arguments passed from the command line :raises AssertionError: if any of the arguments is invalid :raises ValueError: if the number of cross-validation splits or curve_curator_cores is less than 1 + :raises FileNotFoundError: if a custom dataset name was specified and the input file could not be found. 
""" if not args.models: raise AssertionError("At least one model must be specified") @@ -205,13 +206,28 @@ def check_arguments(args) -> None: f"want to use your own baseline, you need to implement a new model class and add it to " f"the MODEL_FACTORY in the models init" ) - if args.dataset_name not in AVAILABLE_DATASETS: - raise AssertionError( - f"Invalid dataset name. Available datasets are {list(AVAILABLE_DATASETS.keys())} " - f"If you want to use your own dataset, you need to implement a new response dataset loader " - f"and add it to the AVAILABLE_DATASETS in the response_datasets init" - ) + if args.curve_curator: + expected_custom_input = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv" + if not expected_custom_input.is_file(): + raise FileNotFoundError( + "You specified the curve_curator option with a custom dataset name which requires raw " + "viability data to be located at {expected_custom_input} but the file does not exist. " + "Please check the 'path_data' and 'dataset_name' arguments and ensure the raw viability " + "input file is located at //_raw.csv." + ) + else: + expected_custom_input = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv" + if not expected_custom_input.is_file(): + raise FileNotFoundError( + "You specified a custom dataset name which requires prefit curve data to be located at " + "{expected_custom_input} but the file does not exist. Please check the 'path_data' and " + "'dataset_name' arguments and ensure the prefit curve data is located at input file is " + "located at //.csv." 
+ ) + + if args.curve_curator and args.curve_curator_cores < 1: + raise ValueError("Number of cores for CurveCurator must be greater than 0.") for dataset in args.cross_study_datasets: if dataset not in AVAILABLE_DATASETS: @@ -223,7 +239,7 @@ def check_arguments(args) -> None: ) # if the path to args.path_data does not exist, create the directory - os.makedirs(args.path_data, exist_ok=True) + Path(args.path_data).mkdir(parents=True, exist_ok=True) if args.n_cv_splits <= 1: raise ValueError("Number of cross-validation splits must be greater than 1.") @@ -234,9 +250,6 @@ def check_arguments(args) -> None: raise AssertionError( "At least one invalid randomization mode. Available randomization modes are SVCC, " "SVRC, SVSC, SVRD" ) - if args.curve_curator: - if args.curve_curator_cores < 1: - raise ValueError("Number of cores for CurveCurator must be greater than 0.") if args.measure not in ["LN_IC50", "response"]: raise ValueError("Only 'LN_IC50' and 'response' are currently available as a drug response measure.") From 3ff91c734a36e2eaff6952a0cf882e47c9a7e29a Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Tue, 3 Dec 2024 12:06:16 +0100 Subject: [PATCH 2/7] fixed argument checks for allowed measures --- drevalpy/datasets/curvecurator.py | 4 ++-- drevalpy/utils.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drevalpy/datasets/curvecurator.py b/drevalpy/datasets/curvecurator.py index 9b4674e..45f88ea 100644 --- a/drevalpy/datasets/curvecurator.py +++ b/drevalpy/datasets/curvecurator.py @@ -175,13 +175,13 @@ def postprocess(output_folder: str | Path, dataset_name: str): output_folder = Path(output_folder) required_columns = { "Name": "Name", - "pEC50": "pEC50", + "pEC50": "pEC50_curvecurator", "pEC50 Error": "pEC50Error", "Curve Slope": "Slope", "Curve Front": "Front", "Curve Back": "Back", "Curve Fold Change": "FoldChange", - "Curve AUC": "AUC", + "Curve AUC": "AUC_curvecurator", "Curve R2": "R2", "Curve P_Value": "pValue", "Curve 
Relevance Score": "RelevanceScore", diff --git a/drevalpy/utils.py b/drevalpy/utils.py index 2aa1973..8ab2f93 100644 --- a/drevalpy/utils.py +++ b/drevalpy/utils.py @@ -257,8 +257,13 @@ def check_arguments(args) -> None: if args.n_trials_robustness < 0: raise ValueError("Number of trials for robustness test must be greater than or equal to 0") - if args.measure not in ["LN_IC50", "response"]: - raise ValueError("Only 'LN_IC50' and 'response' are currently available as a drug response measure.") + allowed_measures = ["LN_IC50", "EC50", "IC50", "pEC50", "AUC", "response"] + allowed_measures.extend([f"{m}_curvecurator" for m in allowed_measures]) + if args.measure not in allowed_measures: + raise ValueError( + "Only 'LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response' or their equivalents including " + "the '_curvecurator' suffix are allowed drug response measures." + ) if args.response_transformation not in ["None", "standard", "minmax", "robust"]: raise AssertionError("Invalid response_transformation. Choose from None, standard, minmax, robust") From 5773660d824276733b4d42d1beadca8268c89339 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Tue, 3 Dec 2024 15:55:16 +0100 Subject: [PATCH 3/7] fixed f-string --- drevalpy/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drevalpy/utils.py b/drevalpy/utils.py index 8ab2f93..272b938 100644 --- a/drevalpy/utils.py +++ b/drevalpy/utils.py @@ -212,7 +212,7 @@ def check_arguments(args) -> None: if not expected_custom_input.is_file(): raise FileNotFoundError( "You specified the curve_curator option with a custom dataset name which requires raw " - "viability data to be located at {expected_custom_input} but the file does not exist. " + f"viability data to be located at {expected_custom_input} but the file does not exist. " "Please check the 'path_data' and 'dataset_name' arguments and ensure the raw viability " "input file is located at //_raw.csv." 
) @@ -221,7 +221,7 @@ def check_arguments(args) -> None: if not expected_custom_input.is_file(): raise FileNotFoundError( "You specified a custom dataset name which requires prefit curve data to be located at " - "{expected_custom_input} but the file does not exist. Please check the 'path_data' and " + f"{expected_custom_input} but the file does not exist. Please check the 'path_data' and " "'dataset_name' arguments and ensure the prefit curve data is located at input file is " "located at //.csv." ) From 10eef2454afc09252d0651f5fb1ea40b3b45cdfd Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Tue, 3 Dec 2024 16:54:35 +0100 Subject: [PATCH 4/7] make paths absolute --- drevalpy/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drevalpy/utils.py b/drevalpy/utils.py index 272b938..8eb66e3 100644 --- a/drevalpy/utils.py +++ b/drevalpy/utils.py @@ -208,7 +208,7 @@ def check_arguments(args) -> None: ) if args.dataset_name not in AVAILABLE_DATASETS: if args.curve_curator: - expected_custom_input = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv" + expected_custom_input = Path(args.path_data).absolute() / args.dataset_name / f"{args.dataset_name}_raw.csv" if not expected_custom_input.is_file(): raise FileNotFoundError( "You specified the curve_curator option with a custom dataset name which requires raw " @@ -217,7 +217,7 @@ def check_arguments(args) -> None: "input file is located at //_raw.csv." 
) else: - expected_custom_input = Path(args.path_data) / args.dataset_name / f"{args.dataset_name}_raw.csv" + expected_custom_input = Path(args.path_data).absolute() / args.dataset_name / f"{args.dataset_name}_raw.csv" if not expected_custom_input.is_file(): raise FileNotFoundError( "You specified a custom dataset name which requires prefit curve data to be located at " @@ -281,7 +281,6 @@ def main(args) -> None: :param args: passed from command line """ check_arguments(args) - # PIPELINE: LOAD_RESPONSE response_data, cross_study_datasets = get_datasets( dataset_name=args.dataset_name, From f1eb84eb8abe67aaf30939250733ce591894a692 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Wed, 4 Dec 2024 15:53:06 +0100 Subject: [PATCH 5/7] fix incorrect column name --- drevalpy/datasets/curvecurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drevalpy/datasets/curvecurator.py b/drevalpy/datasets/curvecurator.py index 45f88ea..b7711f7 100644 --- a/drevalpy/datasets/curvecurator.py +++ b/drevalpy/datasets/curvecurator.py @@ -131,7 +131,7 @@ def ic50(front, back, slope, pec50): front = model_params_df["Front"].values back = model_params_df["Back"].values slope = model_params_df["Slope"].values - pec50 = model_params_df["pEC50"].values + pec50 = model_params_df["pEC50_curvecurator"].values model_params_df["IC50_curvecurator"] = ic50(front, back, slope, pec50) @@ -197,7 +197,7 @@ def postprocess(output_folder: str | Path, dataset_name: str): ) fitted_curve_data[["cell_line_id", "drug_id"]] = fitted_curve_data.Name.str.split("|", expand=True) fitted_curve_data["EC50_curvecurator"] = np.power( - 10, -fitted_curve_data["pEC50"].values + 10, -fitted_curve_data["pEC50_curvecurator"].values ) # in CurveCurator 10^-pEC50 = EC50 _calc_ic50(fitted_curve_data) fitted_curve_data.to_csv(output_folder / f"{dataset_name}.csv", index=None) From a7535422ffe982ab724eb38c622189e482e3724f Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Wed, 4 Dec 2024 16:11:52 
+0100 Subject: [PATCH 6/7] updated docs for custom datasets and measure arg --- docs/usage.rst | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index 98f9f32..2486360 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -24,19 +24,19 @@ Options: * ``-h, --help``: Show help message and exit. * ``--run_id RUN_ID``: Identifier for the run. Will be used as a prefix for all output files. -* ``--path_data PATH_DATA``: Path to the data directory. All data files should be stored in this directory and will be downloaded into this directory. The location of the datasets are resolved by //.csv. If providing raw viability data, the file need to be name _raw.csv instead and --curve_curator needs to be specified for automated curve fitting (see curve_curator option for details). +* ``--path_data PATH_DATA``: Path to the data directory. All data files should be stored in this directory and will be downloaded into this directory. The location of a dataset is resolved by ``<path_data>/<dataset_name>/<dataset_name>.csv``. If providing raw viability data, the file needs to be named ``<dataset_name>_raw.csv`` instead and ``--curve_curator`` needs to be specified for automated curve fitting (see ``--curve_curator`` for details and also check the :ref:`usage:Custom Datasets` section). * ``--models MODELS [MODELS ...]``: List of models to evaluate. For a list of available models, see the :ref:`usage:Available Models` section. * ``--baselines BASELINES [BASELINES ...]``: List of baselines to evaluate. For a list of available baselines, see the :ref:`usage:Available Models` section. * ``--test_mode TEST_MODE [TEST_MODE ...]``: Which tests to run (LPO=Leave-random-Pairs-Out, LCO=Leave-Cell-line-Out, LDO=Leave-Drug-Out). Can be a list of test runs e.g. 'LPO LCO LDO' to run all tests. Default is LPO. For more information, see the :ref:`usage:Available Settings` section. 
* ``--randomization_mode RANDOMIZATION_MODE [RANDOMIZATION_MODE ...]``: Which randomization mode to use. Can be a list of randomization modes e.g. 'SVCC SVCD SVRC SVRD' to run all randomization modes. Default is None. For more information, see the :ref:`usage:Available Randomization Tests` section. * ``--randomization_type RANDOMIZATION_TYPE``: Which randomization type to use. Default is 'permutation'. For more information, see the :ref:`usage:Available Randomization Tests` section. * ``--n_trials_robustness N_TRIALS_ROBUSTNESS``: Number of trials for robustness testing. Default is 0, which means no robustness testing. For more information, see the :ref:`usage:Robustness Test` section. -* ``--dataset_name DATASET_NAME``: Name of the dataset to use. For a list of available datasets, see the :ref:`usage:Available Datasets` section. +* ``--dataset_name DATASET_NAME``: Name of the dataset to use. For a list of available datasets, see the :ref:`usage:Available Datasets` section. For information on how to use custom datasets, see the :ref:`usage:Custom Datasets` section. * ``--cross_study_datasets CROSS_STUDY_DATASETS [CROSS_STUDY_DATASETS ...]``: List of datasets to use for cross-study validation. For a list of available datasets, see the :ref:`usage:Available Datasets` section. * ``--path_out PATH_OUT``: Path to the output directory. All output files will be stored in this directory. -* ``--measure MEASURE``: The name of the measure to predict, can be one of ['LN_IC50']. If curve_curator is True, this measure is appended with "_curvecurator", e.g. "response_curvecurator" to distinguish between measures provided by the original source of a dataset, or the measures fit by CurveCurator. -* ``--curve_curator``: If set, the measure is appended with "_curvecurator". If a custom dataset_name was provided, this will invoke the fitting procedure of raw viability data, which is expected to exist at //_raw.csv. 
The fitted dataset will be stored in the same folder, in a file called .csv -* ``--curve_curator_cores [CORES]``: Number of cores to use for CurveCurator fitting. Only used when curve_curator is set. +* ``--measure MEASURE``: The name of the measure to use, default 'LN_IC50'. If using one of the available datasets (see ``--dataset_name``), this is restricted to one of ['LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response']. This corresponds to the names of the columns that contain these measures in the provided input dataset. If providing a custom dataset, this may differ. If the option ``--curve_curator`` is set, the suffix '_curvecurator' is automatically appended, e.g. 'LN_IC50_curvecurator', to allow using the refit measures instead of the ones originally published for the available datasets, allowing for better dataset comparability (refit measures are already provided in the available datasets or computed as part of the fitting procedure when providing custom raw viability datasets, see ``--curve_curator`` for details). +* ``--curve_curator``: If set, the measure is appended with '_curvecurator'. If a custom dataset_name was provided, this will invoke the fitting procedure of raw viability data, which is expected to exist at ``<path_data>/<dataset_name>/<dataset_name>_raw.csv``. The fitted dataset will be stored in the same folder, in a file called ``<dataset_name>.csv``. Also check the :ref:`usage:Custom Datasets` section. +* ``--curve_curator_cores CORES``: Number of cores to use for CurveCurator fitting. Only used when ``--curve_curator`` is set. * ``--overwrite``: If set, existing files will be overwritten. * ``--optim_metric OPTIM_METRIC``: The metric to optimize for during hyperparameter tuning. Default is 'R^2'. For more information, see the :ref:`usage:Available Metrics` section. * ``--n_cv_splits N_CV_SPLITS``: Number of cross-validation splits. Default is 7. 
@@ -160,12 +160,12 @@ We provide commonly used datasets to evaluate your model on (GDSC1, GDSC2, CCLE, | Toy_Data | 40 | 98 | A toy dataset for testing purposes. | +-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+ -If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure``option is appended with "_curvecurator", e.g. "IC50_curvecurator". +If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure`` option is appended with "_curvecurator", e.g. "IC50_curvecurator". In the provided datasets, these are the measures calculated with the same fitting procedure using CurveCurator. To use the measures reported from the original publications of the dataset, do not set the ``--curve_curator`` option. This however makes it hard to do cross-study comparisons, since the measures may not be directly comparable due to differences in the fitting procedures used by the original authors. -It is therefore recommended to alway use DrEvalPy with the `--curve_curator` option, even when providing your own custom datasets (see next section). +It is therefore recommended to always use DrEvalPy with the ``--curve_curator`` option, even when providing your own custom datasets (see next section). Custom Datasets --------------- @@ -177,7 +177,7 @@ the available datasets in the previous section. * DrEvalPy expects a csv-formatted file in the location ``//_raw.csv`` (corresponding to the ``--path_data`` and ``--dataset_name`` options), which contains the raw viability data in long format with the columns ["dose", "response", "sample", "drug"] and an optional "replicate" column. - If replicates are provided, the procedure will fit one curve for all replicates. 
+ If replicates are provided, the procedure will fit one curve per sample / drug pair using all replicates. * The options ``--curve_curator`` and ``--curve_curator_cores`` must be set. * Available measures are ["AUC", "pEC50", "EC50", "IC50"]. * DrEvalPy provides all results of the fitting in the same folder including the fitted curves in a file folder ``<path_data>/<dataset_name>/<dataset_name>.csv`` @@ -185,7 +185,9 @@ the available datasets in the previous section. **Prefit viability data** * DrEvalPy expects a csv-formatted file in the location ``<path_data>/<dataset_name>/<dataset_name>.csv`` (corresponding to the ``--path_data`` and ``--dataset_name`` options), - with at least the columns ["cell_line_id", "drug_id", "] where measure corresponds to what is provided using the `--measure` option. + with at least the columns ["cell_line_id", "drug_id", "<measure>"] where <measure> is replaced with the name of the measure you provide. +* Available measures depend on the column names and can be provided using the ``--measure`` option. +* If you use the ``--cross_study_datasets`` option, you must use measure names that also work with the available datasets. Available Randomization Tests ----------------------------- From cbf60836b2723cee886a4728dadfd7534a483ab0 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Wed, 4 Dec 2024 17:16:53 +0100 Subject: [PATCH 7/7] ignore errors in ic50_calculation --- drevalpy/datasets/curvecurator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drevalpy/datasets/curvecurator.py b/drevalpy/datasets/curvecurator.py index b7711f7..9db6791 100644 --- a/drevalpy/datasets/curvecurator.py +++ b/drevalpy/datasets/curvecurator.py @@ -126,7 +126,8 @@ def _calc_ic50(model_params_df: pd.DataFrame): """ def ic50(front, back, slope, pec50): - return (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope + with np.errstate(invalid="ignore"): + return (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope front = model_params_df["Front"].values back = model_params_df["Back"].values