
Merge pull request #90 from daisybio/fix/allow_custom_dataset_names
Fix argument checking and allow multiple measures
JudithBernett authored Dec 9, 2024
2 parents de1d7ce + cbf6083 commit 3ee0361
Showing 3 changed files with 49 additions and 30 deletions.
20 changes: 11 additions & 9 deletions docs/usage.rst
@@ -24,19 +24,19 @@ Options:
* ``-h, --help``: Show help message and exit.
* ``--run_id RUN_ID``: Identifier for the run. Will be used as a prefix for all output files.
* ``--path_data PATH_DATA``: Path to the data directory. All data files should be stored in this directory and will be downloaded into this directory. The location of the datasets are resolved by <path_data>/<dataset_name>/<dataset_name>.csv. If providing raw viability data, the file need to be name <dataset_name>_raw.csv instead and --curve_curator needs to be specified for automated curve fitting (see curve_curator option for details).
* ``--path_data PATH_DATA``: Path to the data directory. All data files should be stored in this directory and will be downloaded into this directory. The location of the datasets are resolved by ``<path_data>/<dataset_name>/<dataset_name>.csv``. If providing raw viability data, the file needs to be named ``<dataset_name>_raw.csv`` instead and ``--curve_curator`` needs to be specified for automated curve fitting (see ``--curve_curator`` for details and also check the :ref:`usage:Custom Datasets` section).
* ``--models MODELS [MODELS ...]``: List of models to evaluate. For a list of available models, see the :ref:`usage:Available Models` section.
* ``--baselines BASELINES [BASELINES ...]``: List of baselines to evaluate. For a list of available baselines, see the :ref:`usage:Available Models` section.
* ``--test_mode TEST_MODE [TEST_MODE ...]``: Which tests to run (LPO=Leave-random-Pairs-Out, LCO=Leave-Cell-line-Out, LDO=Leave-Drug-Out). Can be a list of test runs e.g. 'LPO LCO LDO' to run all tests. Default is LPO. For more information, see the :ref:`usage:Available Settings` section.
* ``--randomization_mode RANDOMIZATION_MODE [RANDOMIZATION_MODE ...]``: Which randomization mode to use. Can be a list of randomization modes e.g. 'SVCC SVCD SVRC SVRD' to run all randomization modes. Default is None. For more information, see the :ref:`usage:Available Randomization Tests` section.
* ``--randomization_type RANDOMIZATION_TYPE``: Which randomization type to use. Default is 'permutation'. For more information, see the :ref:`usage:Available Randomization Tests` section.
* ``--n_trials_robustness N_TRIALS_ROBUSTNESS``: Number of trials for robustness testing. Default is 0, which means no robustness testing. For more information, see the :ref:`usage:Robustness Test` section.
* ``--dataset_name DATASET_NAME``: Name of the dataset to use. For a list of available datasets, see the :ref:`usage:Available Datasets` section.
* ``--dataset_name DATASET_NAME``: Name of the dataset to use. For a list of available datasets, see the :ref:`usage:Available Datasets` section. For information on how to use custom datasets, see the :ref:`usage:Custom Datasets` section.
* ``--cross_study_datasets CROSS_STUDY_DATASETS [CROSS_STUDY_DATASETS ...]``: List of datasets to use for cross-study validation. For a list of available datasets, see the :ref:`usage:Available Datasets` section.
* ``--path_out PATH_OUT``: Path to the output directory. All output files will be stored in this directory.
* ``--measure MEASURE``: The name of the measure to predict, can be one of ['LN_IC50']. If curve_curator is True, this measure is appended with "_curvecurator", e.g. "response_curvecurator" to distinguish between measures provided by the original source of a dataset, or the measures fit by CurveCurator.
* ``--curve_curator``: If set, the measure is appended with "_curvecurator". If a custom dataset_name was provided, this will invoke the fitting procedure of raw viability data, which is expected to exist at <path_data>/<dataset_name>/<dataset_name>_raw.csv. The fitted dataset will be stored in the same folder, in a file called <dataset_name>.csv
* ``--curve_curator_cores [CORES]``: Number of cores to use for CurveCurator fitting. Only used when curve_curator is set.
* ``--measure MEASURE``: The name of the measure to use, default 'LN_IC50'. If using one of the available datasets (see ``--dataset_name``), this is restricted to one of ['LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response']. This corresponds to the names of the columns that contain these measures in the provided input dataset. If providing a custom dataset, this may differ. If the option ``--curve_curator`` is set, the suffix '_curvecurator' is automatically appended, e.g. 'LN_IC50_curvecurator', so that the refit measures are used instead of the ones originally published for the available datasets, which improves dataset comparability (refit measures are already provided in the available datasets or computed as part of the fitting procedure when providing custom raw viability datasets, see ``--curve_curator`` for details).
* ``--curve_curator``: If set, the measure is appended with '_curvecurator'. If a custom dataset_name was provided, this will invoke the fitting procedure of raw viability data, which is expected to exist at ``<path_data>/<dataset_name>/<dataset_name>_raw.csv``. The fitted dataset will be stored in the same folder, in a file called ``<dataset_name>.csv``. Also check the :ref:`usage:Custom Datasets` section.
* ``--curve_curator_cores CORES``: Number of cores to use for CurveCurator fitting. Only used when ``--curve_curator`` is set.
* ``--overwrite``: If set, existing files will be overwritten.
* ``--optim_metric OPTIM_METRIC``: The metric to optimize for during hyperparameter tuning. Default is 'R^2'. For more information, see the :ref:`usage:Available Metrics` section.
* ``--n_cv_splits N_CV_SPLITS``: Number of cross-validation splits. Default is 7.
@@ -160,12 +160,12 @@ We provide commonly used datasets to evaluate your model on (GDSC1, GDSC2, CCLE,
| Toy_Data | 40 | 98 | A toy dataset for testing purposes. |
+-------------------+-----------------+---------------------+-----------------------------------------------------------------------------------------------------------------------+
If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure``option is appended with "_curvecurator", e.g. "IC50_curvecurator".
If using the ``--curve_curator`` option with these datasets, the desired measure provided with the ``--measure`` option is appended with "_curvecurator", e.g. "IC50_curvecurator".
In the provided datasets, these are the measures calculated with the same fitting procedure using CurveCurator. To use the measures reported from the original publications of the
dataset, do not set the ``--curve_curator`` option.
This however makes it hard to do cross-study comparisons, since the measures may not be directly comparable due to differences in the fitting procedures used by the original authors.
It is therefore recommended to alway use DrEvalPy with the `--curve_curator` option, even when providing your own custom datasets (see next section).
It is therefore recommended to always use DrEvalPy with the ``--curve_curator`` option, even when providing your own custom datasets (see next section).
Custom Datasets
---------------
@@ -177,15 +177,17 @@ the available datasets in the previous section.
* DrEvalPy expects a csv-formatted file in the location ``<path_data>/<dataset>/<dataset_name>_raw.csv`` (corresponding to the ``--path_data`` and ``--dataset_name`` options),
which contains the raw viability data in long format with the columns ["dose", "response", "sample", "drug"] and an optional "replicate" column.
If replicates are provided, the procedure will fit one curve for all replicates.
If replicates are provided, the procedure will fit one curve per sample / drug pair using all replicates.
* The options ``--curve_curator`` and ``--curve_curator_cores`` must be set.
* Available measures are ["AUC", "pEC50", "EC50", "IC50"].
* DrEvalPy writes all results of the fitting to the same folder, including the fitted curves in the file ``<path_data>/<dataset>/<dataset_name>.csv`` (a minimal sketch of preparing the raw input file follows this list).
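
As a minimal sketch of preparing such a raw viability file (the dataset name, cell line, and drug identifiers below are hypothetical; only the documented columns are required):

.. code-block:: python

    from pathlib import Path

    import pandas as pd

    # Long-format raw viability data: one row per dose/response observation.
    raw = pd.DataFrame(
        {
            "dose": [0.01, 0.1, 1.0, 0.01, 0.1, 1.0],
            "response": [0.98, 0.75, 0.12, 0.95, 0.80, 0.20],
            "sample": ["CellLineA"] * 6,
            "drug": ["DrugX"] * 3 + ["DrugY"] * 3,
            "replicate": [1, 1, 1, 1, 1, 1],  # optional column
        }
    )

    # Expected location: <path_data>/<dataset_name>/<dataset_name>_raw.csv
    Path("data/MyDataset").mkdir(parents=True, exist_ok=True)
    raw.to_csv("data/MyDataset/MyDataset_raw.csv", index=False)
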
**Prefit viability data**
* DrEvalPy expects a csv-formatted file in the location ``<path_data>/<dataset>/<dataset_name>.csv`` (corresponding to the ``--path_data`` and ``--dataset_name`` options),
with at least the columns ["cell_line_id", "drug_id", <measure>"] where measure corresponds to what is provided using the `--measure` option.
with at least the columns ["cell_line_id", "drug_id", "<measure>"], where <measure> is replaced with the name of the measure you provide.
* Available measures depend on the column names and can be provided using the ``--measure`` option.
* If you use the ``--cross_study_datasets`` option, you must use measure names that also work with the available datasets (see the sketch after this list).
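
A prefit dataset could be prepared analogously (again a sketch with hypothetical identifiers; the measure column name must match what is passed via ``--measure``):

.. code-block:: python

    from pathlib import Path

    import pandas as pd

    prefit = pd.DataFrame(
        {
            "cell_line_id": ["CellLineA", "CellLineA", "CellLineB"],
            "drug_id": ["DrugX", "DrugY", "DrugX"],
            "LN_IC50": [-2.3, 0.7, -1.1],  # column name must match --measure
        }
    )

    # Expected location: <path_data>/<dataset_name>/<dataset_name>.csv
    Path("data/MyPrefitDataset").mkdir(parents=True, exist_ok=True)
    prefit.to_csv("data/MyPrefitDataset/MyPrefitDataset.csv", index=False)
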
Available Randomization Tests
-----------------------------
13 changes: 7 additions & 6 deletions drevalpy/datasets/curvecurator.py
@@ -126,12 +126,13 @@ def _calc_ic50(model_params_df: pd.DataFrame):
"""

def ic50(front, back, slope, pec50):
return (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope
with np.errstate(invalid="ignore"):
return (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope

front = model_params_df["Front"].values
back = model_params_df["Back"].values
slope = model_params_df["Slope"].values
pec50 = model_params_df["pEC50"].values
pec50 = model_params_df["pEC50_curvecurator"].values

model_params_df["IC50_curvecurator"] = ic50(front, back, slope, pec50)

@@ -169,19 +170,19 @@ def postprocess(output_folder: str | Path, dataset_name: str):
This function reads the curves.txt file created by CurveCurator, which contains the
fitted curve parameters and postprocesses it to be used by drevalpy.
:param output_folder: Path to the output folder of CurveCurator containin the curves.txt file.
:param output_folder: Path to the output folder of CurveCurator containing the curves.txt file.
:param dataset_name: The name of the dataset, will be used to prepend the postprocessed <dataset_name>.csv file
"""
output_folder = Path(output_folder)
required_columns = {
"Name": "Name",
"pEC50": "pEC50",
"pEC50": "pEC50_curvecurator",
"pEC50 Error": "pEC50Error",
"Curve Slope": "Slope",
"Curve Front": "Front",
"Curve Back": "Back",
"Curve Fold Change": "FoldChange",
"Curve AUC": "AUC",
"Curve AUC": "AUC_curvecurator",
"Curve R2": "R2",
"Curve P_Value": "pValue",
"Curve Relevance Score": "RelevanceScore",
@@ -197,7 +198,7 @@ def postprocess(output_folder: str | Path, dataset_name: str):
)
fitted_curve_data[["cell_line_id", "drug_id"]] = fitted_curve_data.Name.str.split("|", expand=True)
fitted_curve_data["EC50_curvecurator"] = np.power(
10, -fitted_curve_data["pEC50"].values
10, -fitted_curve_data["pEC50_curvecurator"].values
) # in CurveCurator 10^-pEC50 = EC50
_calc_ic50(fitted_curve_data)
fitted_curve_data.to_csv(output_folder / f"{dataset_name}.csv", index=None)
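
# Illustration only, not part of the commit: a standalone sketch of the conversions
# applied above and one plausible reason for the np.errstate(invalid="ignore") guard.
# The fitted parameter values below are made up.
import numpy as np

front = np.array([1.0, 0.4])  # second entry: front < back, i.e. a poorly fit curve
back = np.array([0.0, 0.9])
slope = np.array([1.0, 1.0])
pec50 = np.array([6.0, 6.0])

ec50 = np.power(10, -pec50)  # in CurveCurator 10^-pEC50 = EC50 -> [1e-06, 1e-06]

with np.errstate(invalid="ignore"):
    # Same expression as _calc_ic50: the negative ratio in the second entry gives NaN
    # without emitting a RuntimeWarning, because the errstate context suppresses it.
    ic50 = (np.log10((front - back) / (0.5 + back)) - slope * pec50) / slope
# ic50 -> [about -5.7, nan]
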
46 changes: 31 additions & 15 deletions drevalpy/utils.py
@@ -1,7 +1,7 @@
"""Utility functions for the evaluation pipeline."""

import argparse
import os
from pathlib import Path
from typing import Optional

from sklearn.base import TransformerMixin
@@ -186,6 +186,7 @@ def check_arguments(args) -> None:
:param args: arguments passed from the command line
:raises AssertionError: if any of the arguments is invalid
:raises ValueError: if the number of cross-validation splits or curve_curator_cores is less than 1
:raises FileNotFoundError: if a custom dataset name was specified and the input file could not be found.
"""
if not args.models:
raise AssertionError("At least one model must be specified")
@@ -205,13 +206,28 @@ def check_arguments(args) -> None:
f"want to use your own baseline, you need to implement a new model class and add it to "
f"the MODEL_FACTORY in the models init"
)

if args.dataset_name not in AVAILABLE_DATASETS:
raise AssertionError(
f"Invalid dataset name. Available datasets are {list(AVAILABLE_DATASETS.keys())} "
f"If you want to use your own dataset, you need to implement a new response dataset loader "
f"and add it to the AVAILABLE_DATASETS in the response_datasets init"
)
if args.curve_curator:
expected_custom_input = Path(args.path_data).absolute() / args.dataset_name / f"{args.dataset_name}_raw.csv"
if not expected_custom_input.is_file():
raise FileNotFoundError(
"You specified the curve_curator option with a custom dataset name which requires raw "
f"viability data to be located at {expected_custom_input} but the file does not exist. "
"Please check the 'path_data' and 'dataset_name' arguments and ensure the raw viability "
"input file is located at <path_data>/<dataset_name>/<dataset_name>_raw.csv."
)
else:
expected_custom_input = Path(args.path_data).absolute() / args.dataset_name / f"{args.dataset_name}.csv"
if not expected_custom_input.is_file():
raise FileNotFoundError(
"You specified a custom dataset name which requires prefit curve data to be located at "
f"{expected_custom_input} but the file does not exist. Please check the 'path_data' and "
"'dataset_name' arguments and ensure the prefit curve data is located at input file is "
"located at <path_data>/<dataset_name>/<dataset_name>.csv."
)
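
# Illustration only, not part of the commit: the two file locations the checks above
# expect for a custom dataset, depending on whether --curve_curator is set.
# The values below are hypothetical.
from pathlib import Path

path_data, dataset_name, curve_curator = "data", "MyDataset", True
suffix = "_raw.csv" if curve_curator else ".csv"  # raw viability vs. prefit curves
expected = Path(path_data).absolute() / dataset_name / f"{dataset_name}{suffix}"
print(expected, expected.is_file())
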

if args.curve_curator and args.curve_curator_cores < 1:
raise ValueError("Number of cores for CurveCurator must be greater than 0.")

for dataset in args.cross_study_datasets:
if dataset not in AVAILABLE_DATASETS:
@@ -223,7 +239,7 @@ def check_arguments(args) -> None:
)

# if the path to args.path_data does not exist, create the directory
os.makedirs(args.path_data, exist_ok=True)
Path(args.path_data).mkdir(parents=True, exist_ok=True)

if args.n_cv_splits <= 1:
raise ValueError("Number of cross-validation splits must be greater than 1.")
@@ -241,12 +257,13 @@ def check_arguments(args) -> None:
if args.n_trials_robustness < 0:
raise ValueError("Number of trials for robustness test must be greater than or equal to 0")

if args.curve_curator:
if args.curve_curator_cores < 1:
raise ValueError("Number of cores for CurveCurator must be greater than 0.")

if args.measure not in ["LN_IC50", "response"]:
raise ValueError("Only 'LN_IC50' and 'response' are currently available as a drug response measure.")
allowed_measures = ["LN_IC50", "EC50", "IC50", "pEC50", "AUC", "response"]
allowed_measures.extend([f"{m}_curvecurator" for m in allowed_measures])
if args.measure not in allowed_measures:
raise ValueError(
"Only 'LN_IC50', 'EC50', 'IC50', 'pEC50', 'AUC', 'response' or their equivalents including "
"the '_curvecurator' suffix are allowed drug response measures."
)
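
# Illustration only, not part of the commit: the list comprehension above is evaluated
# before extend() mutates the list, so twelve measure names are allowed in total.
base = ["LN_IC50", "EC50", "IC50", "pEC50", "AUC", "response"]
base.extend([f"{m}_curvecurator" for m in base])
print(len(base))  # 12, e.g. 'LN_IC50' and 'LN_IC50_curvecurator'
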

if args.response_transformation not in ["None", "standard", "minmax", "robust"]:
raise AssertionError("Invalid response_transformation. Choose from None, standard, minmax, robust")
@@ -264,7 +281,6 @@ def main(args) -> None:
:param args: passed from command line
"""
check_arguments(args)

# PIPELINE: LOAD_RESPONSE
response_data, cross_study_datasets = get_datasets(
dataset_name=args.dataset_name,
