Commit 4c007aa
committed Jun 20, 2020
enh: add group variable, regression test, update README
1 parent f283288 commit 4c007aa

6 files changed: +109 −43 lines changed
‎README.md

Lines changed: 50 additions & 21 deletions

@@ -29,22 +29,22 @@ pip install pydra-ml
 
 This repo installs `pydraml` a CLI to allow usage without any programming.
 
 To test the CLI for a classification example, copy the `pydra_ml/tests/data/breast_cancer.csv` and
 `short-spec.json.sample` to a folder and run.
 
 ```
 $ pydraml -s short-spec.json.sample
 ```
-To check a regression example, copy `pydra_ml/tests/data/diabetes_table.csv` and `diabetes_spec.json`
-to a folder and run.
+To check a regression example, copy `pydra_ml/tests/data/diabetes_table.csv` and
+`diabetes_spec.json` to a folder and run.
 
 ```
 $ pydraml -s diabetes_spec.json
 ```
 
-For each case pydra-ml will generate a result folder with the spec file name that includes
-`test-{metric}-{timestamp}.png` file for each metric together with a pickled results file
-containing all the scores from the model evaluations.
+For each case pydra-ml will generate a result folder with the spec file name that
+includes a `test-{metric}-{timestamp}.png` file for each metric together with a
+pickled results file containing all the scores from the model evaluations.
 
 ```
 $ pydraml --help
@@ -82,14 +82,17 @@ will want to generate `x_indices` programmatically.
 group.
 - *x_indices*: Numeric (0-based) or string list of columns to use as input features
 - *target_vars*: String list of target variable (at present only one is supported)
+- *group_var*: String to indicate the column to use for grouping
 - *n_splits*: Number of shuffle split iterations to use
 - *test_size*: Fraction of data to use for test set in each iteration
 - *clf_info*: List of scikit-learn classifiers to use.
 - *permute*: List of booleans to indicate whether to generate a null model or not
 - *gen_shap*: Boolean indicating whether shap values are generated
 - *nsamples*: Number of samples to use for shap estimation
 - *l1_reg*: Type of regularizer to use for shap estimation
-- *plot_top_n_shap*: Number or proportion of top SHAP values to plot (e.g., 16 or 0.1 for top 10%). Set to 1.0 (float) to plot all features or 1 (int) to plot top first feature.
+- *plot_top_n_shap*: Number or proportion of top SHAP values to plot (e.g., 16
+  or 0.1 for the top 10%). Set to 1.0 (float) to plot all features or 1 (int) to
+  plot only the top feature.
 - *metrics*: scikit-learn metric to use
 
 ## `clf_info` specification
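The hunk header above mentions that users "will want to generate `x_indices` programmatically". A minimal sketch of one way to do that from a CSV header, using only the standard library; the column names here are made up for illustration:

```python
import csv
import io

# Hypothetical CSV header; in practice, read the first row of your data file.
header = next(csv.reader(io.StringIO("mean_radius,mean_texture,target\n")))

# 0-based indices of every column except the target column.
x_indices = [i for i, col in enumerate(header) if col != "target"]
print(x_indices)  # [0, 1]
```

The resulting list can be dropped into the `x_indices` field of a spec file instead of typing the indices by hand.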
@@ -113,6 +116,7 @@ then an empty dictionary **MUST** be provided as parameter 3.
 "x_indices": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 "target_vars": ["target"],
+"group_var": null,
 "n_splits": 100,
 "test_size": 0.2,
 "clf_info": [
@@ -140,25 +144,46 @@ then an empty dictionary **MUST** be provided as parameter 3.
 
 ## Output:
 The workflow will output:
-- `results-{timestamp}.pkl` containing 1 list per model used. For example, if assigned to variable `results`, it is accessed through `results[0]` to `results[N]`
-(if `permute: [false,true]` then it will output the model trained on the labels first `results[0]` and the model trained on permuted labels second `results[1]`.
+- `results-{timestamp}.pkl` containing 1 list per model used. For example, if
+  assigned to variable `results`, it is accessed through `results[0]` to `results[N]`
+  (if `permute: [false, true]`, it will output the model trained on the labels
+  first, `results[0]`, and the model trained on permuted labels second, `results[1]`).
 Each model contains:
-  - `dict` accesed through `results[0][0]` with model information: `{'ml_wf.clf_info': ['sklearn.neural_network', 'MLPClassifier', {'alpha': 1, 'max_iter': 1000}], 'ml_wf.permute': False}`
-  - `pydra Result obj` accesed through `results[0][1]` with attribute `output` which itself has attributes:
+  - `dict` accessed through `results[0][0]` with model information:
+    `{'ml_wf.clf_info': ['sklearn.neural_network', 'MLPClassifier',
+    {'alpha': 1, 'max_iter': 1000}], 'ml_wf.permute': False}`
+  - `pydra Result obj` accessed through `results[0][1]` with attribute `output`,
+    which itself has attributes:
     - `feature_names`: from the columns of the data csv.
 And the following attributes organized in N lists for N bootstrapping samples:
     - `output`: N lists, each one with two lists for true and predicted labels.
     - `score`: N lists each one containing M different metric scores.
-    - `shaps`: N lists each one with a list of shape (P,F) where P is the amount of predictions and F the different SHAP values for each feature. `shaps` is empty if `gen_shap` is set to `false` or if `permute` is set to true.
-- One figure per metric with performance distribution across splits (with or without null distribution trained on permuted labels)
+    - `shaps`: N lists, each one with a list of shape (P, F) where P is the
+      number of predictions and F the SHAP values for each feature.
+      `shaps` is empty if `gen_shap` is set to `false` or if `permute` is set
+      to `true`.
+- One figure per metric with the performance distribution across splits (with or
+  without a null distribution trained on permuted labels)
 - `shap-{timestamp}` dir
   - SHAP values are computed for each prediction in each split's test set
-    (e.g., 30 bootstrapping splits with 100 prediction will create (30,100) array). The mean is taken across predictions for each split (e.g., resulting in a (64,30) array for 64 features and 30 bootstrapping samples).
-  - For binary classification, a more accurate display of feature importance obtained by splitting predictions into TP, TN, FP, and FN,
-    which in turn can allow for error auditing (i.e., what a model pays attention to when making incorrect/false predictions)
-  - `quadrant_indexes.pkl`: The TP, TN, FP, FN indexes are saved in as a `dict` with one `key` per model (permuted models without SHAP values will be skipped automatically), and each key `values` being a bootstrapping split.
-  - `summary_values_shap_{model_name}_{prediction_type}.csv` contains all SHAP values and summary statistics ranked by the mean SHAP value across bootstrapping splits. A sample_n column can be empty or NaN if this split did not have the type of prediction in the filename (e.g., you may not have FNs or FPs in a given split with high performance).
-  - `summary_shap_{model_name}_{plot_top_n_shap}.png` contains SHAP value summary statistics for all features (set to 1.0) or only the top N most important features for better visualization.
+    (e.g., 30 bootstrapping splits with 100 predictions will create a (30, 100)
+    array). The mean is taken across predictions for each split (e.g., resulting
+    in a (64, 30) array for 64 features and 30 bootstrapping samples).
+  - For binary classification, a more accurate display of feature importance is
+    obtained by splitting predictions into TP, TN, FP, and FN, which in turn can
+    allow for error auditing (i.e., what a model pays attention to when making
+    incorrect/false predictions)
+  - `quadrant_indexes.pkl`: the TP, TN, FP, and FN indexes are saved as a
+    `dict` with one `key` per model (permuted models without SHAP values will
+    be skipped automatically), and each key's `values` being a bootstrapping split.
+  - `summary_values_shap_{model_name}_{prediction_type}.csv` contains all
+    SHAP values and summary statistics ranked by the mean SHAP value across
+    bootstrapping splits. A `sample_n` column can be empty or NaN if a split
+    did not have the type of prediction in the filename (e.g., you may not
+    have FNs or FPs in a given split with high performance).
+  - `summary_shap_{model_name}_{plot_top_n_shap}.png` contains SHAP value
+    summary statistics for all features (set to 1.0) or only the top N most
+    important features for better visualization.
 
 
 ## Developer installation
@@ -171,10 +196,14 @@ cd pydra-ml
 pip install -e .[dev]
 ```
 
-It is also useful to install pre-commit:
+It is also useful to install pre-commit, which takes care of styling when
+committing code. When pre-commit is used you may have to run git commit twice,
+since pre-commit may make additional changes to your code for styling and will
+not commit these changes by default:
+
 ```
 pip install pre-commit
-pre-commit
+pre-commit install
 ```
 
 ### Project structure
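The nesting described in the Output section of the README diff can be previewed with a plain-Python mock. The structure below mirrors that description; the real object at index `[i][1]` is a pydra `Result`, replaced here by a placeholder:

```python
import pickle

# Mock of the documented structure: one entry per model;
# results[i][0] is the spec dict, results[i][1] the pydra Result (placeholder).
results = [
    (
        {
            "ml_wf.clf_info": [
                "sklearn.neural_network",
                "MLPClassifier",
                {"alpha": 1, "max_iter": 1000},
            ],
            "ml_wf.permute": False,
        },
        None,
    ),
]

# In real use the list comes from the pickle instead:
# results = pickle.load(open("results-<timestamp>.pkl", "rb"))
clf_name = results[0][0]["ml_wf.clf_info"][1]
print(clf_name)  # MLPClassifier
```

This is only a sketch of the indexing pattern; attribute names under `output` (`feature_names`, `score`, `shaps`) should be checked against your installed pydra-ml version.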

‎diabetes_spec.json

Lines changed: 3 additions & 2 deletions

@@ -1,6 +1,7 @@
-{"filename": "./diabetes_table.csv",
+{"filename": "diabetes_table.csv",
 "x_indices": [0,1,2,3,4,5,6,7,8,9],
 "target_vars": ["target"],
+"group_var": null,
 "n_splits": 4,
 "test_size": 0.2,
 "clf_info": [
@@ -14,4 +15,4 @@
 "l1_reg": "aic",
 "plot_top_n_shap": 10,
 "metrics":["explained_variance_score","mean_squared_error","mean_absolute_error"]
-}
+}

‎long-spec.json.sample

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 "x_indices": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 "target_vars": ["target"],
+"group_var": null,
 "n_splits": 100,
 "test_size": 0.2,
 "clf_info": [

‎pydra_ml/classifier.py

Lines changed: 1 addition & 0 deletions

@@ -55,6 +55,7 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
             filename=wf.lzin.filename,
             x_indices=wf.lzin.x_indices,
             target_vars=wf.lzin.target_vars,
+            group=wf.lzin.group_var,
         )
     )
     wf.add(

‎pydra_ml/tests/test_classifier.py

Lines changed: 53 additions & 20 deletions

@@ -1,30 +1,63 @@
 import os
 from ..classifier import gen_workflow, run_workflow
 
-clfs = [
-    ("sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}),
-    ("sklearn.naive_bayes", "GaussianNB", {}),
-]
-csv_file = os.path.join(os.path.dirname(__file__), "data", "breast_cancer.csv")
-inputs = {
-    "filename": csv_file,
-    "x_indices": range(30),
-    "target_vars": ("target",),
-    "n_splits": 2,
-    "test_size": 0.2,
-    "clf_info": clfs,
-    "permute": [True, False],
-    "gen_shap": True,
-    "nsamples": 5,
-    "l1_reg": "aic",
-    "plot_top_n_shap": 16,
-    "metrics": ["roc_auc_score", "accuracy_score"],
-}
-
 
 def test_classifier(tmpdir):
+    clfs = [
+        ("sklearn.neural_network", "MLPClassifier", {"alpha": 1, "max_iter": 1000}),
+        ("sklearn.naive_bayes", "GaussianNB", {}),
+    ]
+    csv_file = os.path.join(os.path.dirname(__file__), "data", "breast_cancer.csv")
+    inputs = {
+        "filename": csv_file,
+        "x_indices": range(30),
+        "target_vars": ("target",),
+        "group_var": None,
+        "n_splits": 2,
+        "test_size": 0.2,
+        "clf_info": clfs,
+        "permute": [True, False],
+        "gen_shap": True,
+        "nsamples": 5,
+        "l1_reg": "aic",
+        "plot_top_n_shap": 16,
+        "metrics": ["roc_auc_score", "accuracy_score"],
+    }
     wf = gen_workflow(inputs, cache_dir=tmpdir)
     results = run_workflow(wf, "cf", {"n_procs": 1})
     assert results[0][0]["ml_wf.clf_info"][1] == "MLPClassifier"
     assert results[0][0]["ml_wf.permute"]
     assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]
+
+
+def test_regressor(tmpdir):
+    clfs = [
+        ("sklearn.neural_network", "MLPRegressor", {"alpha": 1, "max_iter": 1000}),
+        (
+            "sklearn.linear_model",
+            "LinearRegression",
+            {"fit_intercept": True, "normalize": True},
+        ),
+    ]
+    csv_file = os.path.join(os.path.dirname(__file__), "data", "diabetes_table.csv")
+    inputs = {
+        "filename": csv_file,
+        "x_indices": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        "target_vars": ["target"],
+        "group_var": None,
+        "n_splits": 2,
+        "test_size": 0.2,
+        "clf_info": clfs,
+        "permute": [True, False],
+        "gen_shap": True,
+        "nsamples": 5,
+        "l1_reg": "aic",
+        "plot_top_n_shap": 10,
+        "metrics": ["explained_variance_score"],
+    }
+
+    wf = gen_workflow(inputs, cache_dir=tmpdir)
+    results = run_workflow(wf, "cf", {"n_procs": 1})
+    assert results[0][0]["ml_wf.clf_info"][1] == "MLPRegressor"
+    assert results[0][0]["ml_wf.permute"]
+    assert results[0][1].output.score[0][0] < results[1][1].output.score[0][0]

‎short-spec.json.sample

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 "x_indices": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 "target_vars": ["target"],
+"group_var": null,
 "n_splits": 2,
 "test_size": 0.2,
 "clf_info": [

0 commit comments