diff --git a/folktexts/acs/acs_questions.py b/folktexts/acs/acs_questions.py
index 57790de..b986c7b 100755
--- a/folktexts/acs/acs_questions.py
+++ b/folktexts/acs/acs_questions.py
@@ -6,7 +6,7 @@
 from folktexts.qa_interface import MultipleChoiceQA as _MultipleChoiceQA
 
 from . import acs_columns
-from .acs_tasks import _acs_columns_map
+from .acs_tasks import acs_columns_map
 
 # Map of numeric ACS questions
 acs_numeric_qa_map: dict[str, object] = {
@@ -25,7 +25,7 @@
 # ... include all multiple-choice questions defined in the column descriptions
 acs_multiple_choice_qa_map.update({
     col_to_text.name: col_to_text.question
-    for col_to_text in _acs_columns_map.values()
+    for col_to_text in acs_columns_map.values()
     if (
         isinstance(col_to_text, ColumnToText)
         and col_to_text._question is not None
diff --git a/folktexts/acs/acs_tasks.py b/folktexts/acs/acs_tasks.py
index 1d2f709..2797e2d 100755
--- a/folktexts/acs/acs_tasks.py
+++ b/folktexts/acs/acs_tasks.py
@@ -23,7 +23,7 @@
 )
 
 # Map of ACS column names to ColumnToText objects
-_acs_columns_map: dict[str, object] = {
+acs_columns_map: dict[str, object] = {
     col_mapper.name: col_mapper
     for col_mapper in acs_columns.__dict__.values()
     if isinstance(col_mapper, _ColumnToText)
@@ -37,13 +37,39 @@ class ACSTaskMetadata(TaskMetadata):
     # The ACS task object from the folktables package
     folktables_obj: BasicProblem = None
 
+    @classmethod
+    def make_task(
+        cls,
+        name: str,
+        description: str,
+        features: list[str],
+        target: str,
+        target_threshold: Threshold = None,
+        sensitive_attribute: str = None,
+    ) -> ACSTaskMetadata:
+        # Validate columns mappings exist
+        if not all(col in acs_columns_map for col in (features + [target])):
+            raise ValueError("Not all columns have mappings to text descriptions.")
+
+        # TODO: CHECK IF THIS WORKS!
+        return cls(
+            name=name,
+            description=description,
+            features=features,
+            target=target,
+            cols_to_text=acs_columns_map,
+            target_threshold=target_threshold,
+            sensitive_attribute=sensitive_attribute,
+            folktables_obj=None,
+        )
+
     @classmethod
     def make_folktables_task(
         cls,
         name: str,
         description: str,
         target_threshold: Threshold = None,
-    ) -> "ACSTaskMetadata":
+    ) -> ACSTaskMetadata:
 
         # Get the task object from the folktables package
         try:
@@ -56,7 +82,7 @@ def make_folktables_task(
             description=description,
             features=folktables_task.features,
             target=folktables_task.target,
-            cols_to_text=_acs_columns_map,
+            cols_to_text=acs_columns_map,
             sensitive_attribute=folktables_task.group,
             target_threshold=target_threshold,
             folktables_obj=folktables_task,
@@ -125,6 +151,6 @@ def __hash__(self) -> int:
         *acs_travel_time_task.features,
     })),
     target="HINS2",
-    cols_to_text=_acs_columns_map,
+    cols_to_text=acs_columns_map,
     target_threshold=acs_health_insurance_threshold,
 )
diff --git a/folktexts/benchmark.py b/folktexts/benchmark.py
index 86b5550..de76784 100755
--- a/folktexts/benchmark.py
+++ b/folktexts/benchmark.py
@@ -218,6 +218,7 @@ def run(self, results_root_dir: str | Path, fit_threshold: int | bool = 0) -> fl
             predictions_save_path=test_predictions_save_path,
             labels=y_test,  # used only to save alongside predictions in disk
         )
+        self._y_test_scores = self.llm_clf._get_positive_class_scores(self._y_test_scores)
 
         # If requested, fit the threshold on a small portion of the train set
         if fit_threshold:
diff --git a/folktexts/classifier.py b/folktexts/classifier.py
index 407533a..49dacfd 100755
--- a/folktexts/classifier.py
+++ b/folktexts/classifier.py
@@ -138,9 +138,9 @@ def fit(self, X, y, *, false_pos_cost=1.0, false_neg_cost=1.0, **kwargs):
         """Uses the provided data sample to fit the prediction threshold."""
 
         # Compute risk estimates for the data
-        y_pred_scores = self.predict_proba(X, **kwargs)
-        if len(y_pred_scores.shape) > 1:
-            y_pred_scores = y_pred_scores[:, -1]
+        y_pred_scores = self._get_positive_class_scores(
+            self.predict_proba(X, **kwargs)
+        )
 
         # Compute the best threshold for the given data
         self.threshold = compute_best_threshold(
@@ -172,7 +172,7 @@ def _make_predictions_multiclass(pos_class_scores: np.ndarray) -> np.ndarray:
 
     def predict(
         self,
-        data: pd.DataFrame | Dataset,
+        data: pd.DataFrame,
         batch_size: int = None,
         context_size: int = None,
         predictions_save_path: str | Path = None,
@@ -186,13 +186,7 @@ def predict(
             predictions_save_path=predictions_save_path,
             labels=labels,
         )
-        if isinstance(risk_scores, dict):
-            return {
-                data_type: (self._get_positive_class_scores(data_scores) >= self.threshold).astype(int)
-                for data_type, data_scores in risk_scores.items()
-            }
-        else:
-            return (self._get_positive_class_scores(risk_scores) >= self.threshold).astype(int)
+        return (self._get_positive_class_scores(risk_scores) >= self.threshold).astype(int)
 
     def _load_predictions_from_disk(
         self,
diff --git a/folktexts/cli/launch_experiments.py b/folktexts/cli/launch_experiments_htcondor.py
similarity index 90%
rename from folktexts/cli/launch_experiments.py
rename to folktexts/cli/launch_experiments_htcondor.py
index 0e8b934..d6e08dc 100755
--- a/folktexts/cli/launch_experiments.py
+++ b/folktexts/cli/launch_experiments_htcondor.py
@@ -35,9 +35,7 @@
 
 # Path to the executable script to run
 # EXECUTABLE_PATH = Path(__file__).parent.resolve() / "run_acs_benchmark.py"
-EXECUTABLE_PATH = Path(__file__).parent.resolve() / "eval_feature_importance.py"
-# TODO ^ pass executable path as cmd line arg
-logging.warning(f"Using executable path: {EXECUTABLE_PATH}")
+# EXECUTABLE_PATH = Path(__file__).parent.resolve() / "eval_feature_importance.py"
 
 ##################
 # Global configs #
@@ -76,7 +74,8 @@
 
 
 # Function that defines common settings among all LLM-as-clf experiments
-def make_llm_as_clf_experiment(
+def make_llm_clf_experiment(
+    executable_path: str,
     model_name: str,
     task: str,
     results_root_dir: str,
@@ -111,7 +110,7 @@ def make_llm_as_clf_experiment(
 
     # Define experiment
     exp = Experiment(
-        executable_path=EXECUTABLE_PATH.as_posix(),
+        executable_path=executable_path,
         kwargs=dict(
             model=model_path,
             task=task,
@@ -146,6 +145,13 @@ def setup_arg_parser() -> argparse.ArgumentParser:
     # Init parser
     parser = argparse.ArgumentParser(description="Launch experiments to evaluate LLMs as classifiers.")
 
+    parser.add_argument(
+        "--executable-path",
+        type=str,
+        help="[string] Path to the executable script to run.",
+        required=True,
+    )
+
     parser.add_argument(
         "--results-root-dir",
         type=str,
@@ -194,11 +200,13 @@ def main():
     # Parse extra kwargs
     from ._utils import cmd_line_args_to_kwargs
     extra_kwargs = cmd_line_args_to_kwargs(extra_kwargs)
-    # TODO: use the run_acs_benchmark.py parser to parse extra kwargs
-    #       with `setup_arg_parser().convert_arg_line_to_args(extra_kwargs)` !!!
 
+    # Prepare command-line arguments
     models = args.model or LLM_MODELS
     tasks = args.task or ACS_TASKS
+    executable_path = Path(args.executable_path).resolve()
+    if not executable_path.exists() or not executable_path.is_file():
+        raise FileNotFoundError(f"Executable script not found at '{executable_path}'.")
 
     # Load experiment from JSON file if provided
     if args.experiment_json:
@@ -209,7 +217,8 @@ def main():
     # Otherwise, run all experiments planned
     else:
         all_experiments = [
-            make_llm_as_clf_experiment(
+            make_llm_clf_experiment(
+                executable_path=executable_path.as_posix(),
                 model_name=model,
                 task=task,
                 results_root_dir=args.results_root_dir,
diff --git a/folktexts/task.py b/folktexts/task.py
index 74128eb..a04175d 100755
--- a/folktexts/task.py
+++ b/folktexts/task.py
@@ -104,7 +104,11 @@ def create_task_with_feature_subset(self, feature_subset: Iterable[str]):
 
         # Check if features are a subset of the original features
         if not set(feature_subset).issubset(self.features):
-            raise ValueError("`feature_subset` must be a subset of the original features.")
+            raise ValueError(
+                f"`feature_subset` must be a subset of the original features; "
+                f"following features are not in the original set: "
+                f"{set(feature_subset) - set(self.features)}"
+            )
 
         # Return new TaskMetadata object
         return dataclasses.replace(
diff --git a/requirements/main.txt b/requirements/main.txt
index f60fbc8..f5b54a6 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -1,8 +1,8 @@
 folktables~=0.0.12
+scikit-learn>=1.3
 numpy
 pandas
 tqdm
-scikit-learn
 accelerate
 transformers
 torch