From 3a8aa792fe5fcc270623cc28fdeff8cb43150900 Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Fri, 6 Dec 2024 15:39:43 -0500
Subject: [PATCH 1/4] Packaging mode for studies

---
 cumulus_library/actions/builder.py            | 203 +++++++++++++++---
 cumulus_library/builders/valueset/umls.py     |   6 +-
 cumulus_library/cli.py                        |  44 +++-
 cumulus_library/cli_parser.py                 |   7 +-
 tests/conftest.py                             |   8 +
 tests/test_athena.py                          |   5 -
 tests/test_cli.py                             |  95 +++++++-
 .../reference_sql/counts.sql                  |   9 +
 .../reference_sql/meta.sql                    |   9 +
 .../study_python_counts_valid/module1.py      |   4 +-
 .../study_python_counts_valid/module2.py      |   2 +-
 tests/test_databases.py                       |  26 +--
 tests/test_discovery.py                       |   4 -
 tests/test_duckdb.py                          |   5 -
 tests/test_dynamic_manifest.py                |  16 --
 tests/test_static_file.py                     |   7 -
 tests/test_umls_api.py                        |  17 --
 .../valueset/test_additional_rules_builder.py |   5 -
 .../valueset/test_rxnorm_valueset_builder.py  |   5 -
 tests/valueset/test_umls.py                   |   7 -
 tests/valueset/test_valueset_builder.py       |   5 -
 21 files changed, 327 insertions(+), 162 deletions(-)
 create mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/counts.sql
 create mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/meta.sql

diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
index 119463a9..b1aa2028 100644
--- a/cumulus_library/actions/builder.py
+++ b/cumulus_library/actions/builder.py
@@ -7,11 +7,13 @@
 import re
 import sys
 import tomllib
+import zipfile
 
 from rich.progress import Progress, TaskID
 
 from cumulus_library import (
     BaseTableBuilder,
+    CountsBuilder,
     base_utils,
     databases,
     enums,
@@ -40,7 +42,10 @@ def _load_and_execute_builder(
     db_parser: databases.DatabaseParser = None,
     write_reference_sql: bool = False,
     doc_str: str | None = None,
-) -> None:
+    prepare: bool = False,
+    data_path: pathlib.Path | None = None,
+    query_count: int | None = None,
+) -> int:
     """Loads a table builder from a file.
 
     :param config: a StudyConfig object
@@ -49,6 +54,10 @@ def _load_and_execute_builder(
     :keyword db_parser: an object implementing DatabaseParser for the target database
     :keyword write_reference_sql: if true, writes sql to disk inside a study's directory
     :keyword doc_str: A string to insert between queries written to disk
+    :keyword prepare: If true, will render query instead of executing
+    :keyword data_path: If prepare is true, the path to write rendered data to
+    :keyword query_count: if prepare is true, the number of queries already rendered
+    :returns: number of processed queries
     """
 
     # Since we have to support arbitrary user-defined python files here, we
@@ -79,8 +88,7 @@ def _load_and_execute_builder(
             f"Error loading {manifest._study_path}{filename}\n"
             "Custom builders must extend the BaseTableBuilder class."
         )
-
-    # Remove instances of intermediate classes, if present
+    # Remove instances of intermediate classes, if present (usually not)
     table_builder_subclasses = list(
         filter(lambda x: x.__name__ != "CountsBuilder", table_builder_subclasses)
     )
@@ -89,7 +97,10 @@ def _load_and_execute_builder(
     # remove it so it doesn't interfere with the next python module to
     # execute, since the subclass would otherwise hang around.
     table_builder_class = table_builder_subclasses[0]
-    table_builder = table_builder_class()
+    if issubclass(table_builder_class, CountsBuilder):
+        table_builder = table_builder_class(study_prefix=manifest.get_study_prefix())
+    else:
+        table_builder = table_builder_class()
     if write_reference_sql:
         prefix = manifest.get_study_prefix()
         table_builder.prepare_queries(config=config, manifest=manifest, parser=db_parser)
@@ -105,19 +116,30 @@ def _load_and_execute_builder(
         table_builder.write_queries(
             path=pathlib.Path(f"{manifest._study_path}/reference_sql/" + new_filename)
         )
+    elif prepare:
+        table_builder.prepare_queries(
+            config=config,
+            manifest=manifest,
+            parser=db_parser,
+        )
+        _render_output(
+            manifest.get_study_prefix(), table_builder.queries, data_path, filename, query_count
+        )
     else:
         table_builder.execute_queries(
             config=config,
             manifest=manifest,
             parser=db_parser,
         )
-
+    num_queries = len(table_builder.queries)
     # After running the executor code, we'll remove
     # it so it doesn't interfere with the next python module to
     # execute, since the subclass would otherwise hang around.
     del sys.modules[table_builder_module.__name__]
     del table_builder_module
 
+    return num_queries
+
 
 def run_protected_table_builder(
     config: base_utils.StudyConfig,
@@ -136,13 +158,31 @@ def run_protected_table_builder(
 
 
 def _run_workflow(
-    config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
-) -> None:
+    config: base_utils.StudyConfig,
+    manifest: study_manifest.StudyManifest,
+    filename: str,
+    prepare: str,
+    data_path: pathlib.Path,
+    query_count: int,
+) -> int:
     """Loads workflow config from toml definitions and executes workflow
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
+    :param filename: Filename of the workflow config
+    :param prepare: If true, will render query instead of executing
+    :param data_path: If prepare is true, the path to write rendered data to
+    :param query_count: if prepare is true, the number of queries already rendered
+    :returns: a count of processed queries
     """
+    toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
+    if prepare:
+        with open(toml_path) as file:
+            workflow_config = file.read()
+        _render_output(
+            manifest.get_study_prefix(), [workflow_config], data_path, filename, query_count, True
+        )
+        return 1
     existing_stats = []
     if not config.stats_build:
         existing_stats = (
@@ -157,14 +197,13 @@ def _run_workflow(
     # but we're letting it slide so that builders function similarly
     # across the board
     safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
-    toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
     with open(toml_path, "rb") as file:
         workflow_config = tomllib.load(file)
         config_type = workflow_config["config_type"]
         target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", ""))
 
     if (target_table,) in existing_stats and not config.stats_build:
-        return
+        return 0
     match config_type:
         case "psm":
             builder = psm_builder.PsmBuilder(
@@ -195,6 +234,7 @@ def _run_workflow(
             table_name=f"{target_table}_{safe_timestamp}",
             view_name=target_table,
         )
+    return len(builder.queries)
 
 
 def build_matching_files(
@@ -203,13 +243,23 @@ def build_matching_files(
     *,
     builder: str | None,
     db_parser: databases.DatabaseParser = None,
+    prepare: bool = False,
+    data_path: pathlib.Path,
 ):
     """targets all table builders matching a target string for running
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
     :keyword builder: filename of a module implementing a TableBuilder
-    :keyword db_parser: an object implementing DatabaseParser for the target database"""
+    :keyword db_parser: an object implementing DatabaseParser for the target database
+    :keyword prepare: If true, will render query instead of executing
+    :keyword data_path: If prepare is true, the path to write rendered data to
+    """
+    if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]:
+        sys.exit(
+            f"{manifest.get_study_prefix()} does not support prepare mode. It must be run "
+            "directly against a target database."
+        )
     all_generators = manifest.get_all_generators()
     matches = []
     if not builder:  # pragma: no cover
@@ -218,7 +268,14 @@ def build_matching_files(
         for file in all_generators:
             if file.find(builder) != -1:
                 matches.append(file)
-    build_study(config, manifest, db_parser=db_parser, file_list=matches)
+    build_study(
+        config,
+        manifest,
+        db_parser=db_parser,
+        file_list=matches,
+        prepare=prepare,
+        data_path=data_path,
+    )
 
 
 def build_study(
@@ -228,35 +285,87 @@ def build_study(
     db_parser: databases.DatabaseParser = None,
     continue_from: str | None = None,
     file_list: list | None = None,
-) -> list:
+    prepare: bool = False,
+    data_path: pathlib.Path | None = None,
+) -> None:
     """Creates tables in the schema by iterating through the sql_config.file_names
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
+    :keyword db_parser: a parser for the target database
     :keyword continue_from: Name of a file to resume table creation from
-    :returns: loaded queries (for unit testing only)
+    :keyword prepare: If true, will render query instead of executing
+    :keyword data_path: If prepare is true, the path to write rendered data to
     """
+    if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]:
+        sys.exit(
+            f"{manifest.get_study_prefix()} does not support prepare mode. It must be run "
+            "directly against a target database."
+        )
     if file_list is None:
         file_list = manifest.get_file_list(continue_from)
+    if prepare:
+        data_dir = data_path / manifest.get_study_prefix()
+        files = data_dir.glob("*")
+        [file.unlink() for file in files if file.is_file()]
+    query_count = 0
     for file in file_list:
         if file.endswith(".py"):
-            _load_and_execute_builder(
+            query_count += _load_and_execute_builder(
                 config=config,
                 manifest=manifest,
                 filename=file,
                 db_parser=db_parser,
+                prepare=prepare,
+                data_path=data_path,
+                query_count=query_count,
             )
         elif file.endswith(".toml"):
-            _run_workflow(config=config, manifest=manifest, filename=file)
+            query_count += _run_workflow(
+                config=config,
+                manifest=manifest,
+                filename=file,
+                prepare=prepare,
+                data_path=data_path,
+                query_count=query_count,
+            )
         elif file.endswith(".sql"):
-            _run_raw_queries(config=config, manifest=manifest, filename=file)
+            query_count += _run_raw_queries(
+                config=config,
+                manifest=manifest,
+                filename=file,
+                prepare=prepare,
+                data_path=data_path,
+                query_count=query_count,
+            )
         else:
             raise errors.StudyManifestParsingError
+    if prepare:
+        with zipfile.ZipFile(
+            f"{data_path}/{manifest.get_study_prefix()}.zip", "w", zipfile.ZIP_DEFLATED
+        ) as z:
+            for file in data_dir.glob("*"):
+                z.write(file, file.relative_to(data_dir))
 
 
 def _run_raw_queries(
-    config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
-):
+    config: base_utils.StudyConfig,
+    manifest: study_manifest.StudyManifest,
+    filename: str,
+    prepare: bool,
+    data_path: pathlib.Path,
+    query_count: int,
+) -> int:
+    """Creates tables in the schema by iterating through the sql_config.file_names
+
+    :param config: a StudyConfig object
+    :param manifest: a StudyManifest object
+    :param filename: the name of the sql file to read
+    :param prepare: If true, will render query instead of executing
+    :param data_path: If prepare is true, the path to write rendered data to
+    :param query_count: the number of queries currently processed
+    :returns number of queries processed:
+    """
     queries = []
     for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")):
         queries.append([query, filename])
@@ -266,25 +375,49 @@ def _run_raw_queries(
             f"`{manifest.get_study_prefix()}__",
             "`",
         )
-    # We'll explicitly create a cursor since recreating cursors for each
-    # table in a study is slightly slower for some databases
-    cursor = config.db.cursor()
-    # We want to only show a progress bar if we are :not: printing SQL lines
-    with base_utils.get_progress_bar(disable=config.verbose) as progress:
-        task = progress.add_task(
-            f"Building tables from {filename}...",
-            total=len(queries),
-            visible=not config.verbose,
+    if prepare:
+        _render_output(
+            manifest.get_study_prefix(), [q[0] for q in queries], data_path, filename, query_count
         )
-        _execute_build_queries(
-            config=config,
-            manifest=manifest,
-            cursor=cursor,
-            queries=queries,
-            progress=progress,
-            task=task,
+    else:
+        # We'll explicitly create a cursor since recreating cursors for each
+        # table in a study is slightly slower for some databases
+        cursor = config.db.cursor()
+        # We want to only show a progress bar if we are :not: printing SQL lines
+        with base_utils.get_progress_bar(disable=config.verbose) as progress:
+            task = progress.add_task(
+                f"Building tables from {filename}...",
+                total=len(queries),
+                visible=not config.verbose,
+            )
+            _execute_build_queries(
+                config=config,
+                manifest=manifest,
+                cursor=cursor,
+                queries=queries,
+                progress=progress,
+                task=task,
+            )
+    return len(queries)
+
+
+def _render_output(study_name, outputs, data_path, filename, count, is_toml=False):
+    if is_toml:
+        suffix = "toml"
+    else:
+        suffix = "sql"
+    for output in enumerate(outputs):
+        if is_toml:
+            name = "config"
+        else:
+            name = re.search(r"(.*)__\w*", output[1])[0].lower().replace(" ", "_")
+        file_path = data_path / (
+            f"{study_name}/{count+output[0]:04d}.{filename.rsplit('.', 1)[0]}"
+            f".{output[0]:02d}.{name}.{suffix}"
         )
-    return queries
+        file_path.parent.mkdir(exist_ok=True, parents=True)
+        with open(file_path, "w") as f:
+            f.write(output[1])
 
 
 def _query_error(
diff --git a/cumulus_library/builders/valueset/umls.py b/cumulus_library/builders/valueset/umls.py
index 966452a8..a6cc8c36 100644
--- a/cumulus_library/builders/valueset/umls.py
+++ b/cumulus_library/builders/valueset/umls.py
@@ -92,10 +92,12 @@ def generate_umls_tables(
                     f"SELECT count(*) from {study_prefix}{table_prefix}umls_valuesets_rels"  # noqa: S608
                 ).fetchone()[0]
 
-    cursor.execute(f"""CREATE TABLE {study_prefix}{table_prefix}umls_valuesets AS
+    cursor.execute(
+        f"""CREATE TABLE {study_prefix}{table_prefix}umls_valuesets AS 
 SELECT distinct r.rxcui, v.*
 FROM {study_prefix}{table_prefix}umls_valuesets_rels as v, rxnorm.RXNCONSO as r
 WHERE v.code = r.code
 AND v.sab=r.sab
 --AND v.tty=r.tty
-""")  # noqa: S608
+"""  # noqa: S608
+    )
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
index f2cd0b71..33e61711 100755
--- a/cumulus_library/cli.py
+++ b/cumulus_library/cli.py
@@ -93,17 +93,24 @@ def clean_and_build_study(
         *,
         options: dict[str, str],
         continue_from: str | None = None,
+        prepare: bool = False,
+        data_path: pathlib.Path | None = None,
     ) -> None:
         """Recreates study views/tables
 
         :param target: A path to the study directory
-        :param options: The dictionary of study-specific options
+        :keyword options: The dictionary of study-specific options
         :keyword continue_from: Restart a run from a specific sql file (for dev only)
+        :keyword prepare: If true, will render query instead of executing
+        :keyword data_path: If prepare is true, the path to write rendered data to
         """
         manifest = study_manifest.StudyManifest(target, self.data_path, options=options)
         try:
-            builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest)
-            if not continue_from:
+            if not prepare:
+                builder.run_protected_table_builder(
+                    config=self.get_config(manifest), manifest=manifest
+                )
+            if not continue_from and not prepare:
                 log_utils.log_transaction(
                     config=self.get_config(manifest),
                     manifest=manifest,
@@ -114,7 +121,7 @@ def clean_and_build_study(
                     manifest=manifest,
                 )
 
-            else:
+            elif not prepare:
                 log_utils.log_transaction(
                     config=self.get_config(manifest),
                     manifest=manifest,
@@ -125,12 +132,15 @@ def clean_and_build_study(
                 config=self.get_config(manifest),
                 manifest=manifest,
                 continue_from=continue_from,
+                data_path=data_path,
+                prepare=prepare,
             )
-            log_utils.log_transaction(
-                config=self.get_config(manifest),
-                manifest=manifest,
-                status=enums.LogStatuses.FINISHED,
-            )
+            if not prepare:
+                log_utils.log_transaction(
+                    config=self.get_config(manifest),
+                    manifest=manifest,
+                    status=enums.LogStatuses.FINISHED,
+                )
 
         except errors.StudyManifestFilesystemError as e:
             # This should be thrown prior to any database connections, so
@@ -150,18 +160,24 @@ def build_matching_files(
         table_builder_name: str,
         *,
         options: dict[str, str],
+        prepare: bool = False,
+        data_path: pathlib.Path,
     ) -> None:
         """Runs a single table builder
 
         :param target: A path to the study directory
         :param table_builder_name: a builder file referenced in the study's manifest
-        :param options: The dictionary of study-specific options
+        :keyword options: The dictionary of study-specific options
+        :keyword prepare: If true, will render query instead of executing
+        :keyword data_path: If prepare is true, the path to write rendered data to
         """
         manifest = study_manifest.StudyManifest(target, options=options)
         builder.build_matching_files(
             config=self.get_config(manifest),
             manifest=manifest,
             builder=table_builder_name,
+            prepare=prepare,
+            data_path=data_path,
         )
 
     ### Data exporters
@@ -325,13 +341,19 @@ def run_cli(args: dict):
                 for target in args["target"]:
                     if args["builder"]:
                         runner.build_matching_files(
-                            study_dict[target], args["builder"], options=args["options"]
+                            study_dict[target],
+                            args["builder"],
+                            options=args["options"],
+                            prepare=args["prepare"],
+                            data_path=args["data_path"],
                         )
                     else:
                         runner.clean_and_build_study(
                             study_dict[target],
                             continue_from=args["continue_from"],
                             options=args["options"],
+                            prepare=args["prepare"],
+                            data_path=args["data_path"],
                         )
 
             elif args["action"] == "export":
diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py
index ec1830b9..6d9dd5e4 100644
--- a/cumulus_library/cli_parser.py
+++ b/cumulus_library/cli_parser.py
@@ -38,7 +38,7 @@ def add_custom_option(parser: argparse.ArgumentParser) -> None:
 def add_data_path_argument(parser: argparse.ArgumentParser) -> None:
     """Adds path arg to a subparser"""
     parser.add_argument(
-        "data_path",
+        "--data_path",
         default="./",
         nargs="?",
         help=(
@@ -208,6 +208,11 @@ def get_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Forces file downloads/uploads to occur, even if they already exist",
     )
+    build.add_argument(
+        "--prepare",
+        action="store_true",
+        help=("Prepares queries for remote execution."),
+    )
     build.add_argument(
         "--statistics",
         action="store_true",
diff --git a/tests/conftest.py b/tests/conftest.py
index f602a247..824544d0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,7 +3,9 @@
 import copy
 import datetime
 import json
+import os
 import pathlib
+from unittest import mock
 
 import numpy
 import pandas
@@ -197,6 +199,12 @@ def debug_diff_tables(cols, found, ref, pos=0):
 # Database fixtures
 
 
+@pytest.fixture(scope="session", autouse=True)
+def mock_env():
+    with mock.patch.dict(os.environ, clear=True):
+        yield
+
+
 @pytest.fixture
 def mock_db(tmp_path):
     """Provides a DuckDatabaseBackend for local testing"""
diff --git a/tests/test_athena.py b/tests/test_athena.py
index 940b7708..18e233f4 100644
--- a/tests/test_athena.py
+++ b/tests/test_athena.py
@@ -1,7 +1,6 @@
 """Edge case testing for Athena database support"""
 
 import json
-import os
 import pathlib
 from unittest import mock
 
@@ -46,10 +45,6 @@ def test_schema_parsing():
     assert expected == parser.parse_found_schema(schema)
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("botocore.session.Session")
 def test_upload_parquet_response_handling(mock_session):
     path = pathlib.Path(__file__).resolve().parent
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6d50bc22..546d0fba 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -673,7 +673,6 @@ def test_cli_upload_filter(mock_upload_data, mock_glob, args, calls):
 
 
 @pytest.mark.parametrize("mode", ["cli", "env"])
-@mock.patch.dict(os.environ, clear=True)
 # early exit with a dumb error
 @mock.patch("cumulus_library.base_utils.StudyConfig", side_effect=ZeroDivisionError)
 def test_cli_umls_parsing(mock_config, mode, tmp_path):
@@ -688,7 +687,6 @@ def test_cli_umls_parsing(mock_config, mode, tmp_path):
     assert mock_config.call_args[1]["umls_key"] == "MY-KEY"
 
 
-@mock.patch.dict(os.environ, clear=True)
 def test_cli_single_builder(tmp_path):
     cli.main(cli_args=duckdb_args(["build", "--builder=patient", "--target=core"], tmp_path))
     db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db")
@@ -700,7 +698,6 @@ def test_cli_single_builder(tmp_path):
     } == tables
 
 
-@mock.patch.dict(os.environ, clear=True)
 def test_cli_finds_study_from_manifest_prefix(tmp_path):
     # This study is located inside a folder called `study_different_dir`,
     # but we're going to find it using its real study prefix from the manifest.
@@ -715,7 +712,6 @@ def test_cli_finds_study_from_manifest_prefix(tmp_path):
     assert "study_different_name__table" in tables
 
 
-@mock.patch.dict(os.environ, clear=True)
 @pytest.mark.parametrize(
     "option,raises",
     [
@@ -745,7 +741,6 @@ def test_cli_custom_args(mock_config, tmp_path, option, raises):
         assert called_options[option.split(":")[0]] == option.split(":")[1]
 
 
-@mock.patch.dict(os.environ, clear=True)
 @mock.patch("cumulus_library.base_utils.StudyConfig")
 def test_cli_no_custom_args_yields_empty_dict(mock_config, tmp_path):
     mock_config.return_value.stats_clean = False
@@ -765,7 +760,6 @@ def test_cli_no_custom_args_yields_empty_dict(mock_config, tmp_path):
     assert {} == called_options
 
 
-@mock.patch.dict(os.environ, clear=True)
 def test_cli_import_study(tmp_path):
     test_data = {"string": ["a", "b", None]}
     df = pandas.DataFrame(test_data)
@@ -790,7 +784,6 @@ def test_cli_import_study(tmp_path):
     )
 
 
-@mock.patch.dict(os.environ, clear=True)
 def test_dedicated_schema(tmp_path):
     core_build_args = duckdb_args(
         [
@@ -827,7 +820,6 @@ def test_dedicated_schema(tmp_path):
         assert table in tables
 
 
-@mock.patch.dict(os.environ, clear=True)
 @mock.patch("cumulus_library.databases.duckdb.DuckDatabaseBackend")
 def test_sql_error_handling(mock_backend, tmp_path):
     mock_backend.return_value.cursor.return_value.execute.side_effect = [
@@ -848,7 +840,6 @@ def test_sql_error_handling(mock_backend, tmp_path):
         cli.main(cli_args=build_args)
 
 
-@mock.patch.dict(os.environ, clear=True)
 def test_version(capfd):
     out = None
     with pytest.raises(SystemExit):
@@ -870,7 +861,10 @@ def test_version(capfd):
     assert "study_invalid_bad_query: no version defined" in out
 
 
-@mock.patch.dict(os.environ, clear=True)
+@mock.patch.dict(
+    os.environ,
+    clear=True,
+)
 def test_study_dir(tmp_path):
     os.environ["CUMULUS_LIBRARY_STUDY_DIR"] = str(
         pathlib.Path(__file__).resolve().parent / "test_data/"
@@ -885,3 +879,84 @@ def test_study_dir(tmp_path):
     )
     with does_not_raise():
         cli.main(cli_args=build_args)
+
+
+@pytest.mark.parametrize(
+    "study,expected_queries,generated_query,toml_file",
+    [
+        (
+            "study_valid",
+            [
+                "0000.test.00.create_table_study_valid__table.sql",
+                "0001.test2.00.create_table_study_valid__table2.sql",
+            ],
+            "CREATE TABLE study_valid__table (test int)",
+            None,
+        ),
+        (
+            "study_python_counts_valid",
+            [
+                "0000.module1.00.create_table_if_not_exists_study_python_counts_valid__table1.sql",
+                "0001.module1.00.create_table_if_not_exists_study_python_counts_valid__table1.sql",
+                "0002.module2.00.create_table_if_not_exists_study_python_counts_valid__table2.sql",
+            ],
+            "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);",
+            None,
+        ),
+        (
+            "psm_test",
+            ["0000.psm_cohort.00.create_table_psm_test__psm_cohort.sql"],
+            """CREATE TABLE psm_test__psm_cohort AS (
+    SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04
+)""",
+            [
+                "0001.psm_config.00.config.toml",
+                """config_type = "psm"
+classification_json = "dsm5_classifications.json"
+pos_source_table = "psm_test__psm_cohort"
+neg_source_table = "core__condition"
+target_table = "psm_test__psm_encounter_covariate"
+primary_ref = 'encounter_ref'
+count_ref = 'subject_ref'
+count_table = 'core__condition'
+dependent_variable = "example_diagnosis"
+pos_sample_size = 20
+neg_sample_size = 100
+seed = 1234567890
+[join_cols_by_table.core__encounter]
+join_id = "encounter_ref"
+included_cols = [
+    ["gender"], 
+    ["race_display", "race"]
+]
+""",
+            ],
+        ),
+    ],
+)
+def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file):
+    build_args = duckdb_args(
+        [
+            "build",
+            "-t",
+            study,
+            "-s",
+            "tests/test_data/",
+            "--data_path",
+            str(tmp_path),
+            "--prepare",
+        ],
+        tmp_path,
+    )
+    with does_not_raise():
+        cli.main(cli_args=build_args)
+    queries = list(pathlib.Path(tmp_path).glob("**/*.sql"))
+    for query in queries:
+        assert query.name in expected_queries
+    with open(queries[0]) as f:
+        assert f.read() == generated_query
+    if toml_file:
+        config = next(pathlib.Path(tmp_path).glob("**/*.toml"))
+        assert config.name == toml_file[0]
+        with open(config) as f:
+            assert f.read() == toml_file[1]
diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql b/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql
new file mode 100644
index 00000000..8002c3a1
--- /dev/null
+++ b/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql
@@ -0,0 +1,9 @@
+-- noqa: disable=all
+-- This sql was autogenerated as a reference example using the library
+-- CLI. Its format is tied to the specific database it was run against,
+-- and it may not be correct for all databases. Use the CLI's build 
+-- option to derive the best SQL for your dataset.
+
+-- ###########################################################
+
+CREATE TABLE IF NOT EXISTS abc__counts (test int);
diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql b/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql
new file mode 100644
index 00000000..9d7aa00d
--- /dev/null
+++ b/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql
@@ -0,0 +1,9 @@
+-- noqa: disable=all
+-- This sql was autogenerated as a reference example using the library
+-- CLI. Its format is tied to the specific database it was run against,
+-- and it may not be correct for all databases. Use the CLI's build 
+-- option to derive the best SQL for your dataset.
+
+-- ###########################################################
+
+CREATE TABLE abc__meta_version AS SELECT 1 AS data_package_version;
diff --git a/tests/test_data/study_python_counts_valid/module1.py b/tests/test_data/study_python_counts_valid/module1.py
index ecbaf6e1..0871e908 100644
--- a/tests/test_data/study_python_counts_valid/module1.py
+++ b/tests/test_data/study_python_counts_valid/module1.py
@@ -4,8 +4,8 @@
 class ModuleOneRunner(cumulus_library.CountsBuilder):
     display_text = "module1"
 
-    def __init__(self):
-        super().__init__()
+    def __init__(self, study_prefix):
+        super().__init__(study_prefix="study_python_counts_valid")
 
     def prepare_queries(self, *args, **kwargs):
         self.queries.append(
diff --git a/tests/test_data/study_python_counts_valid/module2.py b/tests/test_data/study_python_counts_valid/module2.py
index c6c7e80a..9d96f130 100644
--- a/tests/test_data/study_python_counts_valid/module2.py
+++ b/tests/test_data/study_python_counts_valid/module2.py
@@ -4,7 +4,7 @@
 class ModuleTwoRunner(cumulus_library.CountsBuilder):
     display_text = "module2"
 
-    def __init__(self):
+    def __init__(self, study_prefix):
         super().__init__(study_prefix="study_python_counts_valid")
 
     def prepare_queries(self, *args, **kwargs):
diff --git a/tests/test_databases.py b/tests/test_databases.py
index bd5d8878..b726fe0e 100644
--- a/tests/test_databases.py
+++ b/tests/test_databases.py
@@ -28,10 +28,6 @@
 }
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "db,data,expected,raises",
     [
@@ -77,10 +73,6 @@ def test_col_types_from_pandas(db, data, expected, raises):
         assert set(expected) == set(vals)
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "db,data,expected,raises",
     [
@@ -222,10 +214,6 @@ def test_pyarrow_types_from_sql(db, data, expected, raises):
             assert vals[index][-1] == expected[index][-1]
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "args,expected_type, raises",
     [
@@ -273,10 +261,6 @@ def test_create_db_backend(args, expected_type, raises):
 ### Database-specific edge case testing
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "args,sse,keycount,expected,raises",
     [
@@ -375,10 +359,6 @@ def test_upload_file_athena(mock_botocore, args, sse, keycount, expected, raises
                 assert kwargs["Key"].endswith(args["file"].name)
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("pyathena.connect")
 def test_athena_pandas_cursor(mock_pyathena):
     mock_as_pandas = mock.MagicMock()
@@ -415,10 +395,6 @@ def test_athena_pandas_cursor(mock_pyathena):
     )
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("pyathena.connect")
 def test_athena_parser(mock_pyathena):
     db = databases.AthenaDatabaseBackend(**ATHENA_KWARGS)
@@ -426,6 +402,8 @@ def test_athena_parser(mock_pyathena):
     assert isinstance(parser, databases.AthenaParser)
 
 
+# Since this test creates an environment variable, create a dedicated
+# mock for it
 @mock.patch.dict(
     os.environ,
     clear=True,
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index e85ad24f..48bb7502 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -10,10 +10,6 @@
 from tests import conftest
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 def test_discovery(tmp_path):
     cli.main(
         cli_args=conftest.duckdb_args(
diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py
index 876c8581..1e8d4764 100644
--- a/tests/test_duckdb.py
+++ b/tests/test_duckdb.py
@@ -6,7 +6,6 @@
 import tempfile
 from datetime import datetime
 from pathlib import Path
-from unittest import mock
 
 import pytest
 
@@ -14,10 +13,6 @@
 from cumulus_library.template_sql import base_templates
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 def test_duckdb_core_build_and_export(tmp_path):
     data_dir = f"{Path(__file__).parent}/test_data/duckdb_data"
     cli.main(
diff --git a/tests/test_dynamic_manifest.py b/tests/test_dynamic_manifest.py
index e82251b7..6b76a512 100644
--- a/tests/test_dynamic_manifest.py
+++ b/tests/test_dynamic_manifest.py
@@ -20,10 +20,6 @@
 ]
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "options,result",
     [
@@ -45,10 +41,6 @@ def test_manifest_with_dynamic_prefix(options, result):
         assert manifest.get_study_prefix() == result
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("sys.executable", new=None)
 def test_manifest_with_dynamic_prefix_and_no_executable():
     """sys.executable must be valid for us to run a Python script"""
@@ -56,10 +48,6 @@ def test_manifest_with_dynamic_prefix_and_no_executable():
         study_manifest.StudyManifest(pathlib.Path("tests/test_data/study_dynamic_prefix"))
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 def test_cli_clean_with_dynamic_prefix(tmp_path):
     cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:dynamic2"], tmp_path))
     cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS], tmp_path))
@@ -80,10 +68,6 @@ def test_cli_clean_with_dynamic_prefix(tmp_path):
     assert "dynamic2__meta_version" not in tables
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 def test_cli_export_with_dynamic_prefix(tmp_path):
     cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:abc"], tmp_path))
     cli.main(cli_args=duckdb_args(["export", *STUDY_ARGS, "--option=prefix:abc"], tmp_path))
diff --git a/tests/test_static_file.py b/tests/test_static_file.py
index bdea564a..ff89ec77 100644
--- a/tests/test_static_file.py
+++ b/tests/test_static_file.py
@@ -1,14 +1,7 @@
-import os
-from unittest import mock
-
 from cumulus_library import cli, databases
 from tests import conftest
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 def test_static_file(tmp_path):
     cli.main(
         cli_args=conftest.duckdb_args(
diff --git a/tests/test_umls_api.py b/tests/test_umls_api.py
index 99c8a30f..b2930b9d 100644
--- a/tests/test_umls_api.py
+++ b/tests/test_umls_api.py
@@ -1,7 +1,6 @@
 import os
 import pathlib
 from contextlib import nullcontext as does_not_raise
-from unittest import mock
 
 import pytest
 import responses
@@ -18,10 +17,6 @@
 EXPANSION_VALUESET_OID = "2.16.840.1.113762.1.4.1106.68"
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "api_key,validator_key,res_text,res_status,raises",
     [
@@ -45,10 +40,6 @@ def test_auth(api_key, validator_key, res_text, res_status, raises):
             assert api.validator_key == api_key
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @responses.activate
 def test_auth_reads_env_var():
     responses.add(responses.GET, AUTH_URL, body="true", status=200)
@@ -63,10 +54,6 @@ def get_valueset_data(file_name):
         return f.read()
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @pytest.mark.parametrize(
     "action,url,oid,expected_oids,raises",
     [
@@ -159,10 +146,6 @@ def test_get_valueset(action, url, oid, expected_oids, raises):
             assert data[i]["id"] == expected_oids[i]
 
 
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @responses.activate
 def test_download_umls(tmp_path):
     # this zip file is just an archive made by targeting the other .json files
diff --git a/tests/valueset/test_additional_rules_builder.py b/tests/valueset/test_additional_rules_builder.py
index 755b9099..a4fb309f 100644
--- a/tests/valueset/test_additional_rules_builder.py
+++ b/tests/valueset/test_additional_rules_builder.py
@@ -1,5 +1,4 @@
 import json
-import os
 import pathlib
 import tomllib
 from unittest import mock
@@ -16,10 +15,6 @@
 
 
 @pytest.mark.parametrize("prefix", [(""), ("foo")])
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("cumulus_library.apis.umls.UmlsApi")
 def test_additional_rules(mock_api, mock_db_config_rxnorm, prefix):
     data_path = pathlib.Path(__file__).parent.parent / "test_data/valueset/"
diff --git a/tests/valueset/test_rxnorm_valueset_builder.py b/tests/valueset/test_rxnorm_valueset_builder.py
index 7494f377..1b2f7edc 100644
--- a/tests/valueset/test_rxnorm_valueset_builder.py
+++ b/tests/valueset/test_rxnorm_valueset_builder.py
@@ -1,5 +1,4 @@
 import json
-import os
 import pathlib
 import tomllib
 from unittest import mock
@@ -15,10 +14,6 @@
 
 
 @pytest.mark.parametrize("prefix", [(""), ("foo")])
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("cumulus_library.apis.umls.UmlsApi")
 def test_rxnorm_valueset_builder(mock_api, mock_db_config_rxnorm, prefix):
     data_path = pathlib.Path(__file__).parent.parent / "test_data/valueset/"
diff --git a/tests/valueset/test_umls.py b/tests/valueset/test_umls.py
index c4d1485c..ea7c21e2 100644
--- a/tests/valueset/test_umls.py
+++ b/tests/valueset/test_umls.py
@@ -1,6 +1,3 @@
-import os
-from unittest import mock
-
 import pytest
 
 from cumulus_library import StudyManifest
@@ -8,10 +5,6 @@
 
 
 @pytest.mark.parametrize("prefix", [(""), ("foo")])
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 def test_umls_lookup(mock_db_config_rxnorm, prefix, tmp_path):
     with open(f"{tmp_path}/manifest.toml", "w", encoding="utf8") as f:
         f.write('study_prefix="test"')
diff --git a/tests/valueset/test_valueset_builder.py b/tests/valueset/test_valueset_builder.py
index b5a8e76c..41fd94ad 100644
--- a/tests/valueset/test_valueset_builder.py
+++ b/tests/valueset/test_valueset_builder.py
@@ -1,5 +1,4 @@
 import json
-import os
 import pathlib
 from contextlib import nullcontext as does_not_raise
 from unittest import mock
@@ -19,10 +18,6 @@
         (data_path / "invalid.toml", pytest.raises(SystemExit)),
     ],
 )
-@mock.patch.dict(
-    os.environ,
-    clear=True,
-)
 @mock.patch("cumulus_library.apis.umls.UmlsApi")
 def test_valueset_builder(mock_api, mock_db_config_rxnorm, config_path, raises):
     with raises:

From 09f61f9a2aa843bed166e39a7c4bcc5e29b6f3b2 Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Tue, 10 Dec 2024 08:48:03 -0500
Subject: [PATCH 2/4] PR feedback

---
 cumulus_library/actions/builder.py            | 91 ++++++++++++-------
 cumulus_library/actions/file_generator.py     |  1 +
 cumulus_library/cli.py                        | 47 +++++-----
 cumulus_library/cli_parser.py                 |  4 +-
 tests/test_actions.py                         | 19 +---
 tests/test_cli.py                             |  1 -
 .../reference_sql/counts.sql                  |  9 --
 .../reference_sql/meta.sql                    |  9 --
 8 files changed, 92 insertions(+), 89 deletions(-)
 delete mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/counts.sql
 delete mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/meta.sql

diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
index b1aa2028..f4230214 100644
--- a/cumulus_library/actions/builder.py
+++ b/cumulus_library/actions/builder.py
@@ -43,7 +43,7 @@ def _load_and_execute_builder(
     write_reference_sql: bool = False,
     doc_str: str | None = None,
     prepare: bool = False,
-    data_path: pathlib.Path | None = None,
+    data_path: pathlib.Path,
     query_count: int | None = None,
 ) -> int:
     """Loads a table builder from a file.
@@ -177,10 +177,15 @@ def _run_workflow(
     """
     toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
     if prepare:
-        with open(toml_path) as file:
+        with open(toml_path, encoding="utf-8") as file:
             workflow_config = file.read()
         _render_output(
-            manifest.get_study_prefix(), [workflow_config], data_path, filename, query_count, True
+            manifest.get_study_prefix(),
+            [workflow_config],
+            data_path,
+            filename,
+            query_count,
+            is_toml=True,
         )
         return 1
     existing_stats = []
@@ -243,7 +248,7 @@ def build_matching_files(
     *,
     builder: str | None,
     db_parser: databases.DatabaseParser = None,
-    prepare: bool = False,
+    prepare: bool,
     data_path: pathlib.Path,
 ):
     """targets all table builders matching a target string for running
@@ -255,11 +260,8 @@ def build_matching_files(
     :keyword prepare: If true, will render query instead of executing
     :keyword data_path: If prepare is true, the path to write rendered data to
     """
-    if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]:
-        sys.exit(
-            f"{manifest.get_study_prefix()} does not support prepare mode. It must be run "
-            "directly against a target database."
-        )
+    if prepare:
+        _check_if_preparable(manifest.get_study_prefix())
     all_generators = manifest.get_all_generators()
     matches = []
     if not builder:  # pragma: no cover
@@ -285,8 +287,8 @@ def build_study(
     db_parser: databases.DatabaseParser = None,
     continue_from: str | None = None,
     file_list: list | None = None,
-    prepare: bool = False,
-    data_path: pathlib.Path | None = None,
+    prepare: bool,
+    data_path: pathlib.Path | None,
 ) -> None:
     """Creates tables in the schema by iterating through the sql_config.file_names
 
@@ -297,17 +299,15 @@ def build_study(
     :keyword prepare: If true, will render query instead of executing
     :keyword data_path: If prepare is true, the path to write rendered data to
     """
-    if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]:
-        sys.exit(
-            f"{manifest.get_study_prefix()} does not support prepare mode. It must be run "
-            "directly against a target database."
-        )
+    if prepare:
+        _check_if_preparable(manifest.get_study_prefix())
     if file_list is None:
         file_list = manifest.get_file_list(continue_from)
     if prepare:
         data_dir = data_path / manifest.get_study_prefix()
-        files = data_dir.glob("*")
-        [file.unlink() for file in files if file.is_file()]
+        for file in data_dir.glob("*"):
+            if file.is_file():
+                file.unlink()
     query_count = 0
     for file in file_list:
         if file.endswith(".py"):
@@ -316,8 +316,8 @@ def build_study(
                 manifest=manifest,
                 filename=file,
                 db_parser=db_parser,
-                prepare=prepare,
                 data_path=data_path,
+                prepare=prepare,
                 query_count=query_count,
             )
         elif file.endswith(".toml"):
@@ -325,8 +325,8 @@ def build_study(
                 config=config,
                 manifest=manifest,
                 filename=file,
-                prepare=prepare,
                 data_path=data_path,
+                prepare=prepare,
                 query_count=query_count,
             )
         elif file.endswith(".sql"):
@@ -334,8 +334,8 @@ def build_study(
                 config=config,
                 manifest=manifest,
                 filename=file,
-                prepare=prepare,
                 data_path=data_path,
+                prepare=prepare,
                 query_count=query_count,
             )
         else:
@@ -344,7 +344,7 @@ def build_study(
         with zipfile.ZipFile(
             f"{data_path}/{manifest.get_study_prefix()}.zip", "w", zipfile.ZIP_DEFLATED
         ) as z:
-            for file in data_dir.glob("*"):
+            for file in data_dir.iterdir():
                 z.write(file, file.relative_to(data_dir))
 
 
@@ -352,8 +352,9 @@ def _run_raw_queries(
     config: base_utils.StudyConfig,
     manifest: study_manifest.StudyManifest,
     filename: str,
+    *,
+    data_path: pathlib.Path | None,
     prepare: bool,
-    data_path: pathlib.Path,
     query_count: int,
 ) -> int:
     """Creates tables in the schema by iterating through the sql_config.file_names
@@ -401,23 +402,49 @@ def _run_raw_queries(
     return len(queries)
 
 
-def _render_output(study_name, outputs, data_path, filename, count, is_toml=False):
+def _render_output(
+    study_name: str,
+    outputs: list,
+    data_path: pathlib.Path,
+    filename: str,
+    count: int,
+    *,
+    is_toml: bool = False,
+):
     if is_toml:
         suffix = "toml"
     else:
         suffix = "sql"
-    for output in enumerate(outputs):
+    for index, output in enumerate(outputs):
         if is_toml:
             name = "config"
         else:
-            name = re.search(r"(.*)__\w*", output[1])[0].lower().replace(" ", "_")
-        file_path = data_path / (
-            f"{study_name}/{count+output[0]:04d}.{filename.rsplit('.', 1)[0]}"
-            f".{output[0]:02d}.{name}.{suffix}"
-        )
+            # This regex attempts to discover the table name, via looking for the first
+            # dunder, and then gets the start of the line its on as a non-quote requiring
+            # part of a file name. So for example, finding SQL that looks like this:
+            #   CREATE TABLE foo__bar AS (varchar baz)
+            # would result in `create_table_foo__bar` being assigned to name. The goal
+            # is to make this at least mildly parsable at the file system level if someone
+            # is reviewing a prepared study
+            name = re.search(r"(.*)__\w*", output)[0].lower().replace(" ", "_")
+        total_index = count + index
+        new_filename = f"{total_index:04d}.{filename.rsplit('.', 1)[0]}.{index:02d}.{name}.{suffix}"
+        file_path = data_path / f"{study_name}/{new_filename}"
+
         file_path.parent.mkdir(exist_ok=True, parents=True)
-        with open(file_path, "w") as f:
-            f.write(output[1])
+        with open(file_path, "w", encoding="UTF-8") as f:
+            f.write(output)
+
+
+def _check_if_preparable(prefix):
+    # This list should include any study which requires interrogating the database to
+    # find if data is available to query (outside of a toml-driven workflow),
+    # which isn't doable as a distributed query
+    if prefix in ["core", "discovery", "data-metrics"]:
+        sys.exit(
+            f"Study '{prefix}'' does not support prepare mode. It must be run "
+            "directly against a target database."
+        )
 
 
 def _query_error(
diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py
index be4a721a..f3920cfb 100644
--- a/cumulus_library/actions/file_generator.py
+++ b/cumulus_library/actions/file_generator.py
@@ -39,6 +39,7 @@ def run_generate_sql(
             db_parser=db_parser,
             write_reference_sql=True,
             doc_str=doc_str,
+            data_path=None,
         )
 
 
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
index 33e61711..547c3c75 100755
--- a/cumulus_library/cli.py
+++ b/cumulus_library/cli.py
@@ -106,27 +106,29 @@ def clean_and_build_study(
         """
         manifest = study_manifest.StudyManifest(target, self.data_path, options=options)
         try:
+            print("prep", prepare)
+            print("cont", continue_from)
             if not prepare:
                 builder.run_protected_table_builder(
                     config=self.get_config(manifest), manifest=manifest
                 )
-            if not continue_from and not prepare:
-                log_utils.log_transaction(
-                    config=self.get_config(manifest),
-                    manifest=manifest,
-                    status=enums.LogStatuses.STARTED,
-                )
-                cleaner.clean_study(
-                    config=self.get_config(manifest),
-                    manifest=manifest,
-                )
-
-            elif not prepare:
-                log_utils.log_transaction(
-                    config=self.get_config(manifest),
-                    manifest=manifest,
-                    status=enums.LogStatuses.RESUMED,
-                )
+                if not continue_from:
+                    print("continue")
+                    log_utils.log_transaction(
+                        config=self.get_config(manifest),
+                        manifest=manifest,
+                        status=enums.LogStatuses.STARTED,
+                    )
+                    cleaner.clean_study(
+                        config=self.get_config(manifest),
+                        manifest=manifest,
+                    )
+                else:
+                    log_utils.log_transaction(
+                        config=self.get_config(manifest),
+                        manifest=manifest,
+                        status=enums.LogStatuses.RESUMED,
+                    )
 
             builder.build_study(
                 config=self.get_config(manifest),
@@ -147,11 +149,12 @@ def clean_and_build_study(
             # skipping logging
             raise e  # pragma: no cover
         except Exception as e:
-            log_utils.log_transaction(
-                config=self.get_config(manifest),
-                manifest=manifest,
-                status=enums.LogStatuses.ERROR,
-            )
+            if not prepare:
+                log_utils.log_transaction(
+                    config=self.get_config(manifest),
+                    manifest=manifest,
+                    status=enums.LogStatuses.ERROR,
+                )
             raise e
 
     def build_matching_files(
diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py
index 6d9dd5e4..a1fb80ad 100644
--- a/cumulus_library/cli_parser.py
+++ b/cumulus_library/cli_parser.py
@@ -38,7 +38,7 @@ def add_custom_option(parser: argparse.ArgumentParser) -> None:
 def add_data_path_argument(parser: argparse.ArgumentParser) -> None:
     """Adds path arg to a subparser"""
     parser.add_argument(
-        "--data_path",
+        "data_path",
         default="./",
         nargs="?",
         help=(
@@ -211,7 +211,7 @@ def get_parser() -> argparse.ArgumentParser:
     build.add_argument(
         "--prepare",
         action="store_true",
-        help=("Prepares queries for remote execution."),
+        help=argparse.SUPPRESS,
     )
     build.add_argument(
         "--statistics",
diff --git a/tests/test_actions.py b/tests/test_actions.py
index 9e3a27aa..e963e5b1 100644
--- a/tests/test_actions.py
+++ b/tests/test_actions.py
@@ -172,10 +172,7 @@ def test_run_protected_table_builder(mock_db_config, study_path, stats):
 def test_table_builder(mock_db_config, study_path, verbose, expects, raises):
     with raises:
         manifest = study_manifest.StudyManifest(pathlib.Path(study_path))
-        builder.build_study(
-            config=mock_db_config,
-            manifest=manifest,
-        )
+        builder.build_study(config=mock_db_config, manifest=manifest, data_path=None, prepare=False)
         tables = (
             mock_db_config.db.cursor()
             .execute("SELECT distinct(table_name) FROM information_schema.tables ")
@@ -248,10 +245,7 @@ def test_build_study(mock_db_config, study_path, verbose, expects, raises):
             table_cols_types=table.column_types,
         )
         mock_db_config.db.cursor().execute(query)
-        builder.build_study(
-            config=mock_db_config,
-            manifest=manifest,
-        )
+        builder.build_study(config=mock_db_config, manifest=manifest, data_path=None, prepare=False)
         tables = (
             mock_db_config.db.cursor()
             .execute("SELECT distinct(table_name) FROM information_schema.tables ")
@@ -309,7 +303,7 @@ def test_run_psm_statistics_builders(
                 table_name="psm_test__psm_encounter_2023_06_15",
                 view_name="psm_test__psm_encounter_covariate",
             )
-        builder.build_study(config=config, manifest=manifest)
+        builder.build_study(config=config, manifest=manifest, prepare=False, data_path=None)
         tables = (
             mock_db_stats_config.db.cursor()
             .execute("SELECT distinct(table_name) FROM information_schema.tables")
@@ -333,10 +327,7 @@ def test_invoke_valueset_builder(mock_builder, mock_db_config, tmp_path):
     config = base_utils.StudyConfig(
         db=mock_db_config.db, schema=mock_db_config.schema, stats_build=True
     )
-    builder.build_study(
-        config=config,
-        manifest=manifest,
-    )
+    builder.build_study(config=config, manifest=manifest, prepare=False, data_path=None)
     assert mock_builder.is_called()
 
 
@@ -567,7 +558,7 @@ def test_generate_md(mock_db_config, tmp_path):
         manifest = study_manifest.StudyManifest(
             study_path=pathlib.Path(f"{tmp_path}/study_python_valid/")
         )
-        builder.build_study(config=mock_db_config, manifest=manifest)
+        builder.build_study(config=mock_db_config, manifest=manifest, prepare=False, data_path=None)
         file_generator.run_generate_markdown(config=mock_db_config, manifest=manifest)
         with open(f"{tmp_path}/study_python_valid/study_python_valid_generated.md") as f:
             generated_md = f.read()
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 546d0fba..a3db7fab 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -942,7 +942,6 @@ def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_
             study,
             "-s",
             "tests/test_data/",
-            "--data_path",
             str(tmp_path),
             "--prepare",
         ],
diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql b/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql
deleted file mode 100644
index 8002c3a1..00000000
--- a/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql
+++ /dev/null
@@ -1,9 +0,0 @@
--- noqa: disable=all
--- This sql was autogenerated as a reference example using the library
--- CLI. Its format is tied to the specific database it was run against,
--- and it may not be correct for all databases. Use the CLI's build 
--- option to derive the best SQL for your dataset.
-
--- ###########################################################
-
-CREATE TABLE IF NOT EXISTS abc__counts (test int);
diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql b/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql
deleted file mode 100644
index 9d7aa00d..00000000
--- a/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql
+++ /dev/null
@@ -1,9 +0,0 @@
--- noqa: disable=all
--- This sql was autogenerated as a reference example using the library
--- CLI. Its format is tied to the specific database it was run against,
--- and it may not be correct for all databases. Use the CLI's build 
--- option to derive the best SQL for your dataset.
-
--- ###########################################################
-
-CREATE TABLE abc__meta_version AS SELECT 1 AS data_package_version;

From 71f7a9b713658f777c08e4dab6fe8dc8c159e80d Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Tue, 10 Dec 2024 09:22:26 -0500
Subject: [PATCH 3/4] sqlfluff update

---
 cumulus_library/.sqlfluff          | 1 +
 tests/test_cli.py                  | 3 ++-
 tests/test_data/psm/psm_cohort.sql | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff
index a3bccbd4..2a69a5f1 100644
--- a/cumulus_library/.sqlfluff
+++ b/cumulus_library/.sqlfluff
@@ -6,6 +6,7 @@ exclude_rules=
     # these rule overfires on athena nested arrays
     references.from,
     structure.column_order,
+    structure.unused_join,
     aliasing.unused,
     # this rule interferes with FHIR naming conventions
     capitalisation.identifiers
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a3db7fab..7377d222 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -907,7 +907,8 @@ def test_study_dir(tmp_path):
             "psm_test",
             ["0000.psm_cohort.00.create_table_psm_test__psm_cohort.sql"],
             """CREATE TABLE psm_test__psm_cohort AS (
-    SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04
+    SELECT * FROM core__condition --noqa: AM04
+    ORDER BY id DESC LIMIT 100
 )""",
             [
                 "0001.psm_config.00.config.toml",
diff --git a/tests/test_data/psm/psm_cohort.sql b/tests/test_data/psm/psm_cohort.sql
index eb971142..13a19814 100644
--- a/tests/test_data/psm/psm_cohort.sql
+++ b/tests/test_data/psm/psm_cohort.sql
@@ -1,3 +1,4 @@
 CREATE TABLE psm_test__psm_cohort AS (
-    SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04
+    SELECT * FROM core__condition --noqa: AM04
+    ORDER BY id DESC LIMIT 100
 );

From 944be91b874411c0ec7132af4ca06d4663dc28e1 Mon Sep 17 00:00:00 2001
From: Matt Garber <matthew.garber@childrens.harvard.edu>
Date: Tue, 10 Dec 2024 09:54:46 -0500
Subject: [PATCH 4/4] coverage

---
 cumulus_library/actions/builder.py        |  4 +-
 cumulus_library/actions/file_generator.py |  2 +-
 tests/test_cli.py                         | 57 ++++++++++++-----------
 3 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
index f4230214..39d874bc 100644
--- a/cumulus_library/actions/builder.py
+++ b/cumulus_library/actions/builder.py
@@ -261,7 +261,7 @@ def build_matching_files(
     :keyword data_path: If prepare is true, the path to write rendered data to
     """
     if prepare:
-        _check_if_preparable(manifest.get_study_prefix())
+        _check_if_preparable(manifest.get_study_prefix())  # pragma: no cover
     all_generators = manifest.get_all_generators()
     matches = []
     if not builder:  # pragma: no cover
@@ -306,7 +306,7 @@ def build_study(
     if prepare:
         data_dir = data_path / manifest.get_study_prefix()
         for file in data_dir.glob("*"):
-            if file.is_file():
+            if file.is_file():  # pragma: no cover
                 file.unlink()
     query_count = 0
     for file in file_list:
diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py
index f3920cfb..1ed943d2 100644
--- a/cumulus_library/actions/file_generator.py
+++ b/cumulus_library/actions/file_generator.py
@@ -31,7 +31,7 @@ def run_generate_sql(
     )
     for file in all_generators:
         if table_builder and file.find(table_builder) == -1:
-            continue
+            continue  # pragma: no cover
         builder._load_and_execute_builder(
             config=config,
             manifest=manifest,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 7377d222..46b6d39e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -882,7 +882,7 @@ def test_study_dir(tmp_path):
 
 
 @pytest.mark.parametrize(
-    "study,expected_queries,generated_query,toml_file",
+    "study,expected_queries,generated_query,toml_file, raises",
     [
         (
             "study_valid",
@@ -892,6 +892,7 @@ def test_study_dir(tmp_path):
             ],
             "CREATE TABLE study_valid__table (test int)",
             None,
+            does_not_raise(),
         ),
         (
             "study_python_counts_valid",
@@ -902,6 +903,7 @@ def test_study_dir(tmp_path):
             ],
             "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);",
             None,
+            does_not_raise(),
         ),
         (
             "psm_test",
@@ -932,31 +934,34 @@ def test_study_dir(tmp_path):
 ]
 """,
             ],
+            does_not_raise(),
         ),
+        ("core", [], "", None, pytest.raises(SystemExit)),
     ],
 )
-def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file):
-    build_args = duckdb_args(
-        [
-            "build",
-            "-t",
-            study,
-            "-s",
-            "tests/test_data/",
-            str(tmp_path),
-            "--prepare",
-        ],
-        tmp_path,
-    )
-    with does_not_raise():
-        cli.main(cli_args=build_args)
-    queries = list(pathlib.Path(tmp_path).glob("**/*.sql"))
-    for query in queries:
-        assert query.name in expected_queries
-    with open(queries[0]) as f:
-        assert f.read() == generated_query
-    if toml_file:
-        config = next(pathlib.Path(tmp_path).glob("**/*.toml"))
-        assert config.name == toml_file[0]
-        with open(config) as f:
-            assert f.read() == toml_file[1]
+def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file, raises):
+    with raises:
+        build_args = duckdb_args(
+            [
+                "build",
+                "-t",
+                study,
+                "-s",
+                "tests/test_data/",
+                str(tmp_path),
+                "--prepare",
+            ],
+            tmp_path,
+        )
+        with does_not_raise():
+            cli.main(cli_args=build_args)
+        queries = list(pathlib.Path(tmp_path).glob("**/*.sql"))
+        for query in queries:
+            assert query.name in expected_queries
+        with open(queries[0]) as f:
+            assert f.read() == generated_query
+        if toml_file:
+            config = next(pathlib.Path(tmp_path).glob("**/*.toml"))
+            assert config.name == toml_file[0]
+            with open(config) as f:
+                assert f.read() == toml_file[1]