From 3a8aa792fe5fcc270623cc28fdeff8cb43150900 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Fri, 6 Dec 2024 15:39:43 -0500 Subject: [PATCH 1/4] Packaging mode for studies --- cumulus_library/actions/builder.py | 203 +++++++++++++++--- cumulus_library/builders/valueset/umls.py | 6 +- cumulus_library/cli.py | 44 +++- cumulus_library/cli_parser.py | 7 +- tests/conftest.py | 8 + tests/test_athena.py | 5 - tests/test_cli.py | 95 +++++++- .../reference_sql/counts.sql | 9 + .../reference_sql/meta.sql | 9 + .../study_python_counts_valid/module1.py | 4 +- .../study_python_counts_valid/module2.py | 2 +- tests/test_databases.py | 26 +-- tests/test_discovery.py | 4 - tests/test_duckdb.py | 5 - tests/test_dynamic_manifest.py | 16 -- tests/test_static_file.py | 7 - tests/test_umls_api.py | 17 -- .../valueset/test_additional_rules_builder.py | 5 - .../valueset/test_rxnorm_valueset_builder.py | 5 - tests/valueset/test_umls.py | 7 - tests/valueset/test_valueset_builder.py | 5 - 21 files changed, 327 insertions(+), 162 deletions(-) create mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/counts.sql create mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/meta.sql diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 119463a9..b1aa2028 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -7,11 +7,13 @@ import re import sys import tomllib +import zipfile from rich.progress import Progress, TaskID from cumulus_library import ( BaseTableBuilder, + CountsBuilder, base_utils, databases, enums, @@ -40,7 +42,10 @@ def _load_and_execute_builder( db_parser: databases.DatabaseParser = None, write_reference_sql: bool = False, doc_str: str | None = None, -) -> None: + prepare: bool = False, + data_path: pathlib.Path | None = None, + query_count: int | None = None, +) -> int: """Loads a table builder from a file. :param config: a StudyConfig object @@ -49,6 +54,10 @@ def _load_and_execute_builder( :keyword db_parser: an object implementing DatabaseParser for the target database :keyword write_reference_sql: if true, writes sql to disk inside a study's directory :keyword doc_str: A string to insert between queries written to disk + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to + :keyword query_count: if prepare is true, the number of queries already rendered + :returns: number of processed queries """ # Since we have to support arbitrary user-defined python files here, we @@ -79,8 +88,7 @@ def _load_and_execute_builder( f"Error loading {manifest._study_path}{filename}\n" "Custom builders must extend the BaseTableBuilder class." ) - - # Remove instances of intermediate classes, if present + # Remove instances of intermediate classes, if present (usually not) table_builder_subclasses = list( filter(lambda x: x.__name__ != "CountsBuilder", table_builder_subclasses) ) @@ -89,7 +97,10 @@ def _load_and_execute_builder( # remove it so it doesn't interfere with the next python module to # execute, since the subclass would otherwise hang around. table_builder_class = table_builder_subclasses[0] - table_builder = table_builder_class() + if issubclass(table_builder_class, CountsBuilder): + table_builder = table_builder_class(study_prefix=manifest.get_study_prefix()) + else: + table_builder = table_builder_class() if write_reference_sql: prefix = manifest.get_study_prefix() table_builder.prepare_queries(config=config, manifest=manifest, parser=db_parser) @@ -105,19 +116,30 @@ def _load_and_execute_builder( table_builder.write_queries( path=pathlib.Path(f"{manifest._study_path}/reference_sql/" + new_filename) ) + elif prepare: + table_builder.prepare_queries( + config=config, + manifest=manifest, + parser=db_parser, + ) + _render_output( + manifest.get_study_prefix(), table_builder.queries, data_path, filename, query_count + ) else: table_builder.execute_queries( config=config, manifest=manifest, parser=db_parser, ) - + num_queries = len(table_builder.queries) # After running the executor code, we'll remove # it so it doesn't interfere with the next python module to # execute, since the subclass would otherwise hang around. del sys.modules[table_builder_module.__name__] del table_builder_module + return num_queries + def run_protected_table_builder( config: base_utils.StudyConfig, @@ -136,13 +158,31 @@ def run_protected_table_builder( def _run_workflow( - config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str -) -> None: + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + filename: str, + prepare: str, + data_path: pathlib.Path, + query_count: int, +) -> int: """Loads workflow config from toml definitions and executes workflow :param config: a StudyConfig object :param manifest: a StudyManifest object + :param filename: Filename of the workflow config + :param prepare: If true, will render query instead of executing + :param data_path: If prepare is true, the path to write rendered data to + :param query_count: if prepare is true, the number of queries already rendered + :returns: a count of processed queries """ + toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") + if prepare: + with open(toml_path) as file: + workflow_config = file.read() + _render_output( + manifest.get_study_prefix(), [workflow_config], data_path, filename, query_count, True + ) + return 1 existing_stats = [] if not config.stats_build: existing_stats = ( @@ -157,14 +197,13 @@ def _run_workflow( # but we're letting it slide so that builders function similarly # across the board safe_timestamp = base_utils.get_tablename_safe_iso_timestamp() - toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") with open(toml_path, "rb") as file: workflow_config = tomllib.load(file) config_type = workflow_config["config_type"] target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", "")) if (target_table,) in existing_stats and not config.stats_build: - return + return 0 match config_type: case "psm": builder = psm_builder.PsmBuilder( @@ -195,6 +234,7 @@ def _run_workflow( table_name=f"{target_table}_{safe_timestamp}", view_name=target_table, ) + return len(builder.queries) def build_matching_files( @@ -203,13 +243,23 @@ def build_matching_files( *, builder: str | None, db_parser: databases.DatabaseParser = None, + prepare: bool = False, + data_path: pathlib.Path, ): """targets all table builders matching a target string for running :param config: a StudyConfig object :param manifest: a StudyManifest object :keyword builder: filename of a module implementing a TableBuilder - :keyword db_parser: an object implementing DatabaseParser for the target database""" + :keyword db_parser: an object implementing DatabaseParser for the target database + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to + """ + if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]: + sys.exit( + f"{manifest.get_study_prefix()} does not support prepare mode. It must be run " + "directly against a target database." + ) all_generators = manifest.get_all_generators() matches = [] if not builder: # pragma: no cover @@ -218,7 +268,14 @@ def build_matching_files( for file in all_generators: if file.find(builder) != -1: matches.append(file) - build_study(config, manifest, db_parser=db_parser, file_list=matches) + build_study( + config, + manifest, + db_parser=db_parser, + file_list=matches, + prepare=prepare, + data_path=data_path, + ) def build_study( @@ -228,35 +285,87 @@ def build_study( db_parser: databases.DatabaseParser = None, continue_from: str | None = None, file_list: list | None = None, -) -> list: + prepare: bool = False, + data_path: pathlib.Path | None = None, +) -> None: """Creates tables in the schema by iterating through the sql_config.file_names :param config: a StudyConfig object :param manifest: a StudyManifest object + :keyword db_parser: a parser for the target database :keyword continue_from: Name of a file to resume table creation from - :returns: loaded queries (for unit testing only) + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to """ + if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]: + sys.exit( + f"{manifest.get_study_prefix()} does not support prepare mode. It must be run " + "directly against a target database." + ) if file_list is None: file_list = manifest.get_file_list(continue_from) + if prepare: + data_dir = data_path / manifest.get_study_prefix() + files = data_dir.glob("*") + [file.unlink() for file in files if file.is_file()] + query_count = 0 for file in file_list: if file.endswith(".py"): - _load_and_execute_builder( + query_count += _load_and_execute_builder( config=config, manifest=manifest, filename=file, db_parser=db_parser, + prepare=prepare, + data_path=data_path, + query_count=query_count, ) elif file.endswith(".toml"): - _run_workflow(config=config, manifest=manifest, filename=file) + query_count += _run_workflow( + config=config, + manifest=manifest, + filename=file, + prepare=prepare, + data_path=data_path, + query_count=query_count, + ) elif file.endswith(".sql"): - _run_raw_queries(config=config, manifest=manifest, filename=file) + query_count += _run_raw_queries( + config=config, + manifest=manifest, + filename=file, + prepare=prepare, + data_path=data_path, + query_count=query_count, + ) else: raise errors.StudyManifestParsingError + if prepare: + with zipfile.ZipFile( + f"{data_path}/{manifest.get_study_prefix()}.zip", "w", zipfile.ZIP_DEFLATED + ) as z: + for file in data_dir.glob("*"): + z.write(file, file.relative_to(data_dir)) def _run_raw_queries( - config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str -): + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + filename: str, + prepare: bool, + data_path: pathlib.Path, + query_count: int, +) -> int: + """Creates tables in the schema by iterating through the sql_config.file_names + + :param config: a StudyConfig object + :param manifest: a StudyManifest object + :param filename: the name of the sql file to read + :param prepare: If true, will render query instead of executing + :param data_path: If prepare is true, the path to write rendered data to + :param query_count: the number of queries currently processed + :returns number of queries processed: + """ queries = [] for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")): queries.append([query, filename]) @@ -266,25 +375,49 @@ def _run_raw_queries( f"`{manifest.get_study_prefix()}__", "`", ) - # We'll explicitly create a cursor since recreating cursors for each - # table in a study is slightly slower for some databases - cursor = config.db.cursor() - # We want to only show a progress bar if we are :not: printing SQL lines - with base_utils.get_progress_bar(disable=config.verbose) as progress: - task = progress.add_task( - f"Building tables from {filename}...", - total=len(queries), - visible=not config.verbose, + if prepare: + _render_output( + manifest.get_study_prefix(), [q[0] for q in queries], data_path, filename, query_count ) - _execute_build_queries( - config=config, - manifest=manifest, - cursor=cursor, - queries=queries, - progress=progress, - task=task, + else: + # We'll explicitly create a cursor since recreating cursors for each + # table in a study is slightly slower for some databases + cursor = config.db.cursor() + # We want to only show a progress bar if we are :not: printing SQL lines + with base_utils.get_progress_bar(disable=config.verbose) as progress: + task = progress.add_task( + f"Building tables from {filename}...", + total=len(queries), + visible=not config.verbose, + ) + _execute_build_queries( + config=config, + manifest=manifest, + cursor=cursor, + queries=queries, + progress=progress, + task=task, + ) + return len(queries) + + +def _render_output(study_name, outputs, data_path, filename, count, is_toml=False): + if is_toml: + suffix = "toml" + else: + suffix = "sql" + for output in enumerate(outputs): + if is_toml: + name = "config" + else: + name = re.search(r"(.*)__\w*", output[1])[0].lower().replace(" ", "_") + file_path = data_path / ( + f"{study_name}/{count+output[0]:04d}.{filename.rsplit('.', 1)[0]}" + f".{output[0]:02d}.{name}.{suffix}" ) - return queries + file_path.parent.mkdir(exist_ok=True, parents=True) + with open(file_path, "w") as f: + f.write(output[1]) def _query_error( diff --git a/cumulus_library/builders/valueset/umls.py b/cumulus_library/builders/valueset/umls.py index 966452a8..a6cc8c36 100644 --- a/cumulus_library/builders/valueset/umls.py +++ b/cumulus_library/builders/valueset/umls.py @@ -92,10 +92,12 @@ def generate_umls_tables( f"SELECT count(*) from {study_prefix}{table_prefix}umls_valuesets_rels" # noqa: S608 ).fetchone()[0] - cursor.execute(f"""CREATE TABLE {study_prefix}{table_prefix}umls_valuesets AS + cursor.execute( + f"""CREATE TABLE {study_prefix}{table_prefix}umls_valuesets AS SELECT distinct r.rxcui, v.* FROM {study_prefix}{table_prefix}umls_valuesets_rels as v, rxnorm.RXNCONSO as r WHERE v.code = r.code AND v.sab=r.sab --AND v.tty=r.tty -""") # noqa: S608 +""" # noqa: S608 + ) diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index f2cd0b71..33e61711 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -93,17 +93,24 @@ def clean_and_build_study( *, options: dict[str, str], continue_from: str | None = None, + prepare: bool = False, + data_path: pathlib.Path | None = None, ) -> None: """Recreates study views/tables :param target: A path to the study directory - :param options: The dictionary of study-specific options + :keyword options: The dictionary of study-specific options :keyword continue_from: Restart a run from a specific sql file (for dev only) + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to """ manifest = study_manifest.StudyManifest(target, self.data_path, options=options) try: - builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest) - if not continue_from: + if not prepare: + builder.run_protected_table_builder( + config=self.get_config(manifest), manifest=manifest + ) + if not continue_from and not prepare: log_utils.log_transaction( config=self.get_config(manifest), manifest=manifest, @@ -114,7 +121,7 @@ def clean_and_build_study( manifest=manifest, ) - else: + elif not prepare: log_utils.log_transaction( config=self.get_config(manifest), manifest=manifest, @@ -125,12 +132,15 @@ def clean_and_build_study( config=self.get_config(manifest), manifest=manifest, continue_from=continue_from, + data_path=data_path, + prepare=prepare, ) - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.FINISHED, - ) + if not prepare: + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.FINISHED, + ) except errors.StudyManifestFilesystemError as e: # This should be thrown prior to any database connections, so @@ -150,18 +160,24 @@ def build_matching_files( table_builder_name: str, *, options: dict[str, str], + prepare: bool = False, + data_path: pathlib.Path, ) -> None: """Runs a single table builder :param target: A path to the study directory :param table_builder_name: a builder file referenced in the study's manifest - :param options: The dictionary of study-specific options + :keyword options: The dictionary of study-specific options + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to """ manifest = study_manifest.StudyManifest(target, options=options) builder.build_matching_files( config=self.get_config(manifest), manifest=manifest, builder=table_builder_name, + prepare=prepare, + data_path=data_path, ) ### Data exporters @@ -325,13 +341,19 @@ def run_cli(args: dict): for target in args["target"]: if args["builder"]: runner.build_matching_files( - study_dict[target], args["builder"], options=args["options"] + study_dict[target], + args["builder"], + options=args["options"], + prepare=args["prepare"], + data_path=args["data_path"], ) else: runner.clean_and_build_study( study_dict[target], continue_from=args["continue_from"], options=args["options"], + prepare=args["prepare"], + data_path=args["data_path"], ) elif args["action"] == "export": diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index ec1830b9..6d9dd5e4 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -38,7 +38,7 @@ def add_custom_option(parser: argparse.ArgumentParser) -> None: def add_data_path_argument(parser: argparse.ArgumentParser) -> None: """Adds path arg to a subparser""" parser.add_argument( - "data_path", + "--data_path", default="./", nargs="?", help=( @@ -208,6 +208,11 @@ def get_parser() -> argparse.ArgumentParser: action="store_true", help="Forces file downloads/uploads to occur, even if they already exist", ) + build.add_argument( + "--prepare", + action="store_true", + help=("Prepares queries for remote execution."), + ) build.add_argument( "--statistics", action="store_true", diff --git a/tests/conftest.py b/tests/conftest.py index f602a247..824544d0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,9 @@ import copy import datetime import json +import os import pathlib +from unittest import mock import numpy import pandas @@ -197,6 +199,12 @@ def debug_diff_tables(cols, found, ref, pos=0): # Database fixtures +@pytest.fixture(scope="session", autouse=True) +def mock_env(): + with mock.patch.dict(os.environ, clear=True): + yield + + @pytest.fixture def mock_db(tmp_path): """Provides a DuckDatabaseBackend for local testing""" diff --git a/tests/test_athena.py b/tests/test_athena.py index 940b7708..18e233f4 100644 --- a/tests/test_athena.py +++ b/tests/test_athena.py @@ -1,7 +1,6 @@ """Edge case testing for Athena database support""" import json -import os import pathlib from unittest import mock @@ -46,10 +45,6 @@ def test_schema_parsing(): assert expected == parser.parse_found_schema(schema) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("botocore.session.Session") def test_upload_parquet_response_handling(mock_session): path = pathlib.Path(__file__).resolve().parent diff --git a/tests/test_cli.py b/tests/test_cli.py index 6d50bc22..546d0fba 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -673,7 +673,6 @@ def test_cli_upload_filter(mock_upload_data, mock_glob, args, calls): @pytest.mark.parametrize("mode", ["cli", "env"]) -@mock.patch.dict(os.environ, clear=True) # early exit with a dumb error @mock.patch("cumulus_library.base_utils.StudyConfig", side_effect=ZeroDivisionError) def test_cli_umls_parsing(mock_config, mode, tmp_path): @@ -688,7 +687,6 @@ def test_cli_umls_parsing(mock_config, mode, tmp_path): assert mock_config.call_args[1]["umls_key"] == "MY-KEY" -@mock.patch.dict(os.environ, clear=True) def test_cli_single_builder(tmp_path): cli.main(cli_args=duckdb_args(["build", "--builder=patient", "--target=core"], tmp_path)) db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db") @@ -700,7 +698,6 @@ def test_cli_single_builder(tmp_path): } == tables -@mock.patch.dict(os.environ, clear=True) def test_cli_finds_study_from_manifest_prefix(tmp_path): # This study is located inside a folder called `study_different_dir`, # but we're going to find it using its real study prefix from the manifest. @@ -715,7 +712,6 @@ def test_cli_finds_study_from_manifest_prefix(tmp_path): assert "study_different_name__table" in tables -@mock.patch.dict(os.environ, clear=True) @pytest.mark.parametrize( "option,raises", [ @@ -745,7 +741,6 @@ def test_cli_custom_args(mock_config, tmp_path, option, raises): assert called_options[option.split(":")[0]] == option.split(":")[1] -@mock.patch.dict(os.environ, clear=True) @mock.patch("cumulus_library.base_utils.StudyConfig") def test_cli_no_custom_args_yields_empty_dict(mock_config, tmp_path): mock_config.return_value.stats_clean = False @@ -765,7 +760,6 @@ def test_cli_no_custom_args_yields_empty_dict(mock_config, tmp_path): assert {} == called_options -@mock.patch.dict(os.environ, clear=True) def test_cli_import_study(tmp_path): test_data = {"string": ["a", "b", None]} df = pandas.DataFrame(test_data) @@ -790,7 +784,6 @@ def test_cli_import_study(tmp_path): ) -@mock.patch.dict(os.environ, clear=True) def test_dedicated_schema(tmp_path): core_build_args = duckdb_args( [ @@ -827,7 +820,6 @@ def test_dedicated_schema(tmp_path): assert table in tables -@mock.patch.dict(os.environ, clear=True) @mock.patch("cumulus_library.databases.duckdb.DuckDatabaseBackend") def test_sql_error_handling(mock_backend, tmp_path): mock_backend.return_value.cursor.return_value.execute.side_effect = [ @@ -848,7 +840,6 @@ def test_sql_error_handling(mock_backend, tmp_path): cli.main(cli_args=build_args) -@mock.patch.dict(os.environ, clear=True) def test_version(capfd): out = None with pytest.raises(SystemExit): @@ -870,7 +861,10 @@ def test_version(capfd): assert "study_invalid_bad_query: no version defined" in out -@mock.patch.dict(os.environ, clear=True) +@mock.patch.dict( + os.environ, + clear=True, +) def test_study_dir(tmp_path): os.environ["CUMULUS_LIBRARY_STUDY_DIR"] = str( pathlib.Path(__file__).resolve().parent / "test_data/" @@ -885,3 +879,84 @@ def test_study_dir(tmp_path): ) with does_not_raise(): cli.main(cli_args=build_args) + + +@pytest.mark.parametrize( + "study,expected_queries,generated_query,toml_file", + [ + ( + "study_valid", + [ + "0000.test.00.create_table_study_valid__table.sql", + "0001.test2.00.create_table_study_valid__table2.sql", + ], + "CREATE TABLE study_valid__table (test int)", + None, + ), + ( + "study_python_counts_valid", + [ + "0000.module1.00.create_table_if_not_exists_study_python_counts_valid__table1.sql", + "0001.module1.00.create_table_if_not_exists_study_python_counts_valid__table1.sql", + "0002.module2.00.create_table_if_not_exists_study_python_counts_valid__table2.sql", + ], + "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);", + None, + ), + ( + "psm_test", + ["0000.psm_cohort.00.create_table_psm_test__psm_cohort.sql"], + """CREATE TABLE psm_test__psm_cohort AS ( + SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04 +)""", + [ + "0001.psm_config.00.config.toml", + """config_type = "psm" +classification_json = "dsm5_classifications.json" +pos_source_table = "psm_test__psm_cohort" +neg_source_table = "core__condition" +target_table = "psm_test__psm_encounter_covariate" +primary_ref = 'encounter_ref' +count_ref = 'subject_ref' +count_table = 'core__condition' +dependent_variable = "example_diagnosis" +pos_sample_size = 20 +neg_sample_size = 100 +seed = 1234567890 +[join_cols_by_table.core__encounter] +join_id = "encounter_ref" +included_cols = [ + ["gender"], + ["race_display", "race"] +] +""", + ], + ), + ], +) +def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file): + build_args = duckdb_args( + [ + "build", + "-t", + study, + "-s", + "tests/test_data/", + "--data_path", + str(tmp_path), + "--prepare", + ], + tmp_path, + ) + with does_not_raise(): + cli.main(cli_args=build_args) + queries = list(pathlib.Path(tmp_path).glob("**/*.sql")) + for query in queries: + assert query.name in expected_queries + with open(queries[0]) as f: + assert f.read() == generated_query + if toml_file: + config = next(pathlib.Path(tmp_path).glob("**/*.toml")) + assert config.name == toml_file[0] + with open(config) as f: + assert f.read() == toml_file[1] diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql b/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql new file mode 100644 index 00000000..8002c3a1 --- /dev/null +++ b/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql @@ -0,0 +1,9 @@ +-- noqa: disable=all +-- This sql was autogenerated as a reference example using the library +-- CLI. Its format is tied to the specific database it was run against, +-- and it may not be correct for all databases. Use the CLI's build +-- option to derive the best SQL for your dataset. + +-- ########################################################### + +CREATE TABLE IF NOT EXISTS abc__counts (test int); diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql b/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql new file mode 100644 index 00000000..9d7aa00d --- /dev/null +++ b/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql @@ -0,0 +1,9 @@ +-- noqa: disable=all +-- This sql was autogenerated as a reference example using the library +-- CLI. Its format is tied to the specific database it was run against, +-- and it may not be correct for all databases. Use the CLI's build +-- option to derive the best SQL for your dataset. + +-- ########################################################### + +CREATE TABLE abc__meta_version AS SELECT 1 AS data_package_version; diff --git a/tests/test_data/study_python_counts_valid/module1.py b/tests/test_data/study_python_counts_valid/module1.py index ecbaf6e1..0871e908 100644 --- a/tests/test_data/study_python_counts_valid/module1.py +++ b/tests/test_data/study_python_counts_valid/module1.py @@ -4,8 +4,8 @@ class ModuleOneRunner(cumulus_library.CountsBuilder): display_text = "module1" - def __init__(self): - super().__init__() + def __init__(self, study_prefix): + super().__init__(study_prefix="study_python_counts_valid") def prepare_queries(self, *args, **kwargs): self.queries.append( diff --git a/tests/test_data/study_python_counts_valid/module2.py b/tests/test_data/study_python_counts_valid/module2.py index c6c7e80a..9d96f130 100644 --- a/tests/test_data/study_python_counts_valid/module2.py +++ b/tests/test_data/study_python_counts_valid/module2.py @@ -4,7 +4,7 @@ class ModuleTwoRunner(cumulus_library.CountsBuilder): display_text = "module2" - def __init__(self): + def __init__(self, study_prefix): super().__init__(study_prefix="study_python_counts_valid") def prepare_queries(self, *args, **kwargs): diff --git a/tests/test_databases.py b/tests/test_databases.py index bd5d8878..b726fe0e 100644 --- a/tests/test_databases.py +++ b/tests/test_databases.py @@ -28,10 +28,6 @@ } -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "db,data,expected,raises", [ @@ -77,10 +73,6 @@ def test_col_types_from_pandas(db, data, expected, raises): assert set(expected) == set(vals) -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "db,data,expected,raises", [ @@ -222,10 +214,6 @@ def test_pyarrow_types_from_sql(db, data, expected, raises): assert vals[index][-1] == expected[index][-1] -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "args,expected_type, raises", [ @@ -273,10 +261,6 @@ def test_create_db_backend(args, expected_type, raises): ### Database-specific edge case testing -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "args,sse,keycount,expected,raises", [ @@ -375,10 +359,6 @@ def test_upload_file_athena(mock_botocore, args, sse, keycount, expected, raises assert kwargs["Key"].endswith(args["file"].name) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("pyathena.connect") def test_athena_pandas_cursor(mock_pyathena): mock_as_pandas = mock.MagicMock() @@ -415,10 +395,6 @@ def test_athena_pandas_cursor(mock_pyathena): ) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("pyathena.connect") def test_athena_parser(mock_pyathena): db = databases.AthenaDatabaseBackend(**ATHENA_KWARGS) @@ -426,6 +402,8 @@ def test_athena_parser(mock_pyathena): assert isinstance(parser, databases.AthenaParser) +# Since this test creates an environment variable, create a dedicated +# mock for it @mock.patch.dict( os.environ, clear=True, diff --git a/tests/test_discovery.py b/tests/test_discovery.py index e85ad24f..48bb7502 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -10,10 +10,6 @@ from tests import conftest -@mock.patch.dict( - os.environ, - clear=True, -) def test_discovery(tmp_path): cli.main( cli_args=conftest.duckdb_args( diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py index 876c8581..1e8d4764 100644 --- a/tests/test_duckdb.py +++ b/tests/test_duckdb.py @@ -6,7 +6,6 @@ import tempfile from datetime import datetime from pathlib import Path -from unittest import mock import pytest @@ -14,10 +13,6 @@ from cumulus_library.template_sql import base_templates -@mock.patch.dict( - os.environ, - clear=True, -) def test_duckdb_core_build_and_export(tmp_path): data_dir = f"{Path(__file__).parent}/test_data/duckdb_data" cli.main( diff --git a/tests/test_dynamic_manifest.py b/tests/test_dynamic_manifest.py index e82251b7..6b76a512 100644 --- a/tests/test_dynamic_manifest.py +++ b/tests/test_dynamic_manifest.py @@ -20,10 +20,6 @@ ] -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "options,result", [ @@ -45,10 +41,6 @@ def test_manifest_with_dynamic_prefix(options, result): assert manifest.get_study_prefix() == result -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("sys.executable", new=None) def test_manifest_with_dynamic_prefix_and_no_executable(): """sys.executable must be valid for us to run a Python script""" @@ -56,10 +48,6 @@ def test_manifest_with_dynamic_prefix_and_no_executable(): study_manifest.StudyManifest(pathlib.Path("tests/test_data/study_dynamic_prefix")) -@mock.patch.dict( - os.environ, - clear=True, -) def test_cli_clean_with_dynamic_prefix(tmp_path): cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:dynamic2"], tmp_path)) cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS], tmp_path)) @@ -80,10 +68,6 @@ def test_cli_clean_with_dynamic_prefix(tmp_path): assert "dynamic2__meta_version" not in tables -@mock.patch.dict( - os.environ, - clear=True, -) def test_cli_export_with_dynamic_prefix(tmp_path): cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:abc"], tmp_path)) cli.main(cli_args=duckdb_args(["export", *STUDY_ARGS, "--option=prefix:abc"], tmp_path)) diff --git a/tests/test_static_file.py b/tests/test_static_file.py index bdea564a..ff89ec77 100644 --- a/tests/test_static_file.py +++ b/tests/test_static_file.py @@ -1,14 +1,7 @@ -import os -from unittest import mock - from cumulus_library import cli, databases from tests import conftest -@mock.patch.dict( - os.environ, - clear=True, -) def test_static_file(tmp_path): cli.main( cli_args=conftest.duckdb_args( diff --git a/tests/test_umls_api.py b/tests/test_umls_api.py index 99c8a30f..b2930b9d 100644 --- a/tests/test_umls_api.py +++ b/tests/test_umls_api.py @@ -1,7 +1,6 @@ import os import pathlib from contextlib import nullcontext as does_not_raise -from unittest import mock import pytest import responses @@ -18,10 +17,6 @@ EXPANSION_VALUESET_OID = "2.16.840.1.113762.1.4.1106.68" -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "api_key,validator_key,res_text,res_status,raises", [ @@ -45,10 +40,6 @@ def test_auth(api_key, validator_key, res_text, res_status, raises): assert api.validator_key == api_key -@mock.patch.dict( - os.environ, - clear=True, -) @responses.activate def test_auth_reads_env_var(): responses.add(responses.GET, AUTH_URL, body="true", status=200) @@ -63,10 +54,6 @@ def get_valueset_data(file_name): return f.read() -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "action,url,oid,expected_oids,raises", [ @@ -159,10 +146,6 @@ def test_get_valueset(action, url, oid, expected_oids, raises): assert data[i]["id"] == expected_oids[i] -@mock.patch.dict( - os.environ, - clear=True, -) @responses.activate def test_download_umls(tmp_path): # this zip file is just an archive made by targeting the other .json files diff --git a/tests/valueset/test_additional_rules_builder.py b/tests/valueset/test_additional_rules_builder.py index 755b9099..a4fb309f 100644 --- a/tests/valueset/test_additional_rules_builder.py +++ b/tests/valueset/test_additional_rules_builder.py @@ -1,5 +1,4 @@ import json -import os import pathlib import tomllib from unittest import mock @@ -16,10 +15,6 @@ @pytest.mark.parametrize("prefix", [(""), ("foo")]) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("cumulus_library.apis.umls.UmlsApi") def test_additional_rules(mock_api, mock_db_config_rxnorm, prefix): data_path = pathlib.Path(__file__).parent.parent / "test_data/valueset/" diff --git a/tests/valueset/test_rxnorm_valueset_builder.py b/tests/valueset/test_rxnorm_valueset_builder.py index 7494f377..1b2f7edc 100644 --- a/tests/valueset/test_rxnorm_valueset_builder.py +++ b/tests/valueset/test_rxnorm_valueset_builder.py @@ -1,5 +1,4 @@ import json -import os import pathlib import tomllib from unittest import mock @@ -15,10 +14,6 @@ @pytest.mark.parametrize("prefix", [(""), ("foo")]) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("cumulus_library.apis.umls.UmlsApi") def test_rxnorm_valueset_builder(mock_api, mock_db_config_rxnorm, prefix): data_path = pathlib.Path(__file__).parent.parent / "test_data/valueset/" diff --git a/tests/valueset/test_umls.py b/tests/valueset/test_umls.py index c4d1485c..ea7c21e2 100644 --- a/tests/valueset/test_umls.py +++ b/tests/valueset/test_umls.py @@ -1,6 +1,3 @@ -import os -from unittest import mock - import pytest from cumulus_library import StudyManifest @@ -8,10 +5,6 @@ @pytest.mark.parametrize("prefix", [(""), ("foo")]) -@mock.patch.dict( - os.environ, - clear=True, -) def test_umls_lookup(mock_db_config_rxnorm, prefix, tmp_path): with open(f"{tmp_path}/manifest.toml", "w", encoding="utf8") as f: f.write('study_prefix="test"') diff --git a/tests/valueset/test_valueset_builder.py b/tests/valueset/test_valueset_builder.py index b5a8e76c..41fd94ad 100644 --- a/tests/valueset/test_valueset_builder.py +++ b/tests/valueset/test_valueset_builder.py @@ -1,5 +1,4 @@ import json -import os import pathlib from contextlib import nullcontext as does_not_raise from unittest import mock @@ -19,10 +18,6 @@ (data_path / "invalid.toml", pytest.raises(SystemExit)), ], ) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("cumulus_library.apis.umls.UmlsApi") def test_valueset_builder(mock_api, mock_db_config_rxnorm, config_path, raises): with raises: From 09f61f9a2aa843bed166e39a7c4bcc5e29b6f3b2 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 10 Dec 2024 08:48:03 -0500 Subject: [PATCH 2/4] PR feedback --- cumulus_library/actions/builder.py | 91 ++++++++++++------- cumulus_library/actions/file_generator.py | 1 + cumulus_library/cli.py | 47 +++++----- cumulus_library/cli_parser.py | 4 +- tests/test_actions.py | 19 +--- tests/test_cli.py | 1 - .../reference_sql/counts.sql | 9 -- .../reference_sql/meta.sql | 9 -- 8 files changed, 92 insertions(+), 89 deletions(-) delete mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/counts.sql delete mode 100644 tests/test_data/study_dynamic_prefix/reference_sql/meta.sql diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index b1aa2028..f4230214 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -43,7 +43,7 @@ def _load_and_execute_builder( write_reference_sql: bool = False, doc_str: str | None = None, prepare: bool = False, - data_path: pathlib.Path | None = None, + data_path: pathlib.Path, query_count: int | None = None, ) -> int: """Loads a table builder from a file. @@ -177,10 +177,15 @@ def _run_workflow( """ toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") if prepare: - with open(toml_path) as file: + with open(toml_path, encoding="utf-8") as file: workflow_config = file.read() _render_output( - manifest.get_study_prefix(), [workflow_config], data_path, filename, query_count, True + manifest.get_study_prefix(), + [workflow_config], + data_path, + filename, + query_count, + is_toml=True, ) return 1 existing_stats = [] @@ -243,7 +248,7 @@ def build_matching_files( *, builder: str | None, db_parser: databases.DatabaseParser = None, - prepare: bool = False, + prepare: bool, data_path: pathlib.Path, ): """targets all table builders matching a target string for running @@ -255,11 +260,8 @@ def build_matching_files( :keyword prepare: If true, will render query instead of executing :keyword data_path: If prepare is true, the path to write rendered data to """ - if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]: - sys.exit( - f"{manifest.get_study_prefix()} does not support prepare mode. It must be run " - "directly against a target database." - ) + if prepare: + _check_if_preparable(manifest.get_study_prefix()) all_generators = manifest.get_all_generators() matches = [] if not builder: # pragma: no cover @@ -285,8 +287,8 @@ def build_study( db_parser: databases.DatabaseParser = None, continue_from: str | None = None, file_list: list | None = None, - prepare: bool = False, - data_path: pathlib.Path | None = None, + prepare: bool, + data_path: pathlib.Path | None, ) -> None: """Creates tables in the schema by iterating through the sql_config.file_names @@ -297,17 +299,15 @@ def build_study( :keyword prepare: If true, will render query instead of executing :keyword data_path: If prepare is true, the path to write rendered data to """ - if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]: - sys.exit( - f"{manifest.get_study_prefix()} does not support prepare mode. It must be run " - "directly against a target database." - ) + if prepare: + _check_if_preparable(manifest.get_study_prefix()) if file_list is None: file_list = manifest.get_file_list(continue_from) if prepare: data_dir = data_path / manifest.get_study_prefix() - files = data_dir.glob("*") - [file.unlink() for file in files if file.is_file()] + for file in data_dir.glob("*"): + if file.is_file(): + file.unlink() query_count = 0 for file in file_list: if file.endswith(".py"): @@ -316,8 +316,8 @@ def build_study( manifest=manifest, filename=file, db_parser=db_parser, - prepare=prepare, data_path=data_path, + prepare=prepare, query_count=query_count, ) elif file.endswith(".toml"): @@ -325,8 +325,8 @@ def build_study( config=config, manifest=manifest, filename=file, - prepare=prepare, data_path=data_path, + prepare=prepare, query_count=query_count, ) elif file.endswith(".sql"): @@ -334,8 +334,8 @@ def build_study( config=config, manifest=manifest, filename=file, - prepare=prepare, data_path=data_path, + prepare=prepare, query_count=query_count, ) else: @@ -344,7 +344,7 @@ def build_study( with zipfile.ZipFile( f"{data_path}/{manifest.get_study_prefix()}.zip", "w", zipfile.ZIP_DEFLATED ) as z: - for file in data_dir.glob("*"): + for file in data_dir.iterdir(): z.write(file, file.relative_to(data_dir)) @@ -352,8 +352,9 @@ def _run_raw_queries( config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str, + *, + data_path: pathlib.Path | None, prepare: bool, - data_path: pathlib.Path, query_count: int, ) -> int: """Creates tables in the schema by iterating through the sql_config.file_names @@ -401,23 +402,49 @@ def _run_raw_queries( return len(queries) -def _render_output(study_name, outputs, data_path, filename, count, is_toml=False): +def _render_output( + study_name: str, + outputs: list, + data_path: pathlib.Path, + filename: str, + count: int, + *, + is_toml: bool = False, +): if is_toml: suffix = "toml" else: suffix = "sql" - for output in enumerate(outputs): + for index, output in enumerate(outputs): if is_toml: name = "config" else: - name = re.search(r"(.*)__\w*", output[1])[0].lower().replace(" ", "_") - file_path = data_path / ( - f"{study_name}/{count+output[0]:04d}.{filename.rsplit('.', 1)[0]}" - f".{output[0]:02d}.{name}.{suffix}" - ) + # This regex attempts to discover the table name, via looking for the first + # dunder, and then gets the start of the line its on as a non-quote requiring + # part of a file name. So for example, finding SQL that looks like this: + # CREATE TABLE foo__bar AS (varchar baz) + # would result in `create_table_foo__bar` being assigned to name. The goal + # is to make this at least mildly parsable at the file system level if someone + # is reviewing a prepared study + name = re.search(r"(.*)__\w*", output)[0].lower().replace(" ", "_") + total_index = count + index + new_filename = f"{total_index:04d}.{filename.rsplit('.', 1)[0]}.{index:02d}.{name}.{suffix}" + file_path = data_path / f"{study_name}/{new_filename}" + file_path.parent.mkdir(exist_ok=True, parents=True) - with open(file_path, "w") as f: - f.write(output[1]) + with open(file_path, "w", encoding="UTF-8") as f: + f.write(output) + + +def _check_if_preparable(prefix): + # This list should include any study which requires interrogating the database to + # find if data is available to query (outside of a toml-driven workflow), + # which isn't doable as a distributed query + if prefix in ["core", "discovery", "data-metrics"]: + sys.exit( + f"Study '{prefix}'' does not support prepare mode. It must be run " + "directly against a target database." + ) def _query_error( diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py index be4a721a..f3920cfb 100644 --- a/cumulus_library/actions/file_generator.py +++ b/cumulus_library/actions/file_generator.py @@ -39,6 +39,7 @@ def run_generate_sql( db_parser=db_parser, write_reference_sql=True, doc_str=doc_str, + data_path=None, ) diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index 33e61711..547c3c75 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -106,27 +106,29 @@ def clean_and_build_study( """ manifest = study_manifest.StudyManifest(target, self.data_path, options=options) try: + print("prep", prepare) + print("cont", continue_from) if not prepare: builder.run_protected_table_builder( config=self.get_config(manifest), manifest=manifest ) - if not continue_from and not prepare: - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.STARTED, - ) - cleaner.clean_study( - config=self.get_config(manifest), - manifest=manifest, - ) - - elif not prepare: - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.RESUMED, - ) + if not continue_from: + print("continue") + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.STARTED, + ) + cleaner.clean_study( + config=self.get_config(manifest), + manifest=manifest, + ) + else: + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.RESUMED, + ) builder.build_study( config=self.get_config(manifest), @@ -147,11 +149,12 @@ def clean_and_build_study( # skipping logging raise e # pragma: no cover except Exception as e: - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.ERROR, - ) + if not prepare: + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.ERROR, + ) raise e def build_matching_files( diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index 6d9dd5e4..a1fb80ad 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -38,7 +38,7 @@ def add_custom_option(parser: argparse.ArgumentParser) -> None: def add_data_path_argument(parser: argparse.ArgumentParser) -> None: """Adds path arg to a subparser""" parser.add_argument( - "--data_path", + "data_path", default="./", nargs="?", help=( @@ -211,7 +211,7 @@ def get_parser() -> argparse.ArgumentParser: build.add_argument( "--prepare", action="store_true", - help=("Prepares queries for remote execution."), + help=argparse.SUPPRESS, ) build.add_argument( "--statistics", diff --git a/tests/test_actions.py b/tests/test_actions.py index 9e3a27aa..e963e5b1 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -172,10 +172,7 @@ def test_run_protected_table_builder(mock_db_config, study_path, stats): def test_table_builder(mock_db_config, study_path, verbose, expects, raises): with raises: manifest = study_manifest.StudyManifest(pathlib.Path(study_path)) - builder.build_study( - config=mock_db_config, - manifest=manifest, - ) + builder.build_study(config=mock_db_config, manifest=manifest, data_path=None, prepare=False) tables = ( mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") @@ -248,10 +245,7 @@ def test_build_study(mock_db_config, study_path, verbose, expects, raises): table_cols_types=table.column_types, ) mock_db_config.db.cursor().execute(query) - builder.build_study( - config=mock_db_config, - manifest=manifest, - ) + builder.build_study(config=mock_db_config, manifest=manifest, data_path=None, prepare=False) tables = ( mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") @@ -309,7 +303,7 @@ def test_run_psm_statistics_builders( table_name="psm_test__psm_encounter_2023_06_15", view_name="psm_test__psm_encounter_covariate", ) - builder.build_study(config=config, manifest=manifest) + builder.build_study(config=config, manifest=manifest, prepare=False, data_path=None) tables = ( mock_db_stats_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables") @@ -333,10 +327,7 @@ def test_invoke_valueset_builder(mock_builder, mock_db_config, tmp_path): config = base_utils.StudyConfig( db=mock_db_config.db, schema=mock_db_config.schema, stats_build=True ) - builder.build_study( - config=config, - manifest=manifest, - ) + builder.build_study(config=config, manifest=manifest, prepare=False, data_path=None) assert mock_builder.is_called() @@ -567,7 +558,7 @@ def test_generate_md(mock_db_config, tmp_path): manifest = study_manifest.StudyManifest( study_path=pathlib.Path(f"{tmp_path}/study_python_valid/") ) - builder.build_study(config=mock_db_config, manifest=manifest) + builder.build_study(config=mock_db_config, manifest=manifest, prepare=False, data_path=None) file_generator.run_generate_markdown(config=mock_db_config, manifest=manifest) with open(f"{tmp_path}/study_python_valid/study_python_valid_generated.md") as f: generated_md = f.read() diff --git a/tests/test_cli.py b/tests/test_cli.py index 546d0fba..a3db7fab 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -942,7 +942,6 @@ def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_ study, "-s", "tests/test_data/", - "--data_path", str(tmp_path), "--prepare", ], diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql b/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql deleted file mode 100644 index 8002c3a1..00000000 --- a/tests/test_data/study_dynamic_prefix/reference_sql/counts.sql +++ /dev/null @@ -1,9 +0,0 @@ --- noqa: disable=all --- This sql was autogenerated as a reference example using the library --- CLI. Its format is tied to the specific database it was run against, --- and it may not be correct for all databases. Use the CLI's build --- option to derive the best SQL for your dataset. - --- ########################################################### - -CREATE TABLE IF NOT EXISTS abc__counts (test int); diff --git a/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql b/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql deleted file mode 100644 index 9d7aa00d..00000000 --- a/tests/test_data/study_dynamic_prefix/reference_sql/meta.sql +++ /dev/null @@ -1,9 +0,0 @@ --- noqa: disable=all --- This sql was autogenerated as a reference example using the library --- CLI. Its format is tied to the specific database it was run against, --- and it may not be correct for all databases. Use the CLI's build --- option to derive the best SQL for your dataset. - --- ########################################################### - -CREATE TABLE abc__meta_version AS SELECT 1 AS data_package_version; From 71f7a9b713658f777c08e4dab6fe8dc8c159e80d Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 10 Dec 2024 09:22:26 -0500 Subject: [PATCH 3/4] sqlfluff update --- cumulus_library/.sqlfluff | 1 + tests/test_cli.py | 3 ++- tests/test_data/psm/psm_cohort.sql | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff index a3bccbd4..2a69a5f1 100644 --- a/cumulus_library/.sqlfluff +++ b/cumulus_library/.sqlfluff @@ -6,6 +6,7 @@ exclude_rules= # these rule overfires on athena nested arrays references.from, structure.column_order, + structure.unused_join, aliasing.unused, # this rule interferes with FHIR naming conventions capitalisation.identifiers diff --git a/tests/test_cli.py b/tests/test_cli.py index a3db7fab..7377d222 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -907,7 +907,8 @@ def test_study_dir(tmp_path): "psm_test", ["0000.psm_cohort.00.create_table_psm_test__psm_cohort.sql"], """CREATE TABLE psm_test__psm_cohort AS ( - SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04 + SELECT * FROM core__condition --noqa: AM04 + ORDER BY id DESC LIMIT 100 )""", [ "0001.psm_config.00.config.toml", diff --git a/tests/test_data/psm/psm_cohort.sql b/tests/test_data/psm/psm_cohort.sql index eb971142..13a19814 100644 --- a/tests/test_data/psm/psm_cohort.sql +++ b/tests/test_data/psm/psm_cohort.sql @@ -1,3 +1,4 @@ CREATE TABLE psm_test__psm_cohort AS ( - SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04 + SELECT * FROM core__condition --noqa: AM04 + ORDER BY id DESC LIMIT 100 ); From 944be91b874411c0ec7132af4ca06d4663dc28e1 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 10 Dec 2024 09:54:46 -0500 Subject: [PATCH 4/4] coverage --- cumulus_library/actions/builder.py | 4 +- cumulus_library/actions/file_generator.py | 2 +- tests/test_cli.py | 57 ++++++++++++----------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index f4230214..39d874bc 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -261,7 +261,7 @@ def build_matching_files( :keyword data_path: If prepare is true, the path to write rendered data to """ if prepare: - _check_if_preparable(manifest.get_study_prefix()) + _check_if_preparable(manifest.get_study_prefix()) # pragma: no cover all_generators = manifest.get_all_generators() matches = [] if not builder: # pragma: no cover @@ -306,7 +306,7 @@ def build_study( if prepare: data_dir = data_path / manifest.get_study_prefix() for file in data_dir.glob("*"): - if file.is_file(): + if file.is_file(): # pragma: no cover file.unlink() query_count = 0 for file in file_list: diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py index f3920cfb..1ed943d2 100644 --- a/cumulus_library/actions/file_generator.py +++ b/cumulus_library/actions/file_generator.py @@ -31,7 +31,7 @@ def run_generate_sql( ) for file in all_generators: if table_builder and file.find(table_builder) == -1: - continue + continue # pragma: no cover builder._load_and_execute_builder( config=config, manifest=manifest, diff --git a/tests/test_cli.py b/tests/test_cli.py index 7377d222..46b6d39e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -882,7 +882,7 @@ def test_study_dir(tmp_path): @pytest.mark.parametrize( - "study,expected_queries,generated_query,toml_file", + "study,expected_queries,generated_query,toml_file, raises", [ ( "study_valid", @@ -892,6 +892,7 @@ def test_study_dir(tmp_path): ], "CREATE TABLE study_valid__table (test int)", None, + does_not_raise(), ), ( "study_python_counts_valid", @@ -902,6 +903,7 @@ def test_study_dir(tmp_path): ], "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);", None, + does_not_raise(), ), ( "psm_test", @@ -932,31 +934,34 @@ def test_study_dir(tmp_path): ] """, ], + does_not_raise(), ), + ("core", [], "", None, pytest.raises(SystemExit)), ], ) -def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file): - build_args = duckdb_args( - [ - "build", - "-t", - study, - "-s", - "tests/test_data/", - str(tmp_path), - "--prepare", - ], - tmp_path, - ) - with does_not_raise(): - cli.main(cli_args=build_args) - queries = list(pathlib.Path(tmp_path).glob("**/*.sql")) - for query in queries: - assert query.name in expected_queries - with open(queries[0]) as f: - assert f.read() == generated_query - if toml_file: - config = next(pathlib.Path(tmp_path).glob("**/*.toml")) - assert config.name == toml_file[0] - with open(config) as f: - assert f.read() == toml_file[1] +def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file, raises): + with raises: + build_args = duckdb_args( + [ + "build", + "-t", + study, + "-s", + "tests/test_data/", + str(tmp_path), + "--prepare", + ], + tmp_path, + ) + with does_not_raise(): + cli.main(cli_args=build_args) + queries = list(pathlib.Path(tmp_path).glob("**/*.sql")) + for query in queries: + assert query.name in expected_queries + with open(queries[0]) as f: + assert f.read() == generated_query + if toml_file: + config = next(pathlib.Path(tmp_path).glob("**/*.toml")) + assert config.name == toml_file[0] + with open(config) as f: + assert f.read() == toml_file[1]