From 763e0dee7957c754ba217ccb4f2a1a079489a363 Mon Sep 17 00:00:00 2001 From: matt garber Date: Tue, 10 Dec 2024 10:07:45 -0500 Subject: [PATCH] Packaging mode for studies (#323) * Packaging mode for studies * PR feedback * sqlfluff update * coverage --- cumulus_library/.sqlfluff | 1 + cumulus_library/actions/builder.py | 230 +++++++++++++++--- cumulus_library/actions/file_generator.py | 3 +- cumulus_library/builders/valueset/umls.py | 6 +- cumulus_library/cli.py | 85 ++++--- cumulus_library/cli_parser.py | 5 + tests/conftest.py | 8 + tests/test_actions.py | 19 +- tests/test_athena.py | 5 - tests/test_cli.py | 100 +++++++- tests/test_data/psm/psm_cohort.sql | 3 +- .../study_python_counts_valid/module1.py | 4 +- .../study_python_counts_valid/module2.py | 2 +- tests/test_databases.py | 26 +- tests/test_discovery.py | 4 - tests/test_duckdb.py | 5 - tests/test_dynamic_manifest.py | 16 -- tests/test_static_file.py | 7 - tests/test_umls_api.py | 17 -- .../valueset/test_additional_rules_builder.py | 5 - .../valueset/test_rxnorm_valueset_builder.py | 5 - tests/valueset/test_umls.py | 7 - tests/valueset/test_valueset_builder.py | 5 - 23 files changed, 372 insertions(+), 196 deletions(-) diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff index a3bccbd4..2a69a5f1 100644 --- a/cumulus_library/.sqlfluff +++ b/cumulus_library/.sqlfluff @@ -6,6 +6,7 @@ exclude_rules= # these rule overfires on athena nested arrays references.from, structure.column_order, + structure.unused_join, aliasing.unused, # this rule interferes with FHIR naming conventions capitalisation.identifiers diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 119463a9..39d874bc 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -7,11 +7,13 @@ import re import sys import tomllib +import zipfile from rich.progress import Progress, TaskID from cumulus_library import ( BaseTableBuilder, + CountsBuilder, base_utils, databases, enums, @@ -40,7 +42,10 @@ def _load_and_execute_builder( db_parser: databases.DatabaseParser = None, write_reference_sql: bool = False, doc_str: str | None = None, -) -> None: + prepare: bool = False, + data_path: pathlib.Path, + query_count: int | None = None, +) -> int: """Loads a table builder from a file. :param config: a StudyConfig object @@ -49,6 +54,10 @@ def _load_and_execute_builder( :keyword db_parser: an object implementing DatabaseParser for the target database :keyword write_reference_sql: if true, writes sql to disk inside a study's directory :keyword doc_str: A string to insert between queries written to disk + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to + :keyword query_count: if prepare is true, the number of queries already rendered + :returns: number of processed queries """ # Since we have to support arbitrary user-defined python files here, we @@ -79,8 +88,7 @@ def _load_and_execute_builder( f"Error loading {manifest._study_path}{filename}\n" "Custom builders must extend the BaseTableBuilder class." ) - - # Remove instances of intermediate classes, if present + # Remove instances of intermediate classes, if present (usually not) table_builder_subclasses = list( filter(lambda x: x.__name__ != "CountsBuilder", table_builder_subclasses) ) @@ -89,7 +97,10 @@ def _load_and_execute_builder( # remove it so it doesn't interfere with the next python module to # execute, since the subclass would otherwise hang around. table_builder_class = table_builder_subclasses[0] - table_builder = table_builder_class() + if issubclass(table_builder_class, CountsBuilder): + table_builder = table_builder_class(study_prefix=manifest.get_study_prefix()) + else: + table_builder = table_builder_class() if write_reference_sql: prefix = manifest.get_study_prefix() table_builder.prepare_queries(config=config, manifest=manifest, parser=db_parser) @@ -105,19 +116,30 @@ def _load_and_execute_builder( table_builder.write_queries( path=pathlib.Path(f"{manifest._study_path}/reference_sql/" + new_filename) ) + elif prepare: + table_builder.prepare_queries( + config=config, + manifest=manifest, + parser=db_parser, + ) + _render_output( + manifest.get_study_prefix(), table_builder.queries, data_path, filename, query_count + ) else: table_builder.execute_queries( config=config, manifest=manifest, parser=db_parser, ) - + num_queries = len(table_builder.queries) # After running the executor code, we'll remove # it so it doesn't interfere with the next python module to # execute, since the subclass would otherwise hang around. del sys.modules[table_builder_module.__name__] del table_builder_module + return num_queries + def run_protected_table_builder( config: base_utils.StudyConfig, @@ -136,13 +158,36 @@ def run_protected_table_builder( def _run_workflow( - config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str -) -> None: + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + filename: str, + prepare: str, + data_path: pathlib.Path, + query_count: int, +) -> int: """Loads workflow config from toml definitions and executes workflow :param config: a StudyConfig object :param manifest: a StudyManifest object + :param filename: Filename of the workflow config + :param prepare: If true, will render query instead of executing + :param data_path: If prepare is true, the path to write rendered data to + :param query_count: if prepare is true, the number of queries already rendered + :returns: a count of processed queries """ + toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") + if prepare: + with open(toml_path, encoding="utf-8") as file: + workflow_config = file.read() + _render_output( + manifest.get_study_prefix(), + [workflow_config], + data_path, + filename, + query_count, + is_toml=True, + ) + return 1 existing_stats = [] if not config.stats_build: existing_stats = ( @@ -157,14 +202,13 @@ def _run_workflow( # but we're letting it slide so that builders function similarly # across the board safe_timestamp = base_utils.get_tablename_safe_iso_timestamp() - toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") with open(toml_path, "rb") as file: workflow_config = tomllib.load(file) config_type = workflow_config["config_type"] target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", "")) if (target_table,) in existing_stats and not config.stats_build: - return + return 0 match config_type: case "psm": builder = psm_builder.PsmBuilder( @@ -195,6 +239,7 @@ def _run_workflow( table_name=f"{target_table}_{safe_timestamp}", view_name=target_table, ) + return len(builder.queries) def build_matching_files( @@ -203,13 +248,20 @@ def build_matching_files( *, builder: str | None, db_parser: databases.DatabaseParser = None, + prepare: bool, + data_path: pathlib.Path, ): """targets all table builders matching a target string for running :param config: a StudyConfig object :param manifest: a StudyManifest object :keyword builder: filename of a module implementing a TableBuilder - :keyword db_parser: an object implementing DatabaseParser for the target database""" + :keyword db_parser: an object implementing DatabaseParser for the target database + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to + """ + if prepare: + _check_if_preparable(manifest.get_study_prefix()) # pragma: no cover all_generators = manifest.get_all_generators() matches = [] if not builder: # pragma: no cover @@ -218,7 +270,14 @@ def build_matching_files( for file in all_generators: if file.find(builder) != -1: matches.append(file) - build_study(config, manifest, db_parser=db_parser, file_list=matches) + build_study( + config, + manifest, + db_parser=db_parser, + file_list=matches, + prepare=prepare, + data_path=data_path, + ) def build_study( @@ -228,35 +287,86 @@ def build_study( db_parser: databases.DatabaseParser = None, continue_from: str | None = None, file_list: list | None = None, -) -> list: + prepare: bool, + data_path: pathlib.Path | None, +) -> None: """Creates tables in the schema by iterating through the sql_config.file_names :param config: a StudyConfig object :param manifest: a StudyManifest object + :keyword db_parser: a parser for the target database :keyword continue_from: Name of a file to resume table creation from - :returns: loaded queries (for unit testing only) + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to """ + if prepare: + _check_if_preparable(manifest.get_study_prefix()) if file_list is None: file_list = manifest.get_file_list(continue_from) + if prepare: + data_dir = data_path / manifest.get_study_prefix() + for file in data_dir.glob("*"): + if file.is_file(): # pragma: no cover + file.unlink() + query_count = 0 for file in file_list: if file.endswith(".py"): - _load_and_execute_builder( + query_count += _load_and_execute_builder( config=config, manifest=manifest, filename=file, db_parser=db_parser, + data_path=data_path, + prepare=prepare, + query_count=query_count, ) elif file.endswith(".toml"): - _run_workflow(config=config, manifest=manifest, filename=file) + query_count += _run_workflow( + config=config, + manifest=manifest, + filename=file, + data_path=data_path, + prepare=prepare, + query_count=query_count, + ) elif file.endswith(".sql"): - _run_raw_queries(config=config, manifest=manifest, filename=file) + query_count += _run_raw_queries( + config=config, + manifest=manifest, + filename=file, + data_path=data_path, + prepare=prepare, + query_count=query_count, + ) else: raise errors.StudyManifestParsingError + if prepare: + with zipfile.ZipFile( + f"{data_path}/{manifest.get_study_prefix()}.zip", "w", zipfile.ZIP_DEFLATED + ) as z: + for file in data_dir.iterdir(): + z.write(file, file.relative_to(data_dir)) def _run_raw_queries( - config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str -): + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + filename: str, + *, + data_path: pathlib.Path | None, + prepare: bool, + query_count: int, +) -> int: + """Creates tables in the schema by iterating through the sql_config.file_names + + :param config: a StudyConfig object + :param manifest: a StudyManifest object + :param filename: the name of the sql file to read + :param prepare: If true, will render query instead of executing + :param data_path: If prepare is true, the path to write rendered data to + :param query_count: the number of queries currently processed + :returns number of queries processed: + """ queries = [] for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")): queries.append([query, filename]) @@ -266,25 +376,75 @@ def _run_raw_queries( f"`{manifest.get_study_prefix()}__", "`", ) - # We'll explicitly create a cursor since recreating cursors for each - # table in a study is slightly slower for some databases - cursor = config.db.cursor() - # We want to only show a progress bar if we are :not: printing SQL lines - with base_utils.get_progress_bar(disable=config.verbose) as progress: - task = progress.add_task( - f"Building tables from {filename}...", - total=len(queries), - visible=not config.verbose, + if prepare: + _render_output( + manifest.get_study_prefix(), [q[0] for q in queries], data_path, filename, query_count ) - _execute_build_queries( - config=config, - manifest=manifest, - cursor=cursor, - queries=queries, - progress=progress, - task=task, + else: + # We'll explicitly create a cursor since recreating cursors for each + # table in a study is slightly slower for some databases + cursor = config.db.cursor() + # We want to only show a progress bar if we are :not: printing SQL lines + with base_utils.get_progress_bar(disable=config.verbose) as progress: + task = progress.add_task( + f"Building tables from {filename}...", + total=len(queries), + visible=not config.verbose, + ) + _execute_build_queries( + config=config, + manifest=manifest, + cursor=cursor, + queries=queries, + progress=progress, + task=task, + ) + return len(queries) + + +def _render_output( + study_name: str, + outputs: list, + data_path: pathlib.Path, + filename: str, + count: int, + *, + is_toml: bool = False, +): + if is_toml: + suffix = "toml" + else: + suffix = "sql" + for index, output in enumerate(outputs): + if is_toml: + name = "config" + else: + # This regex attempts to discover the table name, via looking for the first + # dunder, and then gets the start of the line its on as a non-quote requiring + # part of a file name. So for example, finding SQL that looks like this: + # CREATE TABLE foo__bar AS (varchar baz) + # would result in `create_table_foo__bar` being assigned to name. The goal + # is to make this at least mildly parsable at the file system level if someone + # is reviewing a prepared study + name = re.search(r"(.*)__\w*", output)[0].lower().replace(" ", "_") + total_index = count + index + new_filename = f"{total_index:04d}.{filename.rsplit('.', 1)[0]}.{index:02d}.{name}.{suffix}" + file_path = data_path / f"{study_name}/{new_filename}" + + file_path.parent.mkdir(exist_ok=True, parents=True) + with open(file_path, "w", encoding="UTF-8") as f: + f.write(output) + + +def _check_if_preparable(prefix): + # This list should include any study which requires interrogating the database to + # find if data is available to query (outside of a toml-driven workflow), + # which isn't doable as a distributed query + if prefix in ["core", "discovery", "data-metrics"]: + sys.exit( + f"Study '{prefix}'' does not support prepare mode. It must be run " + "directly against a target database." ) - return queries def _query_error( diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py index be4a721a..1ed943d2 100644 --- a/cumulus_library/actions/file_generator.py +++ b/cumulus_library/actions/file_generator.py @@ -31,7 +31,7 @@ def run_generate_sql( ) for file in all_generators: if table_builder and file.find(table_builder) == -1: - continue + continue # pragma: no cover builder._load_and_execute_builder( config=config, manifest=manifest, @@ -39,6 +39,7 @@ def run_generate_sql( db_parser=db_parser, write_reference_sql=True, doc_str=doc_str, + data_path=None, ) diff --git a/cumulus_library/builders/valueset/umls.py b/cumulus_library/builders/valueset/umls.py index 966452a8..a6cc8c36 100644 --- a/cumulus_library/builders/valueset/umls.py +++ b/cumulus_library/builders/valueset/umls.py @@ -92,10 +92,12 @@ def generate_umls_tables( f"SELECT count(*) from {study_prefix}{table_prefix}umls_valuesets_rels" # noqa: S608 ).fetchone()[0] - cursor.execute(f"""CREATE TABLE {study_prefix}{table_prefix}umls_valuesets AS + cursor.execute( + f"""CREATE TABLE {study_prefix}{table_prefix}umls_valuesets AS SELECT distinct r.rxcui, v.* FROM {study_prefix}{table_prefix}umls_valuesets_rels as v, rxnorm.RXNCONSO as r WHERE v.code = r.code AND v.sab=r.sab --AND v.tty=r.tty -""") # noqa: S608 +""" # noqa: S608 + ) diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index f2cd0b71..547c3c75 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -93,55 +93,68 @@ def clean_and_build_study( *, options: dict[str, str], continue_from: str | None = None, + prepare: bool = False, + data_path: pathlib.Path | None = None, ) -> None: """Recreates study views/tables :param target: A path to the study directory - :param options: The dictionary of study-specific options + :keyword options: The dictionary of study-specific options :keyword continue_from: Restart a run from a specific sql file (for dev only) + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to """ manifest = study_manifest.StudyManifest(target, self.data_path, options=options) try: - builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest) - if not continue_from: - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.STARTED, - ) - cleaner.clean_study( - config=self.get_config(manifest), - manifest=manifest, - ) - - else: - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.RESUMED, + print("prep", prepare) + print("cont", continue_from) + if not prepare: + builder.run_protected_table_builder( + config=self.get_config(manifest), manifest=manifest ) + if not continue_from: + print("continue") + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.STARTED, + ) + cleaner.clean_study( + config=self.get_config(manifest), + manifest=manifest, + ) + else: + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.RESUMED, + ) builder.build_study( config=self.get_config(manifest), manifest=manifest, continue_from=continue_from, + data_path=data_path, + prepare=prepare, ) - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.FINISHED, - ) + if not prepare: + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.FINISHED, + ) except errors.StudyManifestFilesystemError as e: # This should be thrown prior to any database connections, so # skipping logging raise e # pragma: no cover except Exception as e: - log_utils.log_transaction( - config=self.get_config(manifest), - manifest=manifest, - status=enums.LogStatuses.ERROR, - ) + if not prepare: + log_utils.log_transaction( + config=self.get_config(manifest), + manifest=manifest, + status=enums.LogStatuses.ERROR, + ) raise e def build_matching_files( @@ -150,18 +163,24 @@ def build_matching_files( table_builder_name: str, *, options: dict[str, str], + prepare: bool = False, + data_path: pathlib.Path, ) -> None: """Runs a single table builder :param target: A path to the study directory :param table_builder_name: a builder file referenced in the study's manifest - :param options: The dictionary of study-specific options + :keyword options: The dictionary of study-specific options + :keyword prepare: If true, will render query instead of executing + :keyword data_path: If prepare is true, the path to write rendered data to """ manifest = study_manifest.StudyManifest(target, options=options) builder.build_matching_files( config=self.get_config(manifest), manifest=manifest, builder=table_builder_name, + prepare=prepare, + data_path=data_path, ) ### Data exporters @@ -325,13 +344,19 @@ def run_cli(args: dict): for target in args["target"]: if args["builder"]: runner.build_matching_files( - study_dict[target], args["builder"], options=args["options"] + study_dict[target], + args["builder"], + options=args["options"], + prepare=args["prepare"], + data_path=args["data_path"], ) else: runner.clean_and_build_study( study_dict[target], continue_from=args["continue_from"], options=args["options"], + prepare=args["prepare"], + data_path=args["data_path"], ) elif args["action"] == "export": diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index ec1830b9..a1fb80ad 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -208,6 +208,11 @@ def get_parser() -> argparse.ArgumentParser: action="store_true", help="Forces file downloads/uploads to occur, even if they already exist", ) + build.add_argument( + "--prepare", + action="store_true", + help=argparse.SUPPRESS, + ) build.add_argument( "--statistics", action="store_true", diff --git a/tests/conftest.py b/tests/conftest.py index f602a247..824544d0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,9 @@ import copy import datetime import json +import os import pathlib +from unittest import mock import numpy import pandas @@ -197,6 +199,12 @@ def debug_diff_tables(cols, found, ref, pos=0): # Database fixtures +@pytest.fixture(scope="session", autouse=True) +def mock_env(): + with mock.patch.dict(os.environ, clear=True): + yield + + @pytest.fixture def mock_db(tmp_path): """Provides a DuckDatabaseBackend for local testing""" diff --git a/tests/test_actions.py b/tests/test_actions.py index 9e3a27aa..e963e5b1 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -172,10 +172,7 @@ def test_run_protected_table_builder(mock_db_config, study_path, stats): def test_table_builder(mock_db_config, study_path, verbose, expects, raises): with raises: manifest = study_manifest.StudyManifest(pathlib.Path(study_path)) - builder.build_study( - config=mock_db_config, - manifest=manifest, - ) + builder.build_study(config=mock_db_config, manifest=manifest, data_path=None, prepare=False) tables = ( mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") @@ -248,10 +245,7 @@ def test_build_study(mock_db_config, study_path, verbose, expects, raises): table_cols_types=table.column_types, ) mock_db_config.db.cursor().execute(query) - builder.build_study( - config=mock_db_config, - manifest=manifest, - ) + builder.build_study(config=mock_db_config, manifest=manifest, data_path=None, prepare=False) tables = ( mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") @@ -309,7 +303,7 @@ def test_run_psm_statistics_builders( table_name="psm_test__psm_encounter_2023_06_15", view_name="psm_test__psm_encounter_covariate", ) - builder.build_study(config=config, manifest=manifest) + builder.build_study(config=config, manifest=manifest, prepare=False, data_path=None) tables = ( mock_db_stats_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables") @@ -333,10 +327,7 @@ def test_invoke_valueset_builder(mock_builder, mock_db_config, tmp_path): config = base_utils.StudyConfig( db=mock_db_config.db, schema=mock_db_config.schema, stats_build=True ) - builder.build_study( - config=config, - manifest=manifest, - ) + builder.build_study(config=config, manifest=manifest, prepare=False, data_path=None) assert mock_builder.is_called() @@ -567,7 +558,7 @@ def test_generate_md(mock_db_config, tmp_path): manifest = study_manifest.StudyManifest( study_path=pathlib.Path(f"{tmp_path}/study_python_valid/") ) - builder.build_study(config=mock_db_config, manifest=manifest) + builder.build_study(config=mock_db_config, manifest=manifest, prepare=False, data_path=None) file_generator.run_generate_markdown(config=mock_db_config, manifest=manifest) with open(f"{tmp_path}/study_python_valid/study_python_valid_generated.md") as f: generated_md = f.read() diff --git a/tests/test_athena.py b/tests/test_athena.py index 940b7708..18e233f4 100644 --- a/tests/test_athena.py +++ b/tests/test_athena.py @@ -1,7 +1,6 @@ """Edge case testing for Athena database support""" import json -import os import pathlib from unittest import mock @@ -46,10 +45,6 @@ def test_schema_parsing(): assert expected == parser.parse_found_schema(schema) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("botocore.session.Session") def test_upload_parquet_response_handling(mock_session): path = pathlib.Path(__file__).resolve().parent diff --git a/tests/test_cli.py b/tests/test_cli.py index 6d50bc22..46b6d39e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -673,7 +673,6 @@ def test_cli_upload_filter(mock_upload_data, mock_glob, args, calls): @pytest.mark.parametrize("mode", ["cli", "env"]) -@mock.patch.dict(os.environ, clear=True) # early exit with a dumb error @mock.patch("cumulus_library.base_utils.StudyConfig", side_effect=ZeroDivisionError) def test_cli_umls_parsing(mock_config, mode, tmp_path): @@ -688,7 +687,6 @@ def test_cli_umls_parsing(mock_config, mode, tmp_path): assert mock_config.call_args[1]["umls_key"] == "MY-KEY" -@mock.patch.dict(os.environ, clear=True) def test_cli_single_builder(tmp_path): cli.main(cli_args=duckdb_args(["build", "--builder=patient", "--target=core"], tmp_path)) db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db") @@ -700,7 +698,6 @@ def test_cli_single_builder(tmp_path): } == tables -@mock.patch.dict(os.environ, clear=True) def test_cli_finds_study_from_manifest_prefix(tmp_path): # This study is located inside a folder called `study_different_dir`, # but we're going to find it using its real study prefix from the manifest. @@ -715,7 +712,6 @@ def test_cli_finds_study_from_manifest_prefix(tmp_path): assert "study_different_name__table" in tables -@mock.patch.dict(os.environ, clear=True) @pytest.mark.parametrize( "option,raises", [ @@ -745,7 +741,6 @@ def test_cli_custom_args(mock_config, tmp_path, option, raises): assert called_options[option.split(":")[0]] == option.split(":")[1] -@mock.patch.dict(os.environ, clear=True) @mock.patch("cumulus_library.base_utils.StudyConfig") def test_cli_no_custom_args_yields_empty_dict(mock_config, tmp_path): mock_config.return_value.stats_clean = False @@ -765,7 +760,6 @@ def test_cli_no_custom_args_yields_empty_dict(mock_config, tmp_path): assert {} == called_options -@mock.patch.dict(os.environ, clear=True) def test_cli_import_study(tmp_path): test_data = {"string": ["a", "b", None]} df = pandas.DataFrame(test_data) @@ -790,7 +784,6 @@ def test_cli_import_study(tmp_path): ) -@mock.patch.dict(os.environ, clear=True) def test_dedicated_schema(tmp_path): core_build_args = duckdb_args( [ @@ -827,7 +820,6 @@ def test_dedicated_schema(tmp_path): assert table in tables -@mock.patch.dict(os.environ, clear=True) @mock.patch("cumulus_library.databases.duckdb.DuckDatabaseBackend") def test_sql_error_handling(mock_backend, tmp_path): mock_backend.return_value.cursor.return_value.execute.side_effect = [ @@ -848,7 +840,6 @@ def test_sql_error_handling(mock_backend, tmp_path): cli.main(cli_args=build_args) -@mock.patch.dict(os.environ, clear=True) def test_version(capfd): out = None with pytest.raises(SystemExit): @@ -870,7 +861,10 @@ def test_version(capfd): assert "study_invalid_bad_query: no version defined" in out -@mock.patch.dict(os.environ, clear=True) +@mock.patch.dict( + os.environ, + clear=True, +) def test_study_dir(tmp_path): os.environ["CUMULUS_LIBRARY_STUDY_DIR"] = str( pathlib.Path(__file__).resolve().parent / "test_data/" @@ -885,3 +879,89 @@ def test_study_dir(tmp_path): ) with does_not_raise(): cli.main(cli_args=build_args) + + +@pytest.mark.parametrize( + "study,expected_queries,generated_query,toml_file, raises", + [ + ( + "study_valid", + [ + "0000.test.00.create_table_study_valid__table.sql", + "0001.test2.00.create_table_study_valid__table2.sql", + ], + "CREATE TABLE study_valid__table (test int)", + None, + does_not_raise(), + ), + ( + "study_python_counts_valid", + [ + "0000.module1.00.create_table_if_not_exists_study_python_counts_valid__table1.sql", + "0001.module1.00.create_table_if_not_exists_study_python_counts_valid__table1.sql", + "0002.module2.00.create_table_if_not_exists_study_python_counts_valid__table2.sql", + ], + "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);", + None, + does_not_raise(), + ), + ( + "psm_test", + ["0000.psm_cohort.00.create_table_psm_test__psm_cohort.sql"], + """CREATE TABLE psm_test__psm_cohort AS ( + SELECT * FROM core__condition --noqa: AM04 + ORDER BY id DESC LIMIT 100 +)""", + [ + "0001.psm_config.00.config.toml", + """config_type = "psm" +classification_json = "dsm5_classifications.json" +pos_source_table = "psm_test__psm_cohort" +neg_source_table = "core__condition" +target_table = "psm_test__psm_encounter_covariate" +primary_ref = 'encounter_ref' +count_ref = 'subject_ref' +count_table = 'core__condition' +dependent_variable = "example_diagnosis" +pos_sample_size = 20 +neg_sample_size = 100 +seed = 1234567890 +[join_cols_by_table.core__encounter] +join_id = "encounter_ref" +included_cols = [ + ["gender"], + ["race_display", "race"] +] +""", + ], + does_not_raise(), + ), + ("core", [], "", None, pytest.raises(SystemExit)), + ], +) +def test_prepare_study(tmp_path, study, expected_queries, generated_query, toml_file, raises): + with raises: + build_args = duckdb_args( + [ + "build", + "-t", + study, + "-s", + "tests/test_data/", + str(tmp_path), + "--prepare", + ], + tmp_path, + ) + with does_not_raise(): + cli.main(cli_args=build_args) + queries = list(pathlib.Path(tmp_path).glob("**/*.sql")) + for query in queries: + assert query.name in expected_queries + with open(queries[0]) as f: + assert f.read() == generated_query + if toml_file: + config = next(pathlib.Path(tmp_path).glob("**/*.toml")) + assert config.name == toml_file[0] + with open(config) as f: + assert f.read() == toml_file[1] diff --git a/tests/test_data/psm/psm_cohort.sql b/tests/test_data/psm/psm_cohort.sql index eb971142..13a19814 100644 --- a/tests/test_data/psm/psm_cohort.sql +++ b/tests/test_data/psm/psm_cohort.sql @@ -1,3 +1,4 @@ CREATE TABLE psm_test__psm_cohort AS ( - SELECT * FROM core__condition ORDER BY id DESC LIMIT 100 --noqa: AM04 + SELECT * FROM core__condition --noqa: AM04 + ORDER BY id DESC LIMIT 100 ); diff --git a/tests/test_data/study_python_counts_valid/module1.py b/tests/test_data/study_python_counts_valid/module1.py index ecbaf6e1..0871e908 100644 --- a/tests/test_data/study_python_counts_valid/module1.py +++ b/tests/test_data/study_python_counts_valid/module1.py @@ -4,8 +4,8 @@ class ModuleOneRunner(cumulus_library.CountsBuilder): display_text = "module1" - def __init__(self): - super().__init__() + def __init__(self, study_prefix): + super().__init__(study_prefix="study_python_counts_valid") def prepare_queries(self, *args, **kwargs): self.queries.append( diff --git a/tests/test_data/study_python_counts_valid/module2.py b/tests/test_data/study_python_counts_valid/module2.py index c6c7e80a..9d96f130 100644 --- a/tests/test_data/study_python_counts_valid/module2.py +++ b/tests/test_data/study_python_counts_valid/module2.py @@ -4,7 +4,7 @@ class ModuleTwoRunner(cumulus_library.CountsBuilder): display_text = "module2" - def __init__(self): + def __init__(self, study_prefix): super().__init__(study_prefix="study_python_counts_valid") def prepare_queries(self, *args, **kwargs): diff --git a/tests/test_databases.py b/tests/test_databases.py index bd5d8878..b726fe0e 100644 --- a/tests/test_databases.py +++ b/tests/test_databases.py @@ -28,10 +28,6 @@ } -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "db,data,expected,raises", [ @@ -77,10 +73,6 @@ def test_col_types_from_pandas(db, data, expected, raises): assert set(expected) == set(vals) -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "db,data,expected,raises", [ @@ -222,10 +214,6 @@ def test_pyarrow_types_from_sql(db, data, expected, raises): assert vals[index][-1] == expected[index][-1] -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "args,expected_type, raises", [ @@ -273,10 +261,6 @@ def test_create_db_backend(args, expected_type, raises): ### Database-specific edge case testing -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "args,sse,keycount,expected,raises", [ @@ -375,10 +359,6 @@ def test_upload_file_athena(mock_botocore, args, sse, keycount, expected, raises assert kwargs["Key"].endswith(args["file"].name) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("pyathena.connect") def test_athena_pandas_cursor(mock_pyathena): mock_as_pandas = mock.MagicMock() @@ -415,10 +395,6 @@ def test_athena_pandas_cursor(mock_pyathena): ) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("pyathena.connect") def test_athena_parser(mock_pyathena): db = databases.AthenaDatabaseBackend(**ATHENA_KWARGS) @@ -426,6 +402,8 @@ def test_athena_parser(mock_pyathena): assert isinstance(parser, databases.AthenaParser) +# Since this test creates an environment variable, create a dedicated +# mock for it @mock.patch.dict( os.environ, clear=True, diff --git a/tests/test_discovery.py b/tests/test_discovery.py index e85ad24f..48bb7502 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -10,10 +10,6 @@ from tests import conftest -@mock.patch.dict( - os.environ, - clear=True, -) def test_discovery(tmp_path): cli.main( cli_args=conftest.duckdb_args( diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py index 876c8581..1e8d4764 100644 --- a/tests/test_duckdb.py +++ b/tests/test_duckdb.py @@ -6,7 +6,6 @@ import tempfile from datetime import datetime from pathlib import Path -from unittest import mock import pytest @@ -14,10 +13,6 @@ from cumulus_library.template_sql import base_templates -@mock.patch.dict( - os.environ, - clear=True, -) def test_duckdb_core_build_and_export(tmp_path): data_dir = f"{Path(__file__).parent}/test_data/duckdb_data" cli.main( diff --git a/tests/test_dynamic_manifest.py b/tests/test_dynamic_manifest.py index e82251b7..6b76a512 100644 --- a/tests/test_dynamic_manifest.py +++ b/tests/test_dynamic_manifest.py @@ -20,10 +20,6 @@ ] -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "options,result", [ @@ -45,10 +41,6 @@ def test_manifest_with_dynamic_prefix(options, result): assert manifest.get_study_prefix() == result -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("sys.executable", new=None) def test_manifest_with_dynamic_prefix_and_no_executable(): """sys.executable must be valid for us to run a Python script""" @@ -56,10 +48,6 @@ def test_manifest_with_dynamic_prefix_and_no_executable(): study_manifest.StudyManifest(pathlib.Path("tests/test_data/study_dynamic_prefix")) -@mock.patch.dict( - os.environ, - clear=True, -) def test_cli_clean_with_dynamic_prefix(tmp_path): cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:dynamic2"], tmp_path)) cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS], tmp_path)) @@ -80,10 +68,6 @@ def test_cli_clean_with_dynamic_prefix(tmp_path): assert "dynamic2__meta_version" not in tables -@mock.patch.dict( - os.environ, - clear=True, -) def test_cli_export_with_dynamic_prefix(tmp_path): cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:abc"], tmp_path)) cli.main(cli_args=duckdb_args(["export", *STUDY_ARGS, "--option=prefix:abc"], tmp_path)) diff --git a/tests/test_static_file.py b/tests/test_static_file.py index bdea564a..ff89ec77 100644 --- a/tests/test_static_file.py +++ b/tests/test_static_file.py @@ -1,14 +1,7 @@ -import os -from unittest import mock - from cumulus_library import cli, databases from tests import conftest -@mock.patch.dict( - os.environ, - clear=True, -) def test_static_file(tmp_path): cli.main( cli_args=conftest.duckdb_args( diff --git a/tests/test_umls_api.py b/tests/test_umls_api.py index 99c8a30f..b2930b9d 100644 --- a/tests/test_umls_api.py +++ b/tests/test_umls_api.py @@ -1,7 +1,6 @@ import os import pathlib from contextlib import nullcontext as does_not_raise -from unittest import mock import pytest import responses @@ -18,10 +17,6 @@ EXPANSION_VALUESET_OID = "2.16.840.1.113762.1.4.1106.68" -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "api_key,validator_key,res_text,res_status,raises", [ @@ -45,10 +40,6 @@ def test_auth(api_key, validator_key, res_text, res_status, raises): assert api.validator_key == api_key -@mock.patch.dict( - os.environ, - clear=True, -) @responses.activate def test_auth_reads_env_var(): responses.add(responses.GET, AUTH_URL, body="true", status=200) @@ -63,10 +54,6 @@ def get_valueset_data(file_name): return f.read() -@mock.patch.dict( - os.environ, - clear=True, -) @pytest.mark.parametrize( "action,url,oid,expected_oids,raises", [ @@ -159,10 +146,6 @@ def test_get_valueset(action, url, oid, expected_oids, raises): assert data[i]["id"] == expected_oids[i] -@mock.patch.dict( - os.environ, - clear=True, -) @responses.activate def test_download_umls(tmp_path): # this zip file is just an archive made by targeting the other .json files diff --git a/tests/valueset/test_additional_rules_builder.py b/tests/valueset/test_additional_rules_builder.py index 755b9099..a4fb309f 100644 --- a/tests/valueset/test_additional_rules_builder.py +++ b/tests/valueset/test_additional_rules_builder.py @@ -1,5 +1,4 @@ import json -import os import pathlib import tomllib from unittest import mock @@ -16,10 +15,6 @@ @pytest.mark.parametrize("prefix", [(""), ("foo")]) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("cumulus_library.apis.umls.UmlsApi") def test_additional_rules(mock_api, mock_db_config_rxnorm, prefix): data_path = pathlib.Path(__file__).parent.parent / "test_data/valueset/" diff --git a/tests/valueset/test_rxnorm_valueset_builder.py b/tests/valueset/test_rxnorm_valueset_builder.py index 7494f377..1b2f7edc 100644 --- a/tests/valueset/test_rxnorm_valueset_builder.py +++ b/tests/valueset/test_rxnorm_valueset_builder.py @@ -1,5 +1,4 @@ import json -import os import pathlib import tomllib from unittest import mock @@ -15,10 +14,6 @@ @pytest.mark.parametrize("prefix", [(""), ("foo")]) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("cumulus_library.apis.umls.UmlsApi") def test_rxnorm_valueset_builder(mock_api, mock_db_config_rxnorm, prefix): data_path = pathlib.Path(__file__).parent.parent / "test_data/valueset/" diff --git a/tests/valueset/test_umls.py b/tests/valueset/test_umls.py index c4d1485c..ea7c21e2 100644 --- a/tests/valueset/test_umls.py +++ b/tests/valueset/test_umls.py @@ -1,6 +1,3 @@ -import os -from unittest import mock - import pytest from cumulus_library import StudyManifest @@ -8,10 +5,6 @@ @pytest.mark.parametrize("prefix", [(""), ("foo")]) -@mock.patch.dict( - os.environ, - clear=True, -) def test_umls_lookup(mock_db_config_rxnorm, prefix, tmp_path): with open(f"{tmp_path}/manifest.toml", "w", encoding="utf8") as f: f.write('study_prefix="test"') diff --git a/tests/valueset/test_valueset_builder.py b/tests/valueset/test_valueset_builder.py index b5a8e76c..41fd94ad 100644 --- a/tests/valueset/test_valueset_builder.py +++ b/tests/valueset/test_valueset_builder.py @@ -1,5 +1,4 @@ import json -import os import pathlib from contextlib import nullcontext as does_not_raise from unittest import mock @@ -19,10 +18,6 @@ (data_path / "invalid.toml", pytest.raises(SystemExit)), ], ) -@mock.patch.dict( - os.environ, - clear=True, -) @mock.patch("cumulus_library.apis.umls.UmlsApi") def test_valueset_builder(mock_api, mock_db_config_rxnorm, config_path, raises): with raises: