-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Packaging mode for studies #323
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,11 +7,13 @@ | |
import re | ||
import sys | ||
import tomllib | ||
import zipfile | ||
|
||
from rich.progress import Progress, TaskID | ||
|
||
from cumulus_library import ( | ||
BaseTableBuilder, | ||
CountsBuilder, | ||
base_utils, | ||
databases, | ||
enums, | ||
|
@@ -40,7 +42,10 @@ def _load_and_execute_builder( | |
db_parser: databases.DatabaseParser = None, | ||
write_reference_sql: bool = False, | ||
doc_str: str | None = None, | ||
) -> None: | ||
prepare: bool = False, | ||
data_path: pathlib.Path | None = None, | ||
query_count: int | None = None, | ||
) -> int: | ||
"""Loads a table builder from a file. | ||
|
||
:param config: a StudyConfig object | ||
|
@@ -49,6 +54,10 @@ def _load_and_execute_builder( | |
:keyword db_parser: an object implementing DatabaseParser for the target database | ||
:keyword write_reference_sql: if true, writes sql to disk inside a study's directory | ||
:keyword doc_str: A string to insert between queries written to disk | ||
:keyword prepare: If true, will render query instead of executing | ||
:keyword data_path: If prepare is true, the path to write rendered data to | ||
:keyword query_count: if prepare is true, the number of queries already rendered | ||
:returns: number of processed queries | ||
""" | ||
|
||
# Since we have to support arbitrary user-defined python files here, we | ||
|
@@ -79,8 +88,7 @@ def _load_and_execute_builder( | |
f"Error loading {manifest._study_path}{filename}\n" | ||
"Custom builders must extend the BaseTableBuilder class." | ||
) | ||
|
||
# Remove instances of intermediate classes, if present | ||
# Remove instances of intermediate classes, if present (usually not) | ||
table_builder_subclasses = list( | ||
filter(lambda x: x.__name__ != "CountsBuilder", table_builder_subclasses) | ||
) | ||
|
@@ -89,7 +97,10 @@ def _load_and_execute_builder( | |
# remove it so it doesn't interfere with the next python module to | ||
# execute, since the subclass would otherwise hang around. | ||
table_builder_class = table_builder_subclasses[0] | ||
table_builder = table_builder_class() | ||
if issubclass(table_builder_class, CountsBuilder): | ||
table_builder = table_builder_class(study_prefix=manifest.get_study_prefix()) | ||
else: | ||
table_builder = table_builder_class() | ||
if write_reference_sql: | ||
prefix = manifest.get_study_prefix() | ||
table_builder.prepare_queries(config=config, manifest=manifest, parser=db_parser) | ||
|
@@ -105,19 +116,30 @@ def _load_and_execute_builder( | |
table_builder.write_queries( | ||
path=pathlib.Path(f"{manifest._study_path}/reference_sql/" + new_filename) | ||
) | ||
elif prepare: | ||
table_builder.prepare_queries( | ||
config=config, | ||
manifest=manifest, | ||
parser=db_parser, | ||
) | ||
_render_output( | ||
manifest.get_study_prefix(), table_builder.queries, data_path, filename, query_count | ||
) | ||
else: | ||
table_builder.execute_queries( | ||
config=config, | ||
manifest=manifest, | ||
parser=db_parser, | ||
) | ||
|
||
num_queries = len(table_builder.queries) | ||
# After running the executor code, we'll remove | ||
# it so it doesn't interfere with the next python module to | ||
# execute, since the subclass would otherwise hang around. | ||
del sys.modules[table_builder_module.__name__] | ||
del table_builder_module | ||
|
||
return num_queries | ||
|
||
|
||
def run_protected_table_builder( | ||
config: base_utils.StudyConfig, | ||
|
@@ -136,13 +158,31 @@ def run_protected_table_builder( | |
|
||
|
||
def _run_workflow( | ||
config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str | ||
) -> None: | ||
config: base_utils.StudyConfig, | ||
manifest: study_manifest.StudyManifest, | ||
filename: str, | ||
prepare: str, | ||
data_path: pathlib.Path, | ||
query_count: int, | ||
) -> int: | ||
"""Loads workflow config from toml definitions and executes workflow | ||
|
||
:param config: a StudyConfig object | ||
:param manifest: a StudyManifest object | ||
:param filename: Filename of the workflow config | ||
:param prepare: If true, will render query instead of executing | ||
:param data_path: If prepare is true, the path to write rendered data to | ||
:param query_count: if prepare is true, the number of queries already rendered | ||
:returns: a count of processed queries | ||
""" | ||
toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") | ||
if prepare: | ||
with open(toml_path) as file: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interesting... I am use to the pylint check to always pass an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah ruff didn't flag that, i'll see if i can find the right rule for it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://docs.astral.sh/ruff/rules/unspecified-encoding/ - since its marked unstable, i'm inclined to not do it for now. |
||
workflow_config = file.read() | ||
_render_output( | ||
manifest.get_study_prefix(), [workflow_config], data_path, filename, query_count, True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I hate to see a bare True/False - can you make that parameter kwarg only? (using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i skipped it since it was in module and internal, but for you? sure. |
||
) | ||
return 1 | ||
existing_stats = [] | ||
if not config.stats_build: | ||
existing_stats = ( | ||
|
@@ -157,14 +197,13 @@ def _run_workflow( | |
# but we're letting it slide so that builders function similarly | ||
# across the board | ||
safe_timestamp = base_utils.get_tablename_safe_iso_timestamp() | ||
toml_path = pathlib.Path(f"{manifest._study_path}/{filename}") | ||
with open(toml_path, "rb") as file: | ||
workflow_config = tomllib.load(file) | ||
config_type = workflow_config["config_type"] | ||
target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", "")) | ||
|
||
if (target_table,) in existing_stats and not config.stats_build: | ||
return | ||
return 0 | ||
match config_type: | ||
case "psm": | ||
builder = psm_builder.PsmBuilder( | ||
|
@@ -195,6 +234,7 @@ def _run_workflow( | |
table_name=f"{target_table}_{safe_timestamp}", | ||
view_name=target_table, | ||
) | ||
return len(builder.queries) | ||
|
||
|
||
def build_matching_files( | ||
|
@@ -203,13 +243,23 @@ def build_matching_files( | |
*, | ||
builder: str | None, | ||
db_parser: databases.DatabaseParser = None, | ||
prepare: bool = False, | ||
data_path: pathlib.Path, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment here about default args - This feature feels like a thing that a future dev (including us) might totally forget exists when adding code that calls these methods. If everyone calling this is passing |
||
): | ||
"""targets all table builders matching a target string for running | ||
|
||
:param config: a StudyConfig object | ||
:param manifest: a StudyManifest object | ||
:keyword builder: filename of a module implementing a TableBuilder | ||
:keyword db_parser: an object implementing DatabaseParser for the target database""" | ||
:keyword db_parser: an object implementing DatabaseParser for the target database | ||
:keyword prepare: If true, will render query instead of executing | ||
:keyword data_path: If prepare is true, the path to write rendered data to | ||
""" | ||
if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]: | ||
sys.exit( | ||
f"{manifest.get_study_prefix()} does not support prepare mode. It must be run " | ||
"directly against a target database." | ||
) | ||
all_generators = manifest.get_all_generators() | ||
matches = [] | ||
if not builder: # pragma: no cover | ||
|
@@ -218,7 +268,14 @@ def build_matching_files( | |
for file in all_generators: | ||
if file.find(builder) != -1: | ||
matches.append(file) | ||
build_study(config, manifest, db_parser=db_parser, file_list=matches) | ||
build_study( | ||
config, | ||
manifest, | ||
db_parser=db_parser, | ||
file_list=matches, | ||
prepare=prepare, | ||
data_path=data_path, | ||
) | ||
|
||
|
||
def build_study( | ||
|
@@ -228,35 +285,87 @@ def build_study( | |
db_parser: databases.DatabaseParser = None, | ||
continue_from: str | None = None, | ||
file_list: list | None = None, | ||
) -> list: | ||
prepare: bool = False, | ||
data_path: pathlib.Path | None = None, | ||
) -> None: | ||
"""Creates tables in the schema by iterating through the sql_config.file_names | ||
|
||
:param config: a StudyConfig object | ||
:param manifest: a StudyManifest object | ||
:keyword db_parser: a parser for the target database | ||
:keyword continue_from: Name of a file to resume table creation from | ||
:returns: loaded queries (for unit testing only) | ||
:keyword prepare: If true, will render query instead of executing | ||
:keyword data_path: If prepare is true, the path to write rendered data to | ||
""" | ||
if prepare and manifest.get_study_prefix() in ["core", "discovery", "data-metrics"]: | ||
sys.exit( | ||
f"{manifest.get_study_prefix()} does not support prepare mode. It must be run " | ||
"directly against a target database." | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I saw a user string repeated, so my "move this to a utility method" alarm went off. |
||
if file_list is None: | ||
file_list = manifest.get_file_list(continue_from) | ||
if prepare: | ||
data_dir = data_path / manifest.get_study_prefix() | ||
files = data_dir.glob("*") | ||
[file.unlink() for file in files if file.is_file()] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: a "logic-only" list comprehension like that feels like an odd construction. You could still do this in just one more line with:
|
||
query_count = 0 | ||
for file in file_list: | ||
if file.endswith(".py"): | ||
_load_and_execute_builder( | ||
query_count += _load_and_execute_builder( | ||
config=config, | ||
manifest=manifest, | ||
filename=file, | ||
db_parser=db_parser, | ||
prepare=prepare, | ||
data_path=data_path, | ||
query_count=query_count, | ||
) | ||
elif file.endswith(".toml"): | ||
_run_workflow(config=config, manifest=manifest, filename=file) | ||
query_count += _run_workflow( | ||
config=config, | ||
manifest=manifest, | ||
filename=file, | ||
prepare=prepare, | ||
data_path=data_path, | ||
query_count=query_count, | ||
) | ||
elif file.endswith(".sql"): | ||
_run_raw_queries(config=config, manifest=manifest, filename=file) | ||
query_count += _run_raw_queries( | ||
config=config, | ||
manifest=manifest, | ||
filename=file, | ||
prepare=prepare, | ||
data_path=data_path, | ||
query_count=query_count, | ||
) | ||
else: | ||
raise errors.StudyManifestParsingError | ||
if prepare: | ||
with zipfile.ZipFile( | ||
f"{data_path}/{manifest.get_study_prefix()}.zip", "w", zipfile.ZIP_DEFLATED | ||
) as z: | ||
for file in data_dir.glob("*"): | ||
dogversioning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
z.write(file, file.relative_to(data_dir)) | ||
|
||
|
||
def _run_raw_queries( | ||
config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str | ||
): | ||
config: base_utils.StudyConfig, | ||
manifest: study_manifest.StudyManifest, | ||
filename: str, | ||
prepare: bool, | ||
data_path: pathlib.Path, | ||
query_count: int, | ||
dogversioning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) -> int: | ||
"""Creates tables in the schema by iterating through the sql_config.file_names | ||
|
||
:param config: a StudyConfig object | ||
:param manifest: a StudyManifest object | ||
:param filename: the name of the sql file to read | ||
:param prepare: If true, will render query instead of executing | ||
:param data_path: If prepare is true, the path to write rendered data to | ||
:param query_count: the number of queries currently processed | ||
:returns number of queries processed: | ||
""" | ||
queries = [] | ||
for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")): | ||
queries.append([query, filename]) | ||
|
@@ -266,25 +375,49 @@ def _run_raw_queries( | |
f"`{manifest.get_study_prefix()}__", | ||
"`", | ||
) | ||
# We'll explicitly create a cursor since recreating cursors for each | ||
# table in a study is slightly slower for some databases | ||
cursor = config.db.cursor() | ||
# We want to only show a progress bar if we are :not: printing SQL lines | ||
with base_utils.get_progress_bar(disable=config.verbose) as progress: | ||
task = progress.add_task( | ||
f"Building tables from {filename}...", | ||
total=len(queries), | ||
visible=not config.verbose, | ||
if prepare: | ||
_render_output( | ||
manifest.get_study_prefix(), [q[0] for q in queries], data_path, filename, query_count | ||
) | ||
_execute_build_queries( | ||
config=config, | ||
manifest=manifest, | ||
cursor=cursor, | ||
queries=queries, | ||
progress=progress, | ||
task=task, | ||
else: | ||
# We'll explicitly create a cursor since recreating cursors for each | ||
# table in a study is slightly slower for some databases | ||
cursor = config.db.cursor() | ||
# We want to only show a progress bar if we are :not: printing SQL lines | ||
with base_utils.get_progress_bar(disable=config.verbose) as progress: | ||
task = progress.add_task( | ||
f"Building tables from {filename}...", | ||
total=len(queries), | ||
visible=not config.verbose, | ||
) | ||
_execute_build_queries( | ||
config=config, | ||
manifest=manifest, | ||
cursor=cursor, | ||
queries=queries, | ||
progress=progress, | ||
task=task, | ||
) | ||
return len(queries) | ||
|
||
|
||
def _render_output(study_name, outputs, data_path, filename, count, is_toml=False): | ||
if is_toml: | ||
suffix = "toml" | ||
else: | ||
suffix = "sql" | ||
for output in enumerate(outputs): | ||
dogversioning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if is_toml: | ||
name = "config" | ||
else: | ||
name = re.search(r"(.*)__\w*", output[1])[0].lower().replace(" ", "_") | ||
file_path = data_path / ( | ||
f"{study_name}/{count+output[0]:04d}.{filename.rsplit('.', 1)[0]}" | ||
f".{output[0]:02d}.{name}.{suffix}" | ||
) | ||
dogversioning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return queries | ||
file_path.parent.mkdir(exist_ok=True, parents=True) | ||
with open(file_path, "w") as f: | ||
dogversioning marked this conversation as resolved.
Show resolved
Hide resolved
|
||
f.write(output[1]) | ||
|
||
|
||
def _query_error( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: should we just eventually retool builders to just accept a manifest object in the init method?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh gosh, probably?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#324