From cd3b776c2db2a3683d8add764dfb7a7c0d247e00 Mon Sep 17 00:00:00 2001 From: matt garber Date: Fri, 20 Oct 2023 09:09:02 -0400 Subject: [PATCH] Discovery study (#134) * Discovery study * Schema validation, code def cleanup --- .sqlfluffignore | 3 + README.md | 4 +- cumulus_library/.sqlfluff | 2 + .../studies/discovery/code_definitions.py | 57 ++++++++ .../studies/discovery/code_detection.py | 82 +++++++++++ .../studies/discovery/code_detection.sql | 127 ++++++++++++++++++ .../studies/discovery/manifest.toml | 12 ++ .../template_sql/code_system_pairs.sql.jinja | 44 ++++++ cumulus_library/template_sql/templates.py | 9 ++ cumulus_library/template_sql/utils.py | 99 ++++++++++---- tests/test_templates.py | 77 ++++++++++- 11 files changed, 483 insertions(+), 33 deletions(-) create mode 100644 cumulus_library/studies/discovery/code_definitions.py create mode 100644 cumulus_library/studies/discovery/code_detection.py create mode 100644 cumulus_library/studies/discovery/code_detection.sql create mode 100644 cumulus_library/studies/discovery/manifest.toml create mode 100644 cumulus_library/template_sql/code_system_pairs.sql.jinja diff --git a/.sqlfluffignore b/.sqlfluffignore index 5a506b2c..f2dbae5e 100644 --- a/.sqlfluffignore +++ b/.sqlfluffignore @@ -8,5 +8,8 @@ show_views.sql.jinja # but the table in question builds codeable_concept_denormalize.sql.jinja +# This template causes sqlfluff to hang - need to try a version uprev at some point +code_system_pairs.sql.jinja + # This is a common destination for debugging sql generation output.sql diff --git a/README.md b/README.md index 1cc42d50..3b17d3e1 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,6 @@ A framework for designing, executing, and distributing SQL queries packaged as " ## Installing For end users, just run `pip install cumulus-library`. -For running from source, checkout the repo, and at the project root run `pip install -e .`. 
- - +For running from source, check out the repo, and at the project root run `pip install -e .`. If you're not working on new features, you should check out the tag associated with the latest release. For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library). diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff index 4e2c617f..1bcd5df9 100644 --- a/cumulus_library/.sqlfluff +++ b/cumulus_library/.sqlfluff @@ -20,6 +20,7 @@ code_systems = ["http://snomed.info/sct", "http://hl7.org/fhir/sid/icd-10-cm"] col_type_list = ["a string","b string"] cc_columns = [{"name": "baz", "is_array": True}, {"name": "foobar", "is_array": False}] cc_column = 'code' +code_system_tables = [{"table_name":"hasarray","column_name":"acol","is_bare_coding":False,"is_array":True, "has_data": True},{"table_name":"noarray","column_name":"col","is_bare_coding":False,"is_array":False, "has_data": True},{"table_name":"bare","column_name":"bcol","is_bare_coding":True,"is_array":False, "has_data": True},{"table_name":"empty","column_name":"empty","is_bare_coding":False,"is_array":False, "has_data": False}] column_name = 'bar' conditions = ["1 > 0", "1 < 2"] dataset = [["foo","foo"],["bar","bar"]] @@ -29,6 +30,7 @@ fhir_extension = fhir_extension fhir_resource = patient id = 'id' medication_datasources = {"by_contained_ref" : True, "by_external_ref" : True} +output_table_name = 'created_table' prefix = Test schema_name = test_schema source_table = source_table diff --git a/cumulus_library/studies/discovery/code_definitions.py b/cumulus_library/studies/discovery/code_definitions.py new file mode 100644 index 00000000..daf2f276 --- /dev/null +++ b/cumulus_library/studies/discovery/code_definitions.py @@ -0,0 +1,57 @@ +# A collection of codes & codeableConcepts to extract available codes from. 
+# Two optional booleans are available for use: +# - is_array: the field in question is an array of CodeableConcepts +# - is_bare_coding: the field in question is a Coding not wrapped in concepts +# - otherwise, it is assumed to be a 0..1 or 1..1 CodeableConcept +# TODO: if another state is needed, move to an Enum + +code_list = [ + # Condition + {"table_name": "condition", "column_name": "category", "is_array": True}, + { + "table_name": "condition", + "column_name": "code", + }, + # DocumentReference + { + "table_name": "documentreference", + "column_name": "type", + }, + {"table_name": "documentreference", "column_name": "category", "is_array": True}, + # Encounter + { + "table_name": "encounter", + "column_name": "class", + "is_bare_coding": True, + }, + { + "table_name": "encounter", + "column_name": "type", + "is_array": True, + }, + { + "table_name": "encounter", + "column_name": "servicetype", + }, + { + "table_name": "encounter", + "column_name": "priority", + }, + {"table_name": "encounter", "column_name": "reasoncode", "is_array": True}, + # Medication + { + "table_name": "medication", + "column_name": "code", + }, + # Observation + {"table_name": "observation", "column_name": "category", "is_array": True}, + { + "table_name": "observation", + "column_name": "code", + }, + # Patient + { + "table_name": "patient", + "column_name": "maritalstatus", + }, +] diff --git a/cumulus_library/studies/discovery/code_detection.py b/cumulus_library/studies/discovery/code_detection.py new file mode 100644 index 00000000..1d3add09 --- /dev/null +++ b/cumulus_library/studies/discovery/code_detection.py @@ -0,0 +1,82 @@ +""" Module for generating encounter codeableConcept table""" + +from cumulus_library.base_table_builder import BaseTableBuilder +from cumulus_library.helper import get_progress_bar, query_console_output +from cumulus_library.template_sql.templates import get_code_system_pairs +from cumulus_library.template_sql.utils import ( + 
is_codeable_concept_array_populated, + is_codeable_concept_populated, + is_code_populated, +) + +from cumulus_library.studies.discovery.code_definitions import code_list + + +class CodeDetectionBuilder(BaseTableBuilder): + display_text = "Selecting unique code systems..." + + def _check_codes_in_fields(self, code_sources: list[dict], schema, cursor) -> list[dict]: + """Sets has_data on each code source if its Coding/CodeableConcept field is present and populated""" + + with get_progress_bar() as progress: + task = progress.add_task( + "Discovering available coding systems...", + total=len(code_sources), + ) + for code_source in code_sources: + if code_source["is_array"]: + code_source["has_data"] = is_codeable_concept_array_populated( + schema, + code_source["table_name"], + code_source["column_name"], + cursor, + allow_partial=False, + ) + elif code_source["is_bare_coding"]: + code_source["has_data"] = is_code_populated( + schema, + code_source["table_name"], + code_source["column_name"], + cursor, + allow_partial=False, + ) + else: + code_source["has_data"] = is_codeable_concept_populated( + schema, + code_source["table_name"], + code_source["column_name"], + cursor, + allow_partial=False, + ) + progress.advance(task) + return code_sources + + def prepare_queries(self, cursor: object, schema: str): + """Constructs the query listing the code systems found in the fields named in code_list + + :param cursor: A database cursor object + :param schema: the schema/db name, matching the cursor + + """ + + code_sources = [] + for code_definition in code_list: + if any( + x not in code_definition.keys() for x in ["table_name", "column_name"] + ): + raise KeyError( + "Expected table_name and column_name keys in " + f"{str(code_definition)}" + ) + code_source = { + "is_bare_coding": False, + "is_array": False, + "has_data": False, + } + for key in code_definition.keys(): + code_source[key] = code_definition[key] + code_sources.append(code_source) + + code_sources = self._check_codes_in_fields(code_sources, schema, cursor) + query = 
get_code_system_pairs("discovery__code_sources", code_sources) + self.queries.append(query) diff --git a/cumulus_library/studies/discovery/code_detection.sql b/cumulus_library/studies/discovery/code_detection.sql new file mode 100644 index 00000000..1d993e5d --- /dev/null +++ b/cumulus_library/studies/discovery/code_detection.sql @@ -0,0 +1,127 @@ +-- noqa: disable=all +/* +This is a reference output of the SQL generated by code_detection.py +to build the discovery__code_sources table, +as run against the synthea dataset. +It is provided as a form of documentation only and will not be invoked directly. +*/ + +CREATE TABLE discovery__code_sources AS +SELECT DISTINCT + 'condition' AS table_name, + 'category' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM condition, +UNNEST(category) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +UNION +SELECT DISTINCT + 'condition' AS table_name, + 'code' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM condition, +UNNEST(code.coding) AS t (row) +UNION +SELECT DISTINCT + 'documentreference' AS table_name, + 'type' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM documentreference, +UNNEST(type.coding) AS t (row) +UNION +SELECT DISTINCT + 'documentreference' AS table_name, + 'category' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM documentreference, +UNNEST(category) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +UNION +SELECT * + FROM ( + VALUES ( + ('encounter','class', '', '', '') + ) + ) +AS t ( table_name, column_name, code, display, system ) -- noqa: L025 +UNION +SELECT DISTINCT + 'encounter' AS table_name, + 'type' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM encounter, +UNNEST(type) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +UNION +SELECT * + FROM ( + VALUES ( + ('encounter','servicetype', '', '', '') + ) + ) +AS t ( table_name, column_name, code, display, system ) -- 
noqa: L025 +UNION +SELECT * + FROM ( + VALUES ( + ('encounter','priority', '', '', '') + ) + ) +AS t ( table_name, column_name, code, display, system ) -- noqa: L025 +UNION +SELECT DISTINCT + 'encounter' AS table_name, + 'reasoncode' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM encounter, +UNNEST(reasoncode) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +UNION +SELECT DISTINCT + 'medication' AS table_name, + 'code' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM medication, +UNNEST(code.coding) AS t (row) +UNION +SELECT DISTINCT + 'observation' AS table_name, + 'category' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM observation, +UNNEST(category) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +UNION +SELECT DISTINCT + 'observation' AS table_name, + 'code' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM observation, +UNNEST(code.coding) AS t (row) +UNION +SELECT DISTINCT + 'patient' AS table_name, + 'maritalstatus' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM patient, +UNNEST(maritalstatus.coding) AS t (row) diff --git a/cumulus_library/studies/discovery/manifest.toml b/cumulus_library/studies/discovery/manifest.toml new file mode 100644 index 00000000..45a37798 --- /dev/null +++ b/cumulus_library/studies/discovery/manifest.toml @@ -0,0 +1,12 @@ +study_prefix = "discovery" + +[table_builder_config] +file_names = [ + "code_detection.py", +] + + +[export_config] +export_list = [ + "discovery__code_sources", +] diff --git a/cumulus_library/template_sql/code_system_pairs.sql.jinja b/cumulus_library/template_sql/code_system_pairs.sql.jinja new file mode 100644 index 00000000..7195ad30 --- /dev/null +++ b/cumulus_library/template_sql/code_system_pairs.sql.jinja @@ -0,0 +1,44 @@ +CREATE TABLE {{ output_table_name }} AS +{%- for source in code_system_tables %} +{%- if source.has_data %} +{%- if source.is_bare_coding %} +SELECT DISTINCT + '{{ 
source.table_name }}' AS table_name, + '{{ source.column_name }}' AS column_name, + {{ source.column_name }}.code, + {{ source.column_name }}.display, + {{ source.column_name }}.system +FROM {{ source.table_name }} +{%- elif source.is_array %} +SELECT DISTINCT + '{{ source.table_name }}' AS table_name, + '{{ source.column_name }}' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM {{ source.table_name }}, +UNNEST({{ source.column_name }}) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +{%- else %} +SELECT DISTINCT + '{{ source.table_name }}' AS table_name, + '{{ source.column_name }}' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM {{ source.table_name }}, +UNNEST({{source.column_name}}.coding) AS t (row) +{%- endif %} +{%- else %} +SELECT * + FROM ( + VALUES ( + ('{{ source.table_name }}','{{ source.column_name }}', '', '', '') + ) + ) +AS t ( table_name, column_name, code, display, system ) -- noqa: L025 +{%- endif -%} +{%- if not loop.last %} +UNION +{%- endif -%} +{% endfor %} diff --git a/cumulus_library/template_sql/templates.py b/cumulus_library/template_sql/templates.py index 010dc159..a407667b 100644 --- a/cumulus_library/template_sql/templates.py +++ b/cumulus_library/template_sql/templates.py @@ -95,6 +95,15 @@ def __init__( self.is_array = is_array +def get_code_system_pairs(output_table_name: str, code_system_tables: list) -> str: + """Extracts code system details as a standalone table""" + path = Path(__file__).parent + with open(f"{path}/code_system_pairs.sql.jinja") as code_system_pairs: + return Template(code_system_pairs.read()).render( + output_table_name=output_table_name, code_system_tables=code_system_tables + ) + + def get_codeable_concept_denormalize_query(config: CodeableConceptConfig) -> str: """extracts codeable concepts from a specified table. 
diff --git a/cumulus_library/template_sql/utils.py b/cumulus_library/template_sql/utils.py index 2728f7c2..d206ed27 100644 --- a/cumulus_library/template_sql/utils.py +++ b/cumulus_library/template_sql/utils.py @@ -22,6 +22,7 @@ def is_codeable_concept_populated( base_col: str, cursor, coding_element="coding", + allow_partial: bool = True, ) -> bool: """Check db to see if codeableconcept data exists. @@ -35,24 +36,14 @@ def is_codeable_concept_populated( :param cursor: a PEP-249 compliant database cursor :param coding_element: the place inside the code element to look for coding info. default: 'coding' (and :hopefully: this is always right) + :allow_partial: If true, codings which do not have fields expected by the library + will still be included, and will need to be manually coerced. :returns: a boolean indicating if valid data is present. """ - # if the source column is missing for some reason (i.e. we're dealing with - # conversion to FHIR rather than a true FHIR source and it's incomplete), - # we'll return false - try: - query = get_is_table_not_empty_query(table, base_col) - cursor.execute(query) - if cursor.fetchone() is None: - return False - except: - return False - - query = get_column_datatype_query(schema, table, base_col) - cursor.execute(query) - - if coding_element not in str(cursor.fetchone()[0]): + if not _check_schema_if_exists( + schema, table, base_col, cursor, coding_element, allow_partial + ): return False query = get_is_table_not_empty_query( @@ -78,6 +69,7 @@ def is_codeable_concept_array_populated( base_col: str, cursor, coding_element="coding", + allow_partial: bool = True, ) -> bool: """Check db to see if an array of codeableconcept data exists. @@ -91,21 +83,15 @@ def is_codeable_concept_array_populated( :param cursor: a PEP-249 compliant database cursor :param coding_element: the place inside the code element to look for coding info. 
default: 'coding' (and :hopefully: this is always right) + :allow_partial: If true, codings which do not have fields expected by the library + will still be included, and will need to be manually coerced. :returns: a boolean indicating if valid data is present. """ - try: - query = get_is_table_not_empty_query(table, base_col) - cursor.execute(query) - if cursor.fetchone() is None: - return False - except: - return False - query = get_column_datatype_query(schema, table, base_col) - cursor.execute(query) - if coding_element not in str(cursor.fetchone()[0]): + if not _check_schema_if_exists( + schema, table, base_col, cursor, coding_element, allow_partial + ): return False - query = get_is_table_not_empty_query( table, "t2.row2", @@ -126,3 +112,64 @@ def is_codeable_concept_array_populated( if cursor.fetchone() is None: return False return True + + +def is_code_populated( + schema: str, + table: str, + base_col: str, + cursor, + allow_partial: bool = True, +) -> bool: + """Check db to see if a bare code exists and is populated. + + Will execute several exploratory queries to see if the column in question + can be queried naively. + + :param schema: The schema/database name + :param table: The table to query against + :param base_col: the place to start validation from. + This can be a nested element, like column.object.code + :param cursor: a PEP-249 compliant database cursor + :allow_partial: If true, codings which do not have fields expected by the library + will still be included, and will need to be manually coerced. + :returns: a boolean indicating if valid data is present. 
+ """ + + if not _check_schema_if_exists( + schema, table, base_col, cursor, False, allow_partial + ): + return False + query = get_is_table_not_empty_query( + table, + base_col, + ) + cursor.execute(query) + if cursor.fetchone() is None: + return False + return True + + +def _check_schema_if_exists( + schema: str, table: str, base_col: str, cursor, coding_element, allow_partial: bool +) -> bool: + """Validation check for a column existing, and having the expected schema""" + try: + query = get_is_table_not_empty_query(table, base_col) + cursor.execute(query) + if cursor.fetchone() is None: + return False + + query = get_column_datatype_query(schema, table, base_col) + cursor.execute(query) + schema_str = str(cursor.fetchone()[0]) + required_fields = [coding_element] if coding_element else [] # bare codings pass a falsy coding_element + if not allow_partial: + required_fields += ["code", "system", "display"] + if any(x not in schema_str for x in required_fields): + return False + + return True + + except Exception: # best effort: a missing column or failed query means no usable data + return False diff --git a/tests/test_templates.py b/tests/test_templates.py index e7dac370..02b80bcb 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -3,6 +3,7 @@ from cumulus_library.template_sql.templates import ( CodeableConceptConfig, ExtensionConfig, + get_code_system_pairs, get_codeable_concept_denormalize_query, get_column_datatype_query, get_core_medication_query, @@ -338,8 +339,6 @@ def test_count_query( if eval(kwarg) is not None: kwargs[kwarg] = eval(kwarg) query = get_count_query("test_table", "test_source", ["age", "sex"], **kwargs) - with open("output.sql", "w") as f: - f.write(query) assert query == expected @@ -479,8 +478,6 @@ def test_extension_denormalize_creation(): is_array=True, ) query = get_extension_denormalize_query(config) - with open("output.sql", "w") as f: - f.write(query) array_sql = """LOWER( ARRAY_JOIN( ARRAY_SORT( @@ -563,3 +560,75 @@ def test_is_table_not_empty(): conditions=["field_name LIKE 's%'", "field_name IS NOT NULL"], ) assert query == expected + + +def 
test_get_code_system_pairs(): + expected = """CREATE TABLE output_table AS +SELECT DISTINCT + 'hasarray' AS table_name, + 'acol' AS column_name, + t2.row2.code, + t2.row2.display, + t2.row2.system +FROM hasarray, +UNNEST(acol) AS t1 (row1), +UNNEST(t1.row1.coding) AS t2 (row2) +UNION +SELECT DISTINCT + 'noarray' AS table_name, + 'col' AS column_name, + t.row.code, + t.row.display, + t.row.system +FROM noarray, +UNNEST(col.coding) AS t (row) +UNION +SELECT DISTINCT + 'bare' AS table_name, + 'bcol' AS column_name, + bcol.code, + bcol.display, + bcol.system +FROM bare +UNION +SELECT * + FROM ( + VALUES ( + ('empty','empty', '', '', '') + ) + ) +AS t ( table_name, column_name, code, display, system ) -- noqa: L025""" + query = get_code_system_pairs( + "output_table", + [ + { + "table_name": "hasarray", + "column_name": "acol", + "is_bare_coding": False, + "is_array": True, + "has_data": True, + }, + { + "table_name": "noarray", + "column_name": "col", + "is_bare_coding": False, + "is_array": False, + "has_data": True, + }, + { + "table_name": "bare", + "column_name": "bcol", + "is_bare_coding": True, + "is_array": False, + "has_data": True, + }, + { + "table_name": "empty", + "column_name": "empty", + "is_bare_coding": False, + "is_array": False, + "has_data": False, + }, + ], + ) + assert query == expected