Discovery study (#134)

* Discovery study * Schema validation, code def cleanup
smart-on-fhir · Oct 20, 2023 · cd3b776 · cd3b776
1 parent 73f3b1d
commit cd3b776
Show file tree

Hide file tree

Showing 11 changed files with 483 additions and 33 deletions.
diff --git a/.sqlfluffignore b/.sqlfluffignore
@@ -8,5 +8,8 @@ show_views.sql.jinja
 # but the table in question builds
 codeable_concept_denormalize.sql.jinja
 
+# This template causes sqlfluff to hang - need to try a version uprev at some point
+code_system_pairs.sql.jinja
+
 # This is a common destination for debugging sql generation
 output.sql
diff --git a/README.md b/README.md
@@ -5,8 +5,6 @@ A framework for designing, executing, and distributing SQL queries packaged as "
 ## Installing
 
 For end users, just run `pip install cumulus-library`. 
-For running from source, checkout the repo, and at the project root run `pip install -e .`.
-
-
+For running from source, checkout the repo, and at the project root run `pip install -e .`. If you're not working on new features, you should check out the tag associated with the latest release.
 
 For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff
@@ -20,6 +20,7 @@ code_systems = ["http://snomed.info/sct", "http://hl7.org/fhir/sid/icd-10-cm"]
 col_type_list = ["a string","b string"]
 cc_columns = [{"name": "baz", "is_array": True}, {"name": "foobar", "is_array": False}]
 cc_column = 'code'
+code_system_tables = [{table_name":"hasarray","column_name":"acol","is_bare_coding":False,"is_array":True, "has_data": True},{"table_name":"noarray","column_name":"col","is_bare_coding":False,"is_array":False, "has_data": True}{"table_name":"bare","column_name":"bcol","is_bare_coding":True,"is_array":False, "has_data": True},{"table_name":"empty","column_name":"empty","is_bare_coding":False,"is_array":False, "has_data": False}]
 column_name = 'bar'
 conditions = ["1 > 0", "1 < 2"]
 dataset = [["foo","foo"],["bar","bar"]]
@@ -29,6 +30,7 @@ fhir_extension = fhir_extension
 fhir_resource = patient
 id = 'id'
 medication_datasources = {"by_contained_ref" : True, "by_external_ref" : True}
+output_table_name = 'created_table'
 prefix = Test
 schema_name = test_schema
 source_table = source_table

diff --git a/cumulus_library/studies/discovery/code_definitions.py b/cumulus_library/studies/discovery/code_definitions.py
@@ -0,0 +1,57 @@
+# A collection of codes & codeableConcepts to extract available codes from.
+# Two optional booleans are available for use:
+#   - is_array: the field in question is an array of CodeableConcepts
+#   - is_bare_coding: the field in question is a Coding not wrapped in concepts
+#   - otherwise, it is assumed to be a 0..1 or 1..1 CodeableConcept
+# TODO: if another state is needed, move to an Enum
+
+code_list = [
+    # Condition
+    {"table_name": "condition", "column_name": "category", "is_array": True},
+    {
+        "table_name": "condition",
+        "column_name": "code",
+    },
+    # DocumentReference
+    {
+        "table_name": "documentreference",
+        "column_name": "type",
+    },
+    {"table_name": "documentreference", "column_name": "category", "is_array": True},
+    # Encounter
+    {
+        "table_name": "encounter",
+        "column_name": "class",
+        "is_bare_coding": True,
+    },
+    {
+        "table_name": "encounter",
+        "column_name": "type",
+        "is_array": True,
+    },
+    {
+        "table_name": "encounter",
+        "column_name": "servicetype",
+    },
+    {
+        "table_name": "encounter",
+        "column_name": "priority",
+    },
+    {"table_name": "encounter", "column_name": "reasoncode", "is_array": True},
+    # Medication
+    {
+        "table_name": "medication",
+        "column_name": "code",
+    },
+    # Observation
+    {"table_name": "observation", "column_name": "category", "is_array": True},
+    {
+        "table_name": "observation",
+        "column_name": "code",
+    },
+    # Patient
+    {
+        "table_name": "patient",
+        "column_name": "maritalstatus",
+    },
+]
diff --git a/cumulus_library/studies/discovery/code_detection.py b/cumulus_library/studies/discovery/code_detection.py
@@ -0,0 +1,82 @@
+""" Module for generating encounter codeableConcept table"""
+
+from cumulus_library.base_table_builder import BaseTableBuilder
+from cumulus_library.helper import get_progress_bar, query_console_output
+from cumulus_library.template_sql.templates import get_code_system_pairs
+from cumulus_library.template_sql.utils import (
+    is_codeable_concept_array_populated,
+    is_codeable_concept_populated,
+    is_code_populated,
+)
+
+from cumulus_library.studies.discovery.code_definitions import code_list
+
+
+class CodeDetectionBuilder(BaseTableBuilder):
+    display_text = "Selecting unique code systems..."
+
+    def _check_codes_in_fields(self, code_sources: list[dict], schema, cursor) -> dict:
+        """checks if Coding/CodeableConcept fields are present and populated"""
+
+        with get_progress_bar() as progress:
+            task = progress.add_task(
+                "Discovering available coding systems...",
+                total=len(code_sources),
+            )
+            for code_source in code_sources:
+                if code_source["is_array"]:
+                    code_source["has_data"] = is_codeable_concept_array_populated(
+                        schema,
+                        code_source["table_name"],
+                        code_source["column_name"],
+                        cursor,
+                        allow_partial=False,
+                    )
+                elif code_source["is_bare_coding"]:
+                    code_source["has_data"] = is_code_populated(
+                        schema,
+                        code_source["table_name"],
+                        code_source["column_name"],
+                        cursor,
+                        allow_partial=False,
+                    )
+                else:
+                    code_source["has_data"] = is_codeable_concept_populated(
+                        schema,
+                        code_source["table_name"],
+                        code_source["column_name"],
+                        cursor,
+                        allow_partial=False,
+                    )
+                progress.advance(task)
+        return code_sources
+
+    def prepare_queries(self, cursor: object, schema: str):
+        """Constructs queries related to condition codeableConcept
+
+        :param cursor: A database cursor object
+        :param schema: the schema/db name, matching the cursor
+
+        """
+
+        code_sources = []
+        for code_definition in code_list:
+            if any(
+                x not in code_definition.keys() for x in ["table_name", "column_name"]
+            ):
+                raise KeyError(
+                    "Expected table_name and column_name keys in "
+                    f"{str(code_definition)}"
+                )
+            code_source = {
+                "is_bare_coding": False,
+                "is_array": False,
+                "has_data": False,
+            }
+            for key in code_definition.keys():
+                code_source[key] = code_definition[key]
+            code_sources.append(code_source)
+
+        code_sources = self._check_codes_in_fields(code_sources, schema, cursor)
+        query = get_code_system_pairs("discovery__code_sources", code_sources)
+        self.queries.append(query)
diff --git a/cumulus_library/studies/discovery/code_detection.sql b/cumulus_library/studies/discovery/code_detection.sql
@@ -0,0 +1,127 @@
+-- noqa: disable=all
+/*
+This is a reference output of the SQL generated by builder_code_detection.py 
+<<<<<<< HEAD
+that is used by the core__encounter_type table, against the synthea dataset.
+It is provided as a form of documentation only and will not be invoked directly.
+*/
+
+CREATE TABLE discovery__code_sources AS
+SELECT DISTINCT
+    'condition' AS table_name,
+    'category' AS column_name,
+    t2.row2.code,
+    t2.row2.display,
+    t2.row2.system
+FROM condition,
+UNNEST(category) AS t1 (row1),
+UNNEST(t1.row1.coding) AS t2 (row2)
+UNION
+SELECT DISTINCT
+    'condition' AS table_name,
+    'code' AS column_name,
+    t.row.code,
+    t.row.display,
+    t.row.system
+FROM condition,
+UNNEST(code.coding) AS t (row)
+UNION
+SELECT DISTINCT
+    'documentreference' AS table_name,
+    'type' AS column_name,
+    t.row.code,
+    t.row.display,
+    t.row.system
+FROM documentreference,
+UNNEST(type.coding) AS t (row)
+UNION
+SELECT DISTINCT
+    'documentreference' AS table_name,
+    'category' AS column_name,
+    t2.row2.code,
+    t2.row2.display,
+    t2.row2.system
+FROM documentreference,
+UNNEST(category) AS t1 (row1),
+UNNEST(t1.row1.coding) AS t2 (row2)
+UNION
+SELECT * 
+    FROM ( 
+        VALUES (
+            ('encounter','class', '', '', '')
+        )
+    )
+AS t ( table_name, column_name, code, display, system ) -- noqa: L025
+UNION
+SELECT DISTINCT
+    'encounter' AS table_name,
+    'type' AS column_name,
+    t2.row2.code,
+    t2.row2.display,
+    t2.row2.system
+FROM encounter,
+UNNEST(type) AS t1 (row1),
+UNNEST(t1.row1.coding) AS t2 (row2)
+UNION
+SELECT * 
+    FROM ( 
+        VALUES (
+            ('encounter','servicetype', '', '', '')
+        )
+    )
+AS t ( table_name, column_name, code, display, system ) -- noqa: L025
+UNION
+SELECT * 
+    FROM ( 
+        VALUES (
+            ('encounter','priority', '', '', '')
+        )
+    )
+AS t ( table_name, column_name, code, display, system ) -- noqa: L025
+UNION
+SELECT DISTINCT
+    'encounter' AS table_name,
+    'reasoncode' AS column_name,
+    t2.row2.code,
+    t2.row2.display,
+    t2.row2.system
+FROM encounter,
+UNNEST(reasoncode) AS t1 (row1),
+UNNEST(t1.row1.coding) AS t2 (row2)
+UNION
+SELECT DISTINCT
+    'medication' AS table_name,
+    'code' AS column_name,
+    t.row.code,
+    t.row.display,
+    t.row.system
+FROM medication,
+UNNEST(code.coding) AS t (row)
+UNION
+SELECT DISTINCT
+    'observation' AS table_name,
+    'category' AS column_name,
+    t2.row2.code,
+    t2.row2.display,
+    t2.row2.system
+FROM observation,
+UNNEST(category) AS t1 (row1),
+UNNEST(t1.row1.coding) AS t2 (row2)
+UNION
+SELECT DISTINCT
+    'observation' AS table_name,
+    'code' AS column_name,
+    t.row.code,
+    t.row.display,
+    t.row.system
+FROM observation,
+UNNEST(code.coding) AS t (row)
+UNION
+SELECT DISTINCT
+    'patient' AS table_name,
+    'maritalstatus' AS column_name,
+    t.row.code,
+    t.row.display,
+    t.row.system
+FROM patient,
+UNNEST(maritalstatus.coding) AS t (row)
diff --git a/cumulus_library/studies/discovery/manifest.toml b/cumulus_library/studies/discovery/manifest.toml
@@ -0,0 +1,12 @@
+study_prefix = "discovery"
+
+[table_builder_config]
+file_names = [
+    "code_detection.py",
+]
+
+
+[export_config]
+export_list = [
+    "discovery__code_sources",
+]
diff --git a/cumulus_library/template_sql/code_system_pairs.sql.jinja b/cumulus_library/template_sql/code_system_pairs.sql.jinja
@@ -0,0 +1,44 @@
+CREATE TABLE {{ output_table_name }} AS
+{%- for source in code_system_tables %}
+{%- if source.has_data %}
+{%- if source.is_bare_coding %}
+SELECT DISTINCT
+    '{{ source.table_name }}' AS table_name,
+    '{{ source.column_name }}' AS column_name,
+    {{ source.column_name }}.code,
+    {{ source.column_name }}.display,
+    {{ source.column_name }}.system
+FROM {{ source.table_name }}
+{%- elif source.is_array %}
+SELECT DISTINCT
+    '{{ source.table_name }}' AS table_name,
+    '{{ source.column_name }}' AS column_name,
+    t2.row2.code,
+    t2.row2.display,
+    t2.row2.system
+FROM {{ source.table_name }},
+UNNEST({{ source.column_name }}) AS t1 (row1),
+UNNEST(t1.row1.coding) AS t2 (row2)
+{%- else %}
+SELECT DISTINCT
+    '{{ source.table_name }}' AS table_name,
+    '{{ source.column_name }}' AS column_name,
+    t.row.code,
+    t.row.display,
+    t.row.system
+FROM {{ source.table_name }},
+UNNEST({{source.column_name}}.coding) AS t (row)
+{%- endif %}
+{%- else %}
+SELECT * 
+    FROM ( 
+        VALUES (
+            ('{{ source.table_name }}','{{ source.column_name }}', '', '', '')
+        )
+    )
+AS t ( table_name, column_name, code, display, system ) -- noqa: L025
+{%- endif -%}
+{%- if not loop.last %}
+UNION
+{%- endif -%}
+{%  endfor %}
diff --git a/cumulus_library/template_sql/templates.py b/cumulus_library/template_sql/templates.py
@@ -95,6 +95,15 @@ def __init__(
         self.is_array = is_array
 
 
+def get_code_system_pairs(output_table_name: str, code_system_tables: list) -> str:
+    """Extracts code system details as a standalone table"""
+    path = Path(__file__).parent
+    with open(f"{path}/code_system_pairs.sql.jinja") as code_system_pairs:
+        return Template(code_system_pairs.read()).render(
+            output_table_name=output_table_name, code_system_tables=code_system_tables
+        )
+
+
 def get_codeable_concept_denormalize_query(config: CodeableConceptConfig) -> str:
     """extracts codeable concepts from a specified table.