Skip to content

Commit

Permalink
Discovery study (#134)
Browse files Browse the repository at this point in the history
* Discovery study

* Schema validation, code def cleanup
  • Loading branch information
dogversioning committed Oct 20, 2023
1 parent 73f3b1d commit cd3b776
Show file tree
Hide file tree
Showing 11 changed files with 483 additions and 33 deletions.
3 changes: 3 additions & 0 deletions .sqlfluffignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,8 @@ show_views.sql.jinja
# but the table in question builds
codeable_concept_denormalize.sql.jinja

# This template causes sqlfluff to hang - need to try a version uprev at some point
code_system_pairs.sql.jinja

# This is a common destination for debugging sql generation
output.sql
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ A framework for designing, executing, and distributing SQL queries packaged as "
## Installing

For end users, just run `pip install cumulus-library`.
For running from source, checkout the repo, and at the project root run `pip install -e .`.


For running from source, checkout the repo, and at the project root run `pip install -e .`. If you're not working on new features, you should check out the tag associated with the latest release.

For more information, [browse the documentation](https://docs.smarthealthit.org/cumulus/library).
2 changes: 2 additions & 0 deletions cumulus_library/.sqlfluff
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ code_systems = ["http://snomed.info/sct", "http://hl7.org/fhir/sid/icd-10-cm"]
col_type_list = ["a string","b string"]
cc_columns = [{"name": "baz", "is_array": True}, {"name": "foobar", "is_array": False}]
cc_column = 'code'
code_system_tables = [{table_name":"hasarray","column_name":"acol","is_bare_coding":False,"is_array":True, "has_data": True},{"table_name":"noarray","column_name":"col","is_bare_coding":False,"is_array":False, "has_data": True}{"table_name":"bare","column_name":"bcol","is_bare_coding":True,"is_array":False, "has_data": True},{"table_name":"empty","column_name":"empty","is_bare_coding":False,"is_array":False, "has_data": False}]
column_name = 'bar'
conditions = ["1 > 0", "1 < 2"]
dataset = [["foo","foo"],["bar","bar"]]
Expand All @@ -29,6 +30,7 @@ fhir_extension = fhir_extension
fhir_resource = patient
id = 'id'
medication_datasources = {"by_contained_ref" : True, "by_external_ref" : True}
output_table_name = 'created_table'
prefix = Test
schema_name = test_schema
source_table = source_table
Expand Down
57 changes: 57 additions & 0 deletions cumulus_library/studies/discovery/code_definitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# A collection of codes & codeableConcepts to extract available codes from.
# Two optional booleans are available for use:
# - is_array: the field in question is an array of CodeableConcepts
# - is_bare_coding: the field in question is a Coding not wrapped in concepts
# - otherwise, it is assumed to be a 0..1 or 1..1 CodeableConcept
# TODO: if another state is needed, move to an Enum

code_list = [
# Condition
{"table_name": "condition", "column_name": "category", "is_array": True},
{
"table_name": "condition",
"column_name": "code",
},
# DocumentReference
{
"table_name": "documentreference",
"column_name": "type",
},
{"table_name": "documentreference", "column_name": "category", "is_array": True},
# Encounter
{
"table_name": "encounter",
"column_name": "class",
"is_bare_coding": True,
},
{
"table_name": "encounter",
"column_name": "type",
"is_array": True,
},
{
"table_name": "encounter",
"column_name": "servicetype",
},
{
"table_name": "encounter",
"column_name": "priority",
},
{"table_name": "encounter", "column_name": "reasoncode", "is_array": True},
# Medication
{
"table_name": "medication",
"column_name": "code",
},
# Observation
{"table_name": "observation", "column_name": "category", "is_array": True},
{
"table_name": "observation",
"column_name": "code",
},
# Patient
{
"table_name": "patient",
"column_name": "maritalstatus",
},
]
82 changes: 82 additions & 0 deletions cumulus_library/studies/discovery/code_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
""" Module for generating encounter codeableConcept table"""

from cumulus_library.base_table_builder import BaseTableBuilder
from cumulus_library.helper import get_progress_bar, query_console_output
from cumulus_library.template_sql.templates import get_code_system_pairs
from cumulus_library.template_sql.utils import (
is_codeable_concept_array_populated,
is_codeable_concept_populated,
is_code_populated,
)

from cumulus_library.studies.discovery.code_definitions import code_list


class CodeDetectionBuilder(BaseTableBuilder):
display_text = "Selecting unique code systems..."

def _check_codes_in_fields(self, code_sources: list[dict], schema, cursor) -> dict:
"""checks if Coding/CodeableConcept fields are present and populated"""

with get_progress_bar() as progress:
task = progress.add_task(
"Discovering available coding systems...",
total=len(code_sources),
)
for code_source in code_sources:
if code_source["is_array"]:
code_source["has_data"] = is_codeable_concept_array_populated(
schema,
code_source["table_name"],
code_source["column_name"],
cursor,
allow_partial=False,
)
elif code_source["is_bare_coding"]:
code_source["has_data"] = is_code_populated(
schema,
code_source["table_name"],
code_source["column_name"],
cursor,
allow_partial=False,
)
else:
code_source["has_data"] = is_codeable_concept_populated(
schema,
code_source["table_name"],
code_source["column_name"],
cursor,
allow_partial=False,
)
progress.advance(task)
return code_sources

def prepare_queries(self, cursor: object, schema: str):
"""Constructs queries related to condition codeableConcept
:param cursor: A database cursor object
:param schema: the schema/db name, matching the cursor
"""

code_sources = []
for code_definition in code_list:
if any(
x not in code_definition.keys() for x in ["table_name", "column_name"]
):
raise KeyError(
"Expected table_name and column_name keys in "
f"{str(code_definition)}"
)
code_source = {
"is_bare_coding": False,
"is_array": False,
"has_data": False,
}
for key in code_definition.keys():
code_source[key] = code_definition[key]
code_sources.append(code_source)

code_sources = self._check_codes_in_fields(code_sources, schema, cursor)
query = get_code_system_pairs("discovery__code_sources", code_sources)
self.queries.append(query)
127 changes: 127 additions & 0 deletions cumulus_library/studies/discovery/code_detection.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
-- noqa: disable=all
/*
This is a reference output of the SQL generated by builder_code_detection.py
<<<<<<< HEAD
that is used by the core__encounter_type table, against the synthea dataset.
It is provided as a form of documentation only and will not be invoked directly.
*/

CREATE TABLE discovery__code_sources AS
SELECT DISTINCT
'condition' AS table_name,
'category' AS column_name,
t2.row2.code,
t2.row2.display,
t2.row2.system
FROM condition,
UNNEST(category) AS t1 (row1),
UNNEST(t1.row1.coding) AS t2 (row2)
UNION
SELECT DISTINCT
'condition' AS table_name,
'code' AS column_name,
t.row.code,
t.row.display,
t.row.system
FROM condition,
UNNEST(code.coding) AS t (row)
UNION
SELECT DISTINCT
'documentreference' AS table_name,
'type' AS column_name,
t.row.code,
t.row.display,
t.row.system
FROM documentreference,
UNNEST(type.coding) AS t (row)
UNION
SELECT DISTINCT
'documentreference' AS table_name,
'category' AS column_name,
t2.row2.code,
t2.row2.display,
t2.row2.system
FROM documentreference,
UNNEST(category) AS t1 (row1),
UNNEST(t1.row1.coding) AS t2 (row2)
UNION
SELECT *
FROM (
VALUES (
('encounter','class', '', '', '')
)
)
AS t ( table_name, column_name, code, display, system ) -- noqa: L025
UNION
SELECT DISTINCT
'encounter' AS table_name,
'type' AS column_name,
t2.row2.code,
t2.row2.display,
t2.row2.system
FROM encounter,
UNNEST(type) AS t1 (row1),
UNNEST(t1.row1.coding) AS t2 (row2)
UNION
SELECT *
FROM (
VALUES (
('encounter','servicetype', '', '', '')
)
)
AS t ( table_name, column_name, code, display, system ) -- noqa: L025
UNION
SELECT *
FROM (
VALUES (
('encounter','priority', '', '', '')
)
)
AS t ( table_name, column_name, code, display, system ) -- noqa: L025
UNION
SELECT DISTINCT
'encounter' AS table_name,
'reasoncode' AS column_name,
t2.row2.code,
t2.row2.display,
t2.row2.system
FROM encounter,
UNNEST(reasoncode) AS t1 (row1),
UNNEST(t1.row1.coding) AS t2 (row2)
UNION
SELECT DISTINCT
'medication' AS table_name,
'code' AS column_name,
t.row.code,
t.row.display,
t.row.system
FROM medication,
UNNEST(code.coding) AS t (row)
UNION
SELECT DISTINCT
'observation' AS table_name,
'category' AS column_name,
t2.row2.code,
t2.row2.display,
t2.row2.system
FROM observation,
UNNEST(category) AS t1 (row1),
UNNEST(t1.row1.coding) AS t2 (row2)
UNION
SELECT DISTINCT
'observation' AS table_name,
'code' AS column_name,
t.row.code,
t.row.display,
t.row.system
FROM observation,
UNNEST(code.coding) AS t (row)
UNION
SELECT DISTINCT
'patient' AS table_name,
'maritalstatus' AS column_name,
t.row.code,
t.row.display,
t.row.system
FROM patient,
UNNEST(maritalstatus.coding) AS t (row)
12 changes: 12 additions & 0 deletions cumulus_library/studies/discovery/manifest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
study_prefix = "discovery"

[table_builder_config]
file_names = [
"code_detection.py",
]


[export_config]
export_list = [
"discovery__code_sources",
]
44 changes: 44 additions & 0 deletions cumulus_library/template_sql/code_system_pairs.sql.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
CREATE TABLE {{ output_table_name }} AS
{%- for source in code_system_tables %}
{%- if source.has_data %}
{%- if source.is_bare_coding %}
SELECT DISTINCT
'{{ source.table_name }}' AS table_name,
'{{ source.column_name }}' AS column_name,
{{ source.column_name }}.code,
{{ source.column_name }}.display,
{{ source.column_name }}.system
FROM {{ source.table_name }}
{%- elif source.is_array %}
SELECT DISTINCT
'{{ source.table_name }}' AS table_name,
'{{ source.column_name }}' AS column_name,
t2.row2.code,
t2.row2.display,
t2.row2.system
FROM {{ source.table_name }},
UNNEST({{ source.column_name }}) AS t1 (row1),
UNNEST(t1.row1.coding) AS t2 (row2)
{%- else %}
SELECT DISTINCT
'{{ source.table_name }}' AS table_name,
'{{ source.column_name }}' AS column_name,
t.row.code,
t.row.display,
t.row.system
FROM {{ source.table_name }},
UNNEST({{source.column_name}}.coding) AS t (row)
{%- endif %}
{%- else %}
SELECT *
FROM (
VALUES (
('{{ source.table_name }}','{{ source.column_name }}', '', '', '')
)
)
AS t ( table_name, column_name, code, display, system ) -- noqa: L025
{%- endif -%}
{%- if not loop.last %}
UNION
{%- endif -%}
{% endfor %}
9 changes: 9 additions & 0 deletions cumulus_library/template_sql/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,15 @@ def __init__(
self.is_array = is_array


def get_code_system_pairs(output_table_name: str, code_system_tables: list) -> str:
"""Extracts code system details as a standalone table"""
path = Path(__file__).parent
with open(f"{path}/code_system_pairs.sql.jinja") as code_system_pairs:
return Template(code_system_pairs.read()).render(
output_table_name=output_table_name, code_system_tables=code_system_tables
)


def get_codeable_concept_denormalize_query(config: CodeableConceptConfig) -> str:
"""extracts codeable concepts from a specified table.
Expand Down
Loading

0 comments on commit cd3b776

Please sign in to comment.