-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Process new ETL gpt tables into symptom lists (#43)
- Loading branch information
Showing
10 changed files
with
283 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
"""SQL generation for cumulus covid symptom analysis""" | ||
|
||
__version__ = "2.0.0" | ||
__version__ = "2.1.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
-- Boolean symptom columns, paired with the display labels Chart Review expects
{% set symptom_labels = {
    'congestion_or_runny_nose': 'Congestion or runny nose',
    'cough': 'Cough',
    'diarrhea': 'Diarrhea',
    'dyspnea': 'Dyspnea',
    'fatigue': 'Fatigue',
    'fever_or_chills': 'Fever or chills',
    'headache': 'Headache',
    'loss_of_taste_or_smell': 'Loss of taste or smell',
    'muscle_or_body_aches': 'Muscle or body aches',
    'nausea_or_vomiting': 'Nausea or vomiting',
    'sore_throat': 'Sore throat',
} -%}

CREATE TABLE covid_symptom__symptom_{{ model }} AS

-- One pass over the results table per symptom, keeping the rows where it was flagged
{% for column, label in symptom_labels.items() %}
SELECT
    CONCAT('Encounter/', nr.encounter_id) AS encounter_ref,
    CONCAT('DocumentReference/', nr.docref_id) AS docref_ref,
    '{{ label }}' AS symptom_display
FROM covid_symptom__nlp_results_{{ model }} AS nr
WHERE nr.symptoms.{{ column }}
UNION
{% endfor %}

-- Rows where every symptom flag is false get a single empty symptom label.
-- Chart review will recognize this as "reviewed, but did not find anything".
SELECT
    CONCAT('Encounter/', nr.encounter_id) AS encounter_ref,
    CONCAT('DocumentReference/', nr.docref_id) AS docref_ref,
    '' AS symptom_display
FROM covid_symptom__nlp_results_{{ model }} AS nr
WHERE
{% for column in symptom_labels %}
    {% if not loop.first %}AND {% endif %}NOT nr.symptoms.{{ column }}
{% endfor %}
;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
"""Builder for the ChatGPT symptoms tables.""" | ||
|
||
import os | ||
|
||
import cumulus_library | ||
import jinja2 | ||
|
||
|
||
class GptBuilder(cumulus_library.BaseTableBuilder):
    """Table builder that renders the ChatGPT symptom SQL once per GPT model."""

    display_text = "Creating ChatGPT symptom tables..."

    def prepare_queries(self, *args, **kwargs):
        """Append one rendered ``builder_gpt`` query per model to ``self.queries``."""
        self.queries += [
            self.render_sql("builder_gpt", model=model) for model in ("gpt35", "gpt4")
        ]

    @staticmethod
    def render_sql(template: str, **kwargs) -> str:
        """Render ``<template>.jinja`` from this module's directory.

        :param template: template file name, without the ``.jinja`` extension
        :param kwargs: variables exposed to the template
        :returns: the rendered SQL text
        """
        path = os.path.dirname(__file__)
        # NOTE(review): autoescape=True HTML-escapes substituted values, so a value
        # containing ', ", <, > or & would reach the SQL entity-encoded. Kept as-is
        # because the current template values contain none of those characters.
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(path), autoescape=True)
        # Load through the loader (which reads UTF-8) instead of a bare open() that
        # depends on the platform's default encoding.
        return env.get_template(f"{template}.jinja").render(**kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
"""Gpt unit tests""" | ||
|
||
import datetime | ||
import os | ||
import tempfile | ||
import unittest | ||
|
||
import duckdb | ||
import pandas | ||
from cumulus_library import cli | ||
|
||
|
||
class GptTestCase(unittest.TestCase):
    """Test case for the gpt symptom tables."""

    def setUp(self):
        super().setUp()
        self.maxDiff = None  # never truncate assertion diffs

    @staticmethod
    def register(db: duckdb.DuckDBPyConnection, name: str, table: pandas.DataFrame) -> None:
        """Create a table called `name` in `db` holding the contents of `table`."""
        # Expose the DataFrame under a temporary name, then copy it into a real table
        db.register(f"{name}_df", table)
        db.sql(f"CREATE TABLE {name} AS SELECT * FROM {name}_df")

    def make_core_tables(self, db: duckdb.DuckDBPyConnection) -> None:
        """Make one row each of the encounter, patient, condition, docref, lab,
        ED note, and NLP result tables."""
        encounter = pandas.DataFrame(
            {
                "period_start_day": [datetime.date(2016, 10, 10)],
                "period_start_week": [datetime.date(2016, 10, 10)],
                "period_start_month": [datetime.date(2016, 10, 1)],
                "period_end_day": [datetime.date(2016, 10, 11)],
                "age_at_visit": [12],
                "status": ["finished"],
                "class_code": [None],
                "class_display": [None],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__encounter", encounter)

        patient = pandas.DataFrame(
            {
                "gender": ["unknown"],
                "race_display": [None],
                "ethnicity_display": [None],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__patient", patient)

        condition = pandas.DataFrame(
            {
                "recordeddate_day": [datetime.date(2016, 10, 10)],
                "recordeddate_week": [datetime.date(2016, 10, 10)],
                "recordeddate_month": [datetime.date(2016, 10, 1)],
                "recordeddate_year": [datetime.date(2016, 1, 1)],
                "code": ["U07.1"],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__condition", condition)

        docref = pandas.DataFrame(
            {
                "author_day": [datetime.date(2016, 10, 10)],
                "author_week": [datetime.date(2016, 10, 10)],
                "author_month": [datetime.date(2016, 10, 1)],
                "author_year": [datetime.date(2016, 1, 1)],
                "type_code": ["34878-9"],
                "type_display": ["Emergency medicine Note"],
                "documentreference_ref": ["DocumentReference/D1"],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__documentreference", docref)

        lab = pandas.DataFrame(
            {
                "observation_code": ["94309-2"],
                "effectivedatetime_day": [datetime.date(2016, 10, 10)],
                "effectivedatetime_week": [datetime.date(2016, 10, 10)],
                "effectivedatetime_month": [datetime.date(2016, 10, 1)],
                "valuecodeableconcept_code": ["10828004"],
                "observation_ref": ["Observation/O1"],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__observation_lab", lab)

        ed_note = pandas.DataFrame(
            {
                "code": ["34878-9"],
                "from_code": ["149798455"],
            }
        )
        self.register(db, "core__ed_note", ed_note)

        nlp = pandas.DataFrame(
            {
                "docref_id": ["D1"],
                "encounter_id": ["E1"],
                "subject_id": ["P1"],
                "match": [
                    {
                        "conceptattributes": [
                            {"cui": "C0027424"},
                        ],
                        "text": "Congestion",
                    }
                ],
            }
        )
        self.register(db, "covid_symptom__nlp_results", nlp)

    def make_gpt_table(self, db: duckdb.DuckDBPyConnection, name: str, **kwargs) -> None:
        """Create `covid_symptom__nlp_results_<name>` with a single row.

        Every symptom flag defaults to False; pass `symptom=True` keyword
        arguments to override individual flags.
        """
        symptoms = {
            "congestion_or_runny_nose": False,
            "cough": False,
            "diarrhea": False,
            "dyspnea": False,
            "fatigue": False,
            "fever_or_chills": False,
            "headache": False,
            "loss_of_taste_or_smell": False,
            "muscle_or_body_aches": False,
            "nausea_or_vomiting": False,
            "sore_throat": False,
        }
        symptoms.update(kwargs)
        table = pandas.DataFrame(
            {
                "encounter_id": ["E1"],
                "docref_id": ["D1"],
                "symptoms": [symptoms],
            }
        )
        self.register(db, f"covid_symptom__nlp_results_{name}", table)

    def test_happy_path(self) -> None:
        """Runs the study on some input data and spot-checks the gpt results"""
        test_dir = os.path.dirname(__file__)
        root_dir = os.path.dirname(test_dir)
        study_dir = f"{root_dir}/cumulus_library_covid"

        with tempfile.TemporaryDirectory() as tmpdir:
            db_path = f"{tmpdir}/duck.db"

            db = duckdb.connect(db_path)
            try:
                self.make_core_tables(db)
                self.make_gpt_table(db, "gpt35", cough=True, fever_or_chills=True)
                self.make_gpt_table(db, "gpt4")  # test that we mark no-symptom-found docrefs
            finally:
                # Always release the file, even if fixture creation fails
                db.close()

            cli.main(
                [
                    "build",
                    # "--verbose",
                    "--target=covid_symptom",
                    f"--study-dir={study_dir}",
                    "--db-type=duckdb",
                    f"--database={db_path}",
                ]
            )

            db = duckdb.connect(db_path)
            try:
                # Confirm we flag the right symptoms when present
                rel = db.sql("SELECT * FROM covid_symptom__symptom_gpt35")
                rows = rel.order("symptom_display").fetchall()
                self.assertEqual(
                    [
                        ("Encounter/E1", "DocumentReference/D1", "Cough"),
                        ("Encounter/E1", "DocumentReference/D1", "Fever or chills"),
                    ],
                    rows,
                )

                # Confirm we flag a no-results docref too
                rel = db.sql("SELECT * FROM covid_symptom__symptom_gpt4")
                rows = rel.order("symptom_display").fetchall()
                self.assertEqual(
                    [
                        ("Encounter/E1", "DocumentReference/D1", ""),
                    ],
                    rows,
                )
            finally:
                # Close before the temporary directory is removed so the database
                # file is not still open during cleanup.
                db.close()