Skip to content

Commit

Permalink
Process new ETL gpt tables into symptom lists (#43)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikix authored Aug 22, 2024
1 parent dc237d7 commit 9cc4d46
Show file tree
Hide file tree
Showing 10 changed files with 283 additions and 5 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ jobs:
with:
python-version: "3.11"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install .[tests]
- name: Test with pytest
run: python -m pytest

lint:
runs-on: ubuntu-latest
steps:
Expand Down
10 changes: 8 additions & 2 deletions RUNNING.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ You should now have all the interesting results sitting in Athena.

In Athena's web console, run these commands and download the CSV results,
using the given filenames (we will refer back to these filenames later):
- **ctakes.csv**: `select encounter_ref, symptom_display from covid_symptom__symptom_ctakes_negation`
- **ctakes.csv** (if you ran cTAKES): `select encounter_ref, symptom_display from covid_symptom__symptom_ctakes_negation`
- **gpt35.csv** (if you ran ChatGPT 3.5): `select encounter_ref, symptom_display from covid_symptom__symptom_gpt35`
- **gpt4.csv** (if you ran ChatGPT 4): `select encounter_ref, symptom_display from covid_symptom__symptom_gpt4`
- **docrefs.csv**: `select distinct docref_id from covid_symptom__symptom_ctakes_negation`
- **icd10.csv**: `select encounter_ref, substring(icd10_display, 7) as symptom_display from covid_symptom__symptom_icd10`

Expand Down Expand Up @@ -115,7 +117,7 @@ Save this file as `labelstudio-export.json` in a new folder.
## 8. Set up `chart-review`

- Run `pip install chart-review`
- Copy `ctakes.csv` and `icd10.csv` from step 3 above into the same folder
- Copy the `.csv` files from step 3 above into the same folder
you used for `labelstudio-export.json` above (the "chart review folder").
- Add a new `config.yaml` file in that folder:
```yaml
Expand All @@ -137,6 +139,10 @@ annotators:
human2: 2
ctakes:
filename: ctakes.csv
gpt35:
filename: gpt35.csv
gpt4:
filename: gpt4.csv
icd10:
filename: icd10.csv
```
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library_covid/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""SQL generation for cumulus covid symptom analysis"""

__version__ = "2.0.0"
__version__ = "2.1.0"
41 changes: 41 additions & 0 deletions cumulus_library_covid/builder_gpt.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- Map boolean column names to symptom labels that Chart Review will use
{% set cols = {
'congestion_or_runny_nose': 'Congestion or runny nose',
'cough': 'Cough',
'diarrhea': 'Diarrhea',
'dyspnea': 'Dyspnea',
'fatigue': 'Fatigue',
'fever_or_chills': 'Fever or chills',
'headache': 'Headache',
'loss_of_taste_or_smell': 'Loss of taste or smell',
'muscle_or_body_aches': 'Muscle or body aches',
'nausea_or_vomiting': 'Nausea or vomiting',
'sore_throat': 'Sore throat',
} -%}

-- The `model` template variable picks both the source table
-- (covid_symptom__nlp_results_<model>) and the table created here.
CREATE TABLE covid_symptom__symptom_{{ model }} AS

-- Iterate table once for each symptom, noting the cases where it was present
-- The per-symptom branches are joined with UNION, which also removes duplicate rows.
{% for col_name, symptom_label in cols.items() %}
SELECT
CONCAT('Encounter/', nr.encounter_id) AS encounter_ref,
CONCAT('DocumentReference/', nr.docref_id) AS docref_ref,
'{{ symptom_label }}' AS symptom_display
FROM covid_symptom__nlp_results_{{ model }} AS nr
WHERE nr.symptoms.{{ col_name }}
UNION
{% endfor %}

-- Also capture encounters with no symptoms (as a single empty symptom label).
-- Chart review will recognize this as "reviewed, but did not find anything".
-- NOTE(review): if any symptom flag is NULL, both `flag` and `NOT flag` evaluate
-- to NULL, so that row would match neither the branches above nor this one.
-- Presumably the upstream table always holds real booleans; confirm.
SELECT
CONCAT('Encounter/', nr.encounter_id) AS encounter_ref,
CONCAT('DocumentReference/', nr.docref_id) AS docref_ref,
'' AS symptom_display
FROM covid_symptom__nlp_results_{{ model }} AS nr
WHERE
{% for col_name in cols %}
NOT nr.symptoms.{{ col_name }}
{%- if not loop.last %} AND{%- endif %}
{% endfor %}
;
24 changes: 24 additions & 0 deletions cumulus_library_covid/builder_gpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Builder for the ChatGPT symptoms tables."""

import os

import cumulus_library
import jinja2


class GptBuilder(cumulus_library.BaseTableBuilder):
    """Table builder for the ChatGPT symptom tables.

    Renders the ``builder_gpt`` Jinja template once per model name and appends
    the rendered SQL to ``self.queries``.
    """

    display_text = "Creating ChatGPT symptom tables..."

    def prepare_queries(self, *args, **kwargs):
        """Append one rendered ``builder_gpt`` query per model to ``self.queries``.

        Positional and keyword arguments are accepted but not used here.
        """
        self.queries += [
            self.render_sql("builder_gpt", model="gpt35"),
            self.render_sql("builder_gpt", model="gpt4"),
        ]

    @staticmethod
    def render_sql(template: str, **kwargs) -> str:
        """Render a ``.jinja`` template that sits in this module's directory.

        Args:
            template: Template file name without the ``.jinja`` suffix.
            **kwargs: Variables exposed to the template while rendering.

        Returns:
            The rendered template text.

        Raises:
            FileNotFoundError: If ``<template>.jinja`` is not next to this module.
        """
        path = os.path.dirname(__file__)
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding (Jinja's own file loader also uses UTF-8).
        with open(f"{path}/{template}.jinja", encoding="utf-8") as file:
            source = file.read()

        # The loader lets the template include/import sibling templates.
        loader = jinja2.FileSystemLoader(path)
        # NOTE(review): autoescape HTML-escapes rendered values, so a variable
        # containing quotes, `&`, `<` or `>` would be altered in the SQL output.
        env = jinja2.Environment(loader=loader, autoescape=True)
        return env.from_string(source).render(**kwargs)
4 changes: 2 additions & 2 deletions cumulus_library_covid/counts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pathlib import Path

from cumulus_library.statistics.counts import CountsBuilder
import cumulus_library


class CovidCountsBuilder(CountsBuilder):
class CovidCountsBuilder(cumulus_library.CountsBuilder):
display_text = "Creating covid counts..."

def count_dx(self, duration="week"):
Expand Down
5 changes: 5 additions & 0 deletions cumulus_library_covid/manifest.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
study_prefix = "covid_symptom"

[table_builder_config]
file_names = [
"builder_gpt.py",
]

[sql_config]
file_names = [
# "define_age_pediatric.sql",
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ dev = [
# if you update the ruff version, also update .pre-commit-config.yaml
"ruff < 0.6",
]
tests = [
"duckdb",
"pandas",
]

[tool.ruff]
line-length = 100
Expand All @@ -46,3 +50,5 @@ select = [
"S", # bandit security warnings
"UP", # alert you when better syntax is available in your python version
]
[tool.ruff.lint.per-file-ignores]
"tests/**" = ["S"] # tests do suspicious stuff that's fine, actually
Empty file added tests/__init__.py
Empty file.
188 changes: 188 additions & 0 deletions tests/test_gpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
"""Gpt unit tests"""

import datetime
import os
import tempfile
import unittest

import duckdb
import pandas
from cumulus_library import cli


class GptTestCase(unittest.TestCase):
    """Test case for the gpt symptom tables."""

    def setUp(self):
        """Disable diff truncation so failing comparisons print in full."""
        super().setUp()
        self.maxDiff = None

    @staticmethod
    def register(db: duckdb.DuckDBPyConnection, name: str, table: pandas.DataFrame) -> None:
        """Create a table called ``name`` in ``db`` holding the rows of ``table``.

        The frame is first registered as ``<name>_df`` and then copied into a
        table with ``CREATE TABLE ... AS SELECT``.
        """
        db.register(f"{name}_df", table)
        db.sql(f"CREATE TABLE {name} AS SELECT * FROM {name}_df")

    def make_core_tables(self, db: duckdb.DuckDBPyConnection) -> None:
        """Create single-row input tables that all describe one encounter.

        Registers ``core__encounter``, ``core__patient``, ``core__condition``,
        ``core__documentreference``, ``core__observation_lab``, ``core__ed_note``
        and ``covid_symptom__nlp_results``, all tied to encounter ``E1``,
        patient ``P1`` and docref ``D1`` where those columns exist.
        """
        encounter = pandas.DataFrame(
            {
                "period_start_day": [datetime.date(2016, 10, 10)],
                "period_start_week": [datetime.date(2016, 10, 10)],
                "period_start_month": [datetime.date(2016, 10, 1)],
                "period_end_day": [datetime.date(2016, 10, 11)],
                "age_at_visit": [12],
                "status": ["finished"],
                "class_code": [None],
                "class_display": [None],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__encounter", encounter)

        patient = pandas.DataFrame(
            {
                "gender": ["unknown"],
                "race_display": [None],
                "ethnicity_display": [None],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__patient", patient)

        condition = pandas.DataFrame(
            {
                "recordeddate_day": [datetime.date(2016, 10, 10)],
                "recordeddate_week": [datetime.date(2016, 10, 10)],
                "recordeddate_month": [datetime.date(2016, 10, 1)],
                "recordeddate_year": [datetime.date(2016, 1, 1)],
                "code": ["U07.1"],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__condition", condition)

        docref = pandas.DataFrame(
            {
                "author_day": [datetime.date(2016, 10, 10)],
                "author_week": [datetime.date(2016, 10, 10)],
                "author_month": [datetime.date(2016, 10, 1)],
                "author_year": [datetime.date(2016, 1, 1)],
                "type_code": ["34878-9"],
                "type_display": ["Emergency medicine Note"],
                "documentreference_ref": ["DocumentReference/D1"],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__documentreference", docref)

        lab = pandas.DataFrame(
            {
                "observation_code": ["94309-2"],
                "effectivedatetime_day": [datetime.date(2016, 10, 10)],
                "effectivedatetime_week": [datetime.date(2016, 10, 10)],
                "effectivedatetime_month": [datetime.date(2016, 10, 1)],
                "valuecodeableconcept_code": ["10828004"],
                "observation_ref": ["Observation/O1"],
                "encounter_ref": ["Encounter/E1"],
                "subject_ref": ["Patient/P1"],
            }
        )
        self.register(db, "core__observation_lab", lab)

        ed_note = pandas.DataFrame(
            {
                "code": ["34878-9"],
                "from_code": ["149798455"],
            }
        )
        self.register(db, "core__ed_note", ed_note)

        nlp = pandas.DataFrame(
            {
                "docref_id": ["D1"],
                "encounter_id": ["E1"],
                "subject_id": ["P1"],
                "match": [
                    {
                        "conceptattributes": [
                            {"cui": "C0027424"},
                        ],
                        "text": "Congestion",
                    }
                ],
            }
        )
        self.register(db, "covid_symptom__nlp_results", nlp)

    def make_gpt_table(self, db: duckdb.DuckDBPyConnection, name: str, **kwargs) -> None:
        """Create ``covid_symptom__nlp_results_<name>`` with one row for E1/D1.

        Every symptom flag defaults to False; pass ``symptom=True`` keyword
        arguments to mark individual symptoms as present.
        """
        symptoms = {
            "congestion_or_runny_nose": False,
            "cough": False,
            "diarrhea": False,
            "dyspnea": False,
            "fatigue": False,
            "fever_or_chills": False,
            "headache": False,
            "loss_of_taste_or_smell": False,
            "muscle_or_body_aches": False,
            "nausea_or_vomiting": False,
            "sore_throat": False,
        }
        symptoms.update(kwargs)
        table = pandas.DataFrame(
            {
                "encounter_id": ["E1"],
                "docref_id": ["D1"],
                "symptoms": [symptoms],
            }
        )
        self.register(db, f"covid_symptom__nlp_results_{name}", table)

    def test_happy_path(self) -> None:
        """Runs the study on some input data and spot-checks the gpt results"""
        test_dir = os.path.dirname(__file__)
        root_dir = os.path.dirname(test_dir)
        study_dir = f"{root_dir}/cumulus_library_covid"

        with tempfile.TemporaryDirectory() as tmpdir:
            db = duckdb.connect(f"{tmpdir}/duck.db")
            try:
                self.make_core_tables(db)
                self.make_gpt_table(db, "gpt35", cough=True, fever_or_chills=True)
                self.make_gpt_table(db, "gpt4")  # test that we mark no-symptom-found docrefs
            finally:
                # Always release the file: the CLI below opens the same database.
                db.close()

            cli.main(
                [
                    "build",
                    # "--verbose",
                    "--target=covid_symptom",
                    f"--study-dir={study_dir}",
                    "--db-type=duckdb",
                    f"--database={tmpdir}/duck.db",
                ]
            )

            db = duckdb.connect(f"{tmpdir}/duck.db")
            try:
                # Confirm we flag the right symptoms when present
                rel = db.sql("SELECT * FROM covid_symptom__symptom_gpt35")
                rows = rel.order("symptom_display").fetchall()
                self.assertEqual(
                    [
                        ("Encounter/E1", "DocumentReference/D1", "Cough"),
                        ("Encounter/E1", "DocumentReference/D1", "Fever or chills"),
                    ],
                    rows,
                )

                # Confirm we flag a no-results docref too
                rel = db.sql("SELECT * FROM covid_symptom__symptom_gpt4")
                rows = rel.order("symptom_display").fetchall()
                self.assertEqual(
                    [
                        ("Encounter/E1", "DocumentReference/D1", ""),
                    ],
                    rows,
                )
            finally:
                # Close before the temporary directory (and the database file
                # inside it) is removed, even when an assertion fails.
                db.close()

0 comments on commit 9cc4d46

Please sign in to comment.