Skip to content

Commit

Permalink
bugfix: bulk insert serialization of data (#136)
Browse files Browse the repository at this point in the history
## Description
Data can be inserted when linking or seeding. In the former we were
inserting the data as a dictionary into the Patient.data column, whereas
in the latter we inserted it as a string. This caused issues on
subsequent reads of the data.

The call to convert the PIIRecord into a value for the database has been
standardized in PIIRecord.to_dict(), and the bulk insert method (which
is used in the seeding process) has been updated to use this method.
  • Loading branch information
ericbuckley authored Nov 19, 2024
1 parent 7bd7b09 commit 3182485
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/recordlinker/database/mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def bulk_insert_patients(
pat_data = [
{
"person_id": person and person.id,
"_data": record.to_json(prune_empty=True),
"_data": record.to_dict(prune_empty=True),
"external_patient_id": record.external_id,
"external_person_id": external_person_id,
"external_person_source": "IRIS" if external_person_id else None,
Expand Down
5 changes: 1 addition & 4 deletions src/recordlinker/models/mpi.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import enum
import json
import uuid

from sqlalchemy import orm
Expand Down Expand Up @@ -96,12 +95,10 @@ def record(self, value):
from recordlinker.schemas import pii

assert isinstance(value, pii.PIIRecord), "Expected a PIIRecord object"
# convert the data to a JSON string, then load it back as a dictionary
# this is necessary to ensure all data elements are JSON serializable
# recursively remove all None and unset values from the data
# this is an optimization to reduce the amount of data stored in the
# database, if a value is empty, no need to store it
self._data = json.loads(value.to_json(prune_empty=True))
self._data = value.to_dict(prune_empty=True)
if hasattr(self, "_record"):
# if the record property is cached, delete it
del self._record
Expand Down
10 changes: 10 additions & 0 deletions src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import enum
import json
import re
import typing

Expand Down Expand Up @@ -288,6 +289,15 @@ def to_json(self, prune_empty: bool = False) -> str:
"""
return self.model_dump_json(exclude_unset=prune_empty, exclude_none=prune_empty)

def to_dict(self, prune_empty: bool = False) -> dict:
    """
    Return the PIIRecord as a plain dictionary.

    Serializes to the JSON string form first and parses it back, which
    guarantees every nested value in the result is JSON serializable.
    When ``prune_empty`` is True, None and unset values are dropped.
    """
    serialized = self.to_json(prune_empty=prune_empty)
    return json.loads(serialized)

def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
"""
Given a field name, return an iterator of all string values for that field.
Expand Down
7 changes: 3 additions & 4 deletions tests/unit/database/test_mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
This module contains the unit tests for the recordlinker.database.mpi_service module.
"""

import json
import uuid

import pytest
Expand Down Expand Up @@ -253,7 +252,7 @@ def test_no_person(self, session):
patients = mpi_service.bulk_insert_patients(session, [rec], external_person_id="123456")
assert len(patients) == 1
assert patients[0].person_id is None
assert json.loads(patients[0].data) == {
assert patients[0].data == {
"name": [{"given": ["Johnathon"], "family": "Smith"}]
}
assert patients[0].external_person_id == "123456"
Expand All @@ -280,11 +279,11 @@ def test_with_person(self, session):
assert len(patients) == 2
assert patients[0].person_id == person.id
assert patients[1].person_id == person.id
assert json.loads(patients[0].data) == {
assert patients[0].data == {
"birth_date": "1950-01-01",
"name": [{"given": ["George"], "family": "Harrison"}],
}
assert json.loads(patients[1].data) == {
assert patients[1].data == {
"birth_date": "1950-01-01",
"name": [{"given": ["George", "Harold"], "family": "Harrison"}],
}
Expand Down
28 changes: 27 additions & 1 deletion tests/unit/routes/test_seed_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
This module contains the unit tests for the recordlinker.routes.seed_router module.
"""

import unittest.mock as mock

from conftest import load_test_json_asset

from recordlinker import models
Expand All @@ -20,7 +22,10 @@ def test_too_many_clusters(self, client):
data = {"clusters": [{"records": []} for _ in range(101)]}
response = client.post("/seed", json=data)
assert response.status_code == 422
assert response.json()["detail"][0]["msg"] == "Value error, Clusters must not exceed 100 records"
assert (
response.json()["detail"][0]["msg"]
== "Value error, Clusters must not exceed 100 records"
)

def test_large_batch(self, client):
data = load_test_json_asset("seed_test.json.gz")
Expand All @@ -35,6 +40,27 @@ def test_large_batch(self, client):
assert client.session.query(models.Patient).count() == 1285
assert client.session.query(models.BlockingValue).count() == 8995

@mock.patch("recordlinker.database.algorithm_service.default_algorithm")
def test_seed_and_link(self, mock_algorithm, basic_algorithm, client):
    """Seed a single-record cluster, then verify the same record links."""
    # /link resolves the default algorithm; pin it to the basic one.
    mock_algorithm.return_value = basic_algorithm
    payload = {
        "birth_date": "1956-09-06",
        "sex": "F",
        "address": [{"line": ["581 Baker Club"], "postal_code": "80373"}],
        "name": [
            {
                "family": "Cervantes",
                "given": ["Jason"],
            }
        ],
    }
    seeded = client.post("/seed", json={"clusters": [{"records": [payload]}]})
    assert seeded.status_code == 201
    assert len(seeded.json()["persons"]) == 1
    linked = client.post("/link", json={"record": payload})
    assert linked.status_code == 200


class TestReset:
def test_reset(self, client):
Expand Down

0 comments on commit 3182485

Please sign in to comment.