Skip to content

Commit

Permalink
bugfix: bulk insert serialization of data (#136)
Browse files Browse the repository at this point in the history
## Description
Data can be inserted when linking or seeding. In the former we were
inserting the data as a dictionary into the Patient.data column, whereas
in the latter we inserted it as a string. This caused issues on
subsequent reads of the data.

The call to convert the PIIRecord into a value for the database has been
standardized in PIIRecord.to_dict(), and the bulk insert method (which
is used in the seeding process) has been updated to use this method.
  • Loading branch information
ericbuckley authored Nov 19, 2024
1 parent 7bd7b09 commit 3182485
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/recordlinker/database/mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def bulk_insert_patients(
pat_data = [
{
"person_id": person and person.id,
"_data": record.to_json(prune_empty=True),
"_data": record.to_dict(prune_empty=True),
"external_patient_id": record.external_id,
"external_person_id": external_person_id,
"external_person_source": "IRIS" if external_person_id else None,
Expand Down
5 changes: 1 addition & 4 deletions src/recordlinker/models/mpi.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import enum
import json
import uuid

from sqlalchemy import orm
Expand Down Expand Up @@ -96,12 +95,10 @@ def record(self, value):
from recordlinker.schemas import pii

assert isinstance(value, pii.PIIRecord), "Expected a PIIRecord object"
# convert the data to a JSON string, then load it back as a dictionary
# this is necessary to ensure all data elements are JSON serializable
# recursively remove all None and unset values from the data
# this is an optimization to reduce the amount of data stored in the
# database, if a value is empty, no need to store it
self._data = json.loads(value.to_json(prune_empty=True))
self._data = value.to_dict(prune_empty=True)
if hasattr(self, "_record"):
# if the record property is cached, delete it
del self._record
Expand Down
10 changes: 10 additions & 0 deletions src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import enum
import json
import re
import typing

Expand Down Expand Up @@ -288,6 +289,15 @@ def to_json(self, prune_empty: bool = False) -> str:
"""
return self.model_dump_json(exclude_unset=prune_empty, exclude_none=prune_empty)

def to_dict(self, prune_empty: bool = False) -> dict:
    """
    Return the PIIRecord as a plain dictionary.

    Serializes to the JSON string form first and parses it back, which
    guarantees every nested value in the result is JSON serializable.
    When ``prune_empty`` is True, None and unset values are dropped.
    """
    serialized = self.to_json(prune_empty=prune_empty)
    return json.loads(serialized)

def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
"""
Given a field name, return an iterator of all string values for that field.
Expand Down
7 changes: 3 additions & 4 deletions tests/unit/database/test_mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
This module contains the unit tests for the recordlinker.database.mpi_service module.
"""

import json
import uuid

import pytest
Expand Down Expand Up @@ -253,7 +252,7 @@ def test_no_person(self, session):
patients = mpi_service.bulk_insert_patients(session, [rec], external_person_id="123456")
assert len(patients) == 1
assert patients[0].person_id is None
assert json.loads(patients[0].data) == {
assert patients[0].data == {
"name": [{"given": ["Johnathon"], "family": "Smith"}]
}
assert patients[0].external_person_id == "123456"
Expand All @@ -280,11 +279,11 @@ def test_with_person(self, session):
assert len(patients) == 2
assert patients[0].person_id == person.id
assert patients[1].person_id == person.id
assert json.loads(patients[0].data) == {
assert patients[0].data == {
"birth_date": "1950-01-01",
"name": [{"given": ["George"], "family": "Harrison"}],
}
assert json.loads(patients[1].data) == {
assert patients[1].data == {
"birth_date": "1950-01-01",
"name": [{"given": ["George", "Harold"], "family": "Harrison"}],
}
Expand Down
28 changes: 27 additions & 1 deletion tests/unit/routes/test_seed_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
This module contains the unit tests for the recordlinker.routes.seed_router module.
"""

import unittest.mock as mock

from conftest import load_test_json_asset

from recordlinker import models
Expand All @@ -20,7 +22,10 @@ def test_too_many_clusters(self, client):
data = {"clusters": [{"records": []} for _ in range(101)]}
response = client.post("/seed", json=data)
assert response.status_code == 422
assert response.json()["detail"][0]["msg"] == "Value error, Clusters must not exceed 100 records"
assert (
response.json()["detail"][0]["msg"]
== "Value error, Clusters must not exceed 100 records"
)

def test_large_batch(self, client):
data = load_test_json_asset("seed_test.json.gz")
Expand All @@ -35,6 +40,27 @@ def test_large_batch(self, client):
assert client.session.query(models.Patient).count() == 1285
assert client.session.query(models.BlockingValue).count() == 8995

@mock.patch("recordlinker.database.algorithm_service.default_algorithm")
def test_seed_and_link(self, mock_algorithm, basic_algorithm, client):
    """Seed a single-record cluster, then verify the same record links."""
    # /link resolves the default algorithm; pin it to the basic one.
    mock_algorithm.return_value = basic_algorithm
    payload = {
        "birth_date": "1956-09-06",
        "sex": "F",
        "address": [{"line": ["581 Baker Club"], "postal_code": "80373"}],
        "name": [
            {
                "family": "Cervantes",
                "given": ["Jason"],
            }
        ],
    }
    seeded = client.post("/seed", json={"clusters": [{"records": [payload]}]})
    assert seeded.status_code == 201
    assert len(seeded.json()["persons"]) == 1
    linked = client.post("/link", json={"record": payload})
    assert linked.status_code == 200


class TestReset:
def test_reset(self, client):
Expand Down

0 comments on commit 3182485

Please sign in to comment.