From 31824851678a1869351f1020272cea318a6ba0ab Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Tue, 19 Nov 2024 12:51:37 -0800 Subject: [PATCH] bugfix: bulk insert serialization of data (#136) ## Description Data can be inserted when linking or seeding, in the former we were inserting the data as a dictionary into the Patient.data column, however in the latter we inserted the data as a string. This causes issues on subsequent reads when using the data. The call to convert the PIIRecord into a value for the database has been standardized in PIIRecord.to_dict(), and the bulk insert method (which is used in the seeding process) has been updated to use this method. --- src/recordlinker/database/mpi_service.py | 2 +- src/recordlinker/models/mpi.py | 5 +---- src/recordlinker/schemas/pii.py | 10 +++++++++ tests/unit/database/test_mpi_service.py | 7 +++--- tests/unit/routes/test_seed_router.py | 28 +++++++++++++++++++++++- 5 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/recordlinker/database/mpi_service.py b/src/recordlinker/database/mpi_service.py index 863431e1..c28bb3c9 100644 --- a/src/recordlinker/database/mpi_service.py +++ b/src/recordlinker/database/mpi_service.py @@ -132,7 +132,7 @@ def bulk_insert_patients( pat_data = [ { "person_id": person and person.id, - "_data": record.to_json(prune_empty=True), + "_data": record.to_dict(prune_empty=True), "external_patient_id": record.external_id, "external_person_id": external_person_id, "external_person_source": "IRIS" if external_person_id else None, diff --git a/src/recordlinker/models/mpi.py b/src/recordlinker/models/mpi.py index 8b0d4460..f8d81113 100644 --- a/src/recordlinker/models/mpi.py +++ b/src/recordlinker/models/mpi.py @@ -1,5 +1,4 @@ import enum -import json import uuid from sqlalchemy import orm @@ -96,12 +95,10 @@ def record(self, value): from recordlinker.schemas import pii assert isinstance(value, pii.PIIRecord), "Expected a PIIRecord object" - # convert the data to a JSON string, then load it back as a dictionary - # this is necessary to ensure all data elements are JSON serializable # recursively remove all None and unset values from the data # this is an optimization to reduce the amount of data stored in the # database, if a value is empty, no need to store it - self._data = json.loads(value.to_json(prune_empty=True)) + self._data = value.to_dict(prune_empty=True) if hasattr(self, "_record"): # if the record property is cached, delete it del self._record diff --git a/src/recordlinker/schemas/pii.py b/src/recordlinker/schemas/pii.py index 47dab707..8df7aea3 100644 --- a/src/recordlinker/schemas/pii.py +++ b/src/recordlinker/schemas/pii.py @@ -1,5 +1,6 @@ import datetime import enum +import json import re import typing @@ -288,6 +289,15 @@ def to_json(self, prune_empty: bool = False) -> str: """ return self.model_dump_json(exclude_unset=prune_empty, exclude_none=prune_empty) + def to_dict(self, prune_empty: bool = False) -> dict: + """ + Convert the PIIRecord object to a dictionary. + """ + # convert the data to a JSON string, then load it back as a dictionary + # this is necessary to ensure all data elements are JSON serializable + data = self.to_json(prune_empty=prune_empty) + return json.loads(data) + def feature_iter(self, feature: Feature) -> typing.Iterator[str]: """ Given a field name, return an iterator of all string values for that field. diff --git a/tests/unit/database/test_mpi_service.py b/tests/unit/database/test_mpi_service.py index 11568792..cf91f6fe 100644 --- a/tests/unit/database/test_mpi_service.py +++ b/tests/unit/database/test_mpi_service.py @@ -5,7 +5,6 @@ This module contains the unit tests for the recordlinker.database.mpi_service module. """ -import json import uuid import pytest @@ -253,7 +252,7 @@ def test_no_person(self, session): patients = mpi_service.bulk_insert_patients(session, [rec], external_person_id="123456") assert len(patients) == 1 assert patients[0].person_id is None - assert json.loads(patients[0].data) == { + assert patients[0].data == { "name": [{"given": ["Johnathon"], "family": "Smith"}] } assert patients[0].external_person_id == "123456" @@ -280,11 +279,11 @@ def test_with_person(self, session): assert len(patients) == 2 assert patients[0].person_id == person.id assert patients[1].person_id == person.id - assert json.loads(patients[0].data) == { + assert patients[0].data == { "birth_date": "1950-01-01", "name": [{"given": ["George"], "family": "Harrison"}], } - assert json.loads(patients[1].data) == { + assert patients[1].data == { "birth_date": "1950-01-01", "name": [{"given": ["George", "Harold"], "family": "Harrison"}], } diff --git a/tests/unit/routes/test_seed_router.py b/tests/unit/routes/test_seed_router.py index ef394561..88741c85 100644 --- a/tests/unit/routes/test_seed_router.py +++ b/tests/unit/routes/test_seed_router.py @@ -5,6 +5,8 @@ This module contains the unit tests for the recordlinker.routes.seed_router module. """ +import unittest.mock as mock + from conftest import load_test_json_asset from recordlinker import models @@ -20,7 +22,10 @@ def test_too_many_clusters(self, client): data = {"clusters": [{"records": []} for _ in range(101)]} response = client.post("/seed", json=data) assert response.status_code == 422 - assert response.json()["detail"][0]["msg"] == "Value error, Clusters must not exceed 100 records" + assert ( + response.json()["detail"][0]["msg"] + == "Value error, Clusters must not exceed 100 records" + ) def test_large_batch(self, client): data = load_test_json_asset("seed_test.json.gz") @@ -35,6 +40,27 @@ def test_large_batch(self, client): assert client.session.query(models.Patient).count() == 1285 assert client.session.query(models.BlockingValue).count() == 8995 + @mock.patch("recordlinker.database.algorithm_service.default_algorithm") + def test_seed_and_link(self, mock_algorithm, basic_algorithm, client): + mock_algorithm.return_value = basic_algorithm + record = { + "birth_date": "1956-09-06", + "sex": "F", + "address": [{"line": ["581 Baker Club"], "postal_code": "80373"}], + "name": [ + { + "family": "Cervantes", + "given": ["Jason"], + } + ], + } + seed_resp = client.post("/seed", json={"clusters": [{"records": [record]}]}) + assert seed_resp.status_code == 201 + persons = seed_resp.json()["persons"] + assert len(persons) == 1 + response = client.post("/link", json={"record": record}) + assert response.status_code == 200 + class TestReset: def test_reset(self, client):