Skip to content

Commit

Permalink
fix misleading matcher names (#134)
Browse files Browse the repository at this point in the history
## Description
Renaming `Feature.TELEPHONE` to `Feature.TELECOM`, as that more
accurately describes the matching behavior. Adding` Feature.GIVEN_NAME`
and updating `Feature.FIRST_NAME` to only use the first element of a
given name.

## Related Issues
closes #128
closes #129 

## Additional Notes
- TELEPHONE was misleading because the telecom field could collect
emails, fax numbers and other contact points, that would be included in
the matches. Renaming this to TELECOM better explains to the user what
we're matching on.
- FIRST_NAME was misleading, as it was including the middle names in its
matches as well.
  • Loading branch information
ericbuckley authored Nov 18, 2024
1 parent 8efe75c commit 7bd7b09
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 25 deletions.
16 changes: 10 additions & 6 deletions docs/site/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,14 @@ linkage evaluation phase. The following features are supported:

: The patient's race in the format of "AMERICAN_INDIAN", "ASIAN", "BLACK", "HAWAIIAN", "WHITE", "OTHER", "ASKED_UNKNOWN" or "UNKNOWN".

`FIRST_NAME`
`GIVEN_NAME`

: The patient's given name, this includes first and middle names.

`FIRST_NAME`

: The patient's first name.

`LAST_NAME`

: The patient's last name.
Expand Down Expand Up @@ -67,9 +71,9 @@ linkage evaluation phase. The following features are supported:

: The patient's county.

`TELEPHONE`
`TELECOM`

: The patient's telephone number.
: The patient's phone, email, fax, or other contact information.

`DRIVERS_LICENSE`

Expand Down Expand Up @@ -134,10 +138,10 @@ These are the functions that can be used to compare the values of two features t
if they are a match or not.

**Note**: When most features are compared, we are doing a 1 to 1 comparison (e.g. "M" == "M").
However, some features have the ability to have multiple values (e.g. `FIRST_NAME`), thus feature
However, some features have the ability to have multiple values (e.g. `GIVEN_NAME`), thus feature
matching is designed to compare one list of values to another list of values. For example, an
incoming record could have a FIRST_NAME of ["John", "Dean"] and we could be comparing them to an
existing Patient with the FIRST_NAME of ["John", "D"].
incoming record could have a GIVEN_NAME of ["John", "Dean"] and we could be comparing them to an
existing Patient with the GIVEN_NAME of ["John", "D"].

`func:recordlinker.linking.matchers.feature_match_any`

Expand Down
13 changes: 10 additions & 3 deletions src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class Feature(enum.Enum):
BIRTHDATE = "BIRTHDATE"
MRN = "MRN"
SEX = "SEX"
GIVEN_NAME = "GIVEN_NAME"
FIRST_NAME = "FIRST_NAME"
LAST_NAME = "LAST_NAME"
ADDRESS = "ADDRESS"
Expand All @@ -26,7 +27,7 @@ class Feature(enum.Enum):
SSN = "SSN"
RACE = "RACE"
GENDER = "GENDER"
TELEPHONE = "TELEPHONE"
TELECOM = "TELECOM"
SUFFIX = "SUFFIX"
COUNTY = "COUNTY"
DRIVERS_LICENSE = "DRIVERS_LICENSE"
Expand Down Expand Up @@ -323,11 +324,17 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
if address.postal_code:
# only use the first 5 digits for comparison
yield address.postal_code[:5]
elif feature == Feature.FIRST_NAME:
elif feature == Feature.GIVEN_NAME:
for name in self.name:
for given in name.given:
if given:
yield given
elif feature == Feature.FIRST_NAME:
for name in self.name:
# We only want the first given name for comparison
for given in name.given[0:1]:
if given:
yield given
elif feature == Feature.LAST_NAME:
for name in self.name:
if name.family:
Expand All @@ -341,7 +348,7 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
elif feature == Feature.GENDER:
if self.gender:
yield str(self.gender)
elif feature == Feature.TELEPHONE:
elif feature == Feature.TELECOM:
for telecom in self.telecom:
if telecom.value:
yield telecom.value
Expand Down
16 changes: 8 additions & 8 deletions tests/unit/database/test_mpi_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_patient(self, session):
)
mpi_service.insert_blocking_values(session, [pat])
values = pat.blocking_values
assert len(values) == 4
assert len(values) == 3
for val in values:
assert values[0].patient_id == pat.id
if val.blockingkey == models.BlockingKey.BIRTHDATE.id:
Expand Down Expand Up @@ -90,7 +90,7 @@ def test_multiple_patients(self, session):
},
)
mpi_service.insert_blocking_values(session, [pat1, pat2])
assert len(pat1.blocking_values) == 4
assert len(pat1.blocking_values) == 3
assert len(pat2.blocking_values) == 3

def test_with_mismatched_records(self, session):
Expand All @@ -117,9 +117,9 @@ def test_with_records(self, session):
rec = schemas.PIIRecord(**pat.data)
mpi_service.insert_blocking_values(session, [pat], [rec])
values = pat.blocking_values
assert len(values) == 4
assert len(values) == 3
assert set(v.patient_id for v in values) == {pat.id}
assert set(v.value for v in values) == {"1980-01-01", "John", "Bill", "Smit"}
assert set(v.value for v in values) == {"1980-01-01", "John", "Smit"}


class TestInsertPatient:
Expand Down Expand Up @@ -151,7 +151,8 @@ def test_no_person(self, session):
]
assert patient.external_person_id is None
assert patient.external_person_source is None
assert len(patient.blocking_values) == 4
assert patient.person_id is None
assert len(patient.blocking_values) == 3

def test_no_person_with_external_id(self, session):
data = {
Expand Down Expand Up @@ -291,11 +292,10 @@ def test_with_person(self, session):
assert patients[1].external_person_id == "123456"
assert len(patients[0].blocking_values) == 3
assert set(v.value for v in patients[0].blocking_values) == {"1950-01-01", "Geor", "Harr"}
assert len(patients[1].blocking_values) == 4
assert len(patients[1].blocking_values) == 3
assert set(v.value for v in patients[1].blocking_values) == {
"1950-01-01",
"Geor",
"Haro",
"Harr",
}

Expand Down Expand Up @@ -701,7 +701,7 @@ def test(self, session):
mpi_service.insert_patient(session, schemas.PIIRecord(**data), person=models.Person())
assert session.query(models.Patient).count() == 1
assert session.query(models.Person).count() == 1
assert session.query(models.BlockingValue).count() == 4
assert session.query(models.BlockingValue).count() == 3
mpi_service.reset_mpi(session)
assert session.query(models.Patient).count() == 0
assert session.query(models.Person).count() == 0
Expand Down
9 changes: 7 additions & 2 deletions tests/unit/linking/test_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_get_fuzzy_params():

def test_feature_match_any():
record = schemas.PIIRecord(
name=[{"given": ["John"], "family": "Smith"}, {"family": "Harrison"}],
name=[{"given": ["John", "Michael"], "family": "Smith"}, {"family": "Harrison"}],
birthDate="Jan 1 1980",
)
pat1 = models.Patient(
Expand All @@ -66,16 +66,19 @@ def test_feature_match_any():
pat2 = models.Patient(data={"name": [{"given": ["Michael"], "family": "Smith"}], "sex": "male"})
pat3 = models.Patient(data={"name": [{"family": "Smith"}, {"family": "Williams"}]})

assert matchers.feature_match_any(record, pat1, schemas.Feature.GIVEN_NAME)
assert matchers.feature_match_any(record, pat1, schemas.Feature.FIRST_NAME)
assert not matchers.feature_match_any(record, pat1, schemas.Feature.LAST_NAME)
assert matchers.feature_match_any(record, pat1, schemas.Feature.BIRTHDATE)
assert not matchers.feature_match_any(record, pat1, schemas.Feature.ZIP)

assert matchers.feature_match_any(record, pat2, schemas.Feature.GIVEN_NAME)
assert not matchers.feature_match_any(record, pat2, schemas.Feature.FIRST_NAME)
assert matchers.feature_match_any(record, pat2, schemas.Feature.LAST_NAME)
assert not matchers.feature_match_any(record, pat2, schemas.Feature.SEX)
assert not matchers.feature_match_any(record, pat1, schemas.Feature.ZIP)

assert not matchers.feature_match_any(record, pat3, schemas.Feature.GIVEN_NAME)
assert not matchers.feature_match_any(record, pat3, schemas.Feature.FIRST_NAME)
assert matchers.feature_match_any(record, pat3, schemas.Feature.LAST_NAME)
assert not matchers.feature_match_any(record, pat3, schemas.Feature.BIRTHDATE)
Expand All @@ -101,11 +104,13 @@ def test_feature_match_exact():
)
pat3 = models.Patient(data={"name": [{"family": "Smith"}, {"family": "Harrison"}]})

assert not matchers.feature_match_exact(record, pat1, schemas.Feature.FIRST_NAME)
assert not matchers.feature_match_exact(record, pat1, schemas.Feature.GIVEN_NAME)
assert matchers.feature_match_exact(record, pat1, schemas.Feature.FIRST_NAME)
assert not matchers.feature_match_exact(record, pat1, schemas.Feature.LAST_NAME)
assert matchers.feature_match_exact(record, pat1, schemas.Feature.BIRTHDATE)
assert not matchers.feature_match_exact(record, pat1, schemas.Feature.ZIP)

assert matchers.feature_match_exact(record, pat2, schemas.Feature.GIVEN_NAME)
assert matchers.feature_match_exact(record, pat2, schemas.Feature.FIRST_NAME)
assert not matchers.feature_match_exact(record, pat2, schemas.Feature.LAST_NAME)
assert not matchers.feature_match_exact(record, pat2, schemas.Feature.SEX)
Expand Down
18 changes: 12 additions & 6 deletions tests/unit/schemas/test_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ def test_feature_iter(self):
],
telecom=[
pii.Telecom(value="555-123-4567"),
pii.Telecom(value="555-987-6543"),
pii.Telecom(value="555-987-6543", system="phone"),
pii.Telecom(value="[email protected]", system="email"),
],
drivers_license=pii.DriversLicense(value="D1234567", authority="VA"),
)
Expand All @@ -248,12 +249,17 @@ def test_feature_iter(self):
assert list(record.feature_iter(pii.Feature.CITY)) == ["Anytown", "Somecity"]
assert list(record.feature_iter(pii.Feature.STATE)) == ["NY", "CA"]
assert list(record.feature_iter(pii.Feature.ZIP)) == ["12345", "98765"]
assert list(record.feature_iter(pii.Feature.FIRST_NAME)) == ["John", "L", "Jane"]
assert list(record.feature_iter(pii.Feature.GIVEN_NAME)) == ["John", "L", "Jane"]
assert list(record.feature_iter(pii.Feature.FIRST_NAME)) == ["John", "Jane"]
assert list(record.feature_iter(pii.Feature.LAST_NAME)) == ["Doe", "Smith"]
assert list(record.feature_iter(pii.Feature.SSN)) == ["123-45-6789"]
assert list(record.feature_iter(pii.Feature.RACE)) == ["UNKNOWN"]
assert list(record.feature_iter(pii.Feature.GENDER)) == ["UNKNOWN"]
assert list(record.feature_iter(pii.Feature.TELEPHONE)) == ["555-123-4567", "555-987-6543"]
assert list(record.feature_iter(pii.Feature.TELECOM)) == [
"555-123-4567",
"555-987-6543",
"[email protected]",
]
assert list(record.feature_iter(pii.Feature.SUFFIX)) == ["suffix", "suffix2"]
assert list(record.feature_iter(pii.Feature.COUNTY)) == ["county"]
assert list(record.feature_iter(pii.Feature.DRIVERS_LICENSE)) == ["D1234567|VA"]
Expand Down Expand Up @@ -329,7 +335,7 @@ def test_blocking_keys_first_name_first_four(self):
rec = pii.PIIRecord(**{"name": [{"given": [""], "family": "Doe"}]})
assert rec.blocking_keys(BlockingKey.FIRST_NAME) == set()
rec = pii.PIIRecord(**{"name": [{"given": ["John", "Jane"], "family": "Doe"}]})
assert rec.blocking_keys(BlockingKey.FIRST_NAME) == {"John", "Jane"}
assert rec.blocking_keys(BlockingKey.FIRST_NAME) == {"John"}
rec = pii.PIIRecord(
**{
"name": [
Expand All @@ -338,7 +344,7 @@ def test_blocking_keys_first_name_first_four(self):
]
}
)
assert rec.blocking_keys(BlockingKey.FIRST_NAME) == {"Jane", "John"}
assert rec.blocking_keys(BlockingKey.FIRST_NAME) == {"Jane"}

def test_blocking_keys_last_name_first_four(self):
rec = pii.PIIRecord(**{"last_name": "Doe"})
Expand Down Expand Up @@ -375,7 +381,7 @@ def test_blocking_values(self):
elif key == BlockingKey.MRN:
assert val == "3456"
elif key == BlockingKey.FIRST_NAME:
assert val in ("John", "Will")
assert val == "John"
elif key == BlockingKey.LAST_NAME:
assert val == "Doe"
else:
Expand Down

0 comments on commit 7bd7b09

Please sign in to comment.