Skip to content

Commit 2533adb

Browse files
Expand Feature Comparison Support (#88)
## Description NBS has requested that we add additional fields for feature comparisons. We won't need to block on any of these features, but we do need to have the ability to compare them in the second half of the algorithm. ## Related Issues closes #55 ## Additional Notes - [x] Add the following values to `recordlinker.models.pii.PIIRecord` - [x] SSN (should validate that it's 9 digits and conforms to XXX-XX-XXXX format) - [x] Race (need an Enum with the [8 races identified here](https://www.hl7.org/fhir/us/core/ValueSet-omb-race-category.html)) - [x] Gender (need an Enum with the [5 genders identified here](https://terminology.hl7.org/5.5.0/ValueSet-gender-identity.html)) - [x] Add the above 3 to `recordlinker.models.pii.Feature`, and additionally add 3 more - [x] Telephone - [x] Suffix - [x] County - [x] Rename `recordlinker.models.pii.PIIRecord.field_iter` to `recordlinker.models.pii.PIIRecord.feature_iter` - [x] Update `recordlinker.models.pii.PIIRecord.feature_iter` to extract the 6 features above - [x] Update `recordlinker.linking.fhir_record_to_pii_record` to extract the 6 features from a FHIR payload - [x] Add documentation to the `docs/` directory to explain the process of adding a new Feature. HL7 FHIR resources: - https://build.fhir.org/patient-examples-general.json.html - https://hl7.org/fhir/R5/patient.html - https://hl7.org/fhir/R5/datatypes-examples.html#HumanName - https://www.hl7.org/fhir/us/core/StructureDefinition-us-core-race.html - https://www.hl7.org/fhir/us/core/Patient-example.json.html
1 parent 735fe58 commit 2533adb

File tree

6 files changed

+413
-37
lines changed

6 files changed

+413
-37
lines changed

docs/process_for_adding_feature.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Adding a New Feature
2+
3+
### Add the New Feature to the `Feature` Class
4+
- In [src/recordlinker/schemas/pii.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py), add the new feature to the [Feature](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L12C7-L12C14) enum class.
5+
6+
### Update the `PIIRecord` Schema
7+
- In the same file, modify the [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97) class to include the new feature as a field.
8+
- If the feature requires predefined values, create an enum to represent those values.
9+
10+
### Modify the `PIIRecord.feature_iter` Method
11+
- Update the [PIIRecord.feature_iter](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L246) method to return the value of the new feature when it's used for comparison.
12+
13+
### Extract the FHIR Field in `fhir_record_to_pii_record`
14+
- In [src/recordlinker/linking/link.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py), update the [fhir_record_to_pii_record](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py#L26) function to map the relevant FHIR field to the new feature in [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97).
15+
16+
### Update the Tests
17+
- Add or modify unit tests to verify that the new feature is properly extracted, mapped, and compared.

src/recordlinker/linking/link.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,38 @@ def fhir_record_to_pii_record(fhir_record: dict) -> schemas.PIIRecord:
4141
"birthDate": fhir_record.get("birthDate"),
4242
"sex": fhir_record.get("gender"),
4343
"address": fhir_record.get("address", []),
44-
"phone": fhir_record.get("telecom", []),
4544
"mrn": None,
45+
"ssn": None,
46+
"race": None,
47+
"gender": None,
48+
"telecom": fhir_record.get("telecom", []),
4649
}
4750
for identifier in fhir_record.get("identifier", []):
4851
for coding in identifier.get("type", {}).get("coding", []):
4952
if coding.get("code") == "MR":
5053
val["mrn"] = identifier.get("value")
54+
elif coding.get("code") == "SS":
55+
val["ssn"] = identifier.get("value")
5156
for address in val["address"]:
57+
address["county"] = address.get("district", "")
5258
for extension in address.get("extension", []):
5359
if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
5460
for coord in extension.get("extension", []):
5561
if coord.get("url") == "latitude":
5662
address["latitude"] = coord.get("valueDecimal")
5763
elif coord.get("url") == "longitude":
5864
address["longitude"] = coord.get("valueDecimal")
65+
for extension in fhir_record.get("extension", []):
66+
if extension.get("url") == "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race":
67+
for ext in extension.get("extension", []):
68+
if ext.get("url") == "ombCategory":
69+
val["race"] = ext.get("valueCoding", {}).get("display")
70+
if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/individual-genderIdentity":
71+
for ext in extension.get("extension", []):
72+
if ext.get("url") == "value":
73+
for coding in ext.get("valueCodeableConcept", {}).get("coding", []):
74+
val["gender"] = coding.get("display")
75+
5976
return schemas.PIIRecord(**val)
6077

6178

src/recordlinker/linking/matchers.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,10 @@ def feature_match_any(
9595
:param key: The name of the column being evaluated (e.g. "city").
9696
:return: A float indicating whether the features are an exact match.
9797
"""
98-
rec_values = set(record.field_iter(key))
98+
rec_values = set(record.feature_iter(key))
9999
if not rec_values:
100100
return 0
101-
pat_values = set(patient.record.field_iter(key))
101+
pat_values = set(patient.record.feature_iter(key))
102102
return float(bool(rec_values & pat_values))
103103

104104

@@ -114,10 +114,10 @@ def feature_match_exact(
114114
:param key: The name of the column being evaluated (e.g. "city").
115115
:return: A float indicating whether the features are an exact match.
116116
"""
117-
rec_values = set(record.field_iter(key))
117+
rec_values = set(record.feature_iter(key))
118118
if not rec_values:
119119
return 0
120-
pat_values = set(patient.record.field_iter(key))
120+
pat_values = set(patient.record.feature_iter(key))
121121
return float(rec_values == pat_values)
122122

123123

@@ -138,8 +138,8 @@ def feature_match_fuzzy_string(
138138
"""
139139
similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
140140
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
141-
for x in record.field_iter(key):
142-
for y in patient.record.field_iter(key):
141+
for x in record.feature_iter(key):
142+
for y in patient.record.feature_iter(key):
143143
score = comp_func(x, y)
144144
if score >= threshold:
145145
return 1
@@ -168,8 +168,8 @@ def feature_match_log_odds_fuzzy_compare(
168168
similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
169169
comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
170170
max_score = 0.0
171-
for x in patient.record.field_iter(key):
172-
for y in record.field_iter(key):
171+
for x in patient.record.feature_iter(key):
172+
for y in record.feature_iter(key):
173173
# for each permutation of values, find the score and record it if its
174174
# larger than any previous score
175175
max_score = max(comp_func(x, y), max_score)

src/recordlinker/schemas/pii.py

Lines changed: 142 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import datetime
22
import enum
3+
import re
34
import typing
45

56
import dateutil.parser
@@ -22,6 +23,12 @@ class Feature(enum.Enum):
2223
CITY = "CITY"
2324
STATE = "STATE"
2425
ZIP = "ZIP"
26+
SSN = "SSN"
27+
RACE = "RACE"
28+
GENDER = "GENDER"
29+
TELEPHONE = "TELEPHONE"
30+
SUFFIX = "SUFFIX"
31+
COUNTY = "COUNTY"
2532

2633
def __str__(self):
2734
"""
@@ -45,6 +52,43 @@ def __str__(self):
4552
"""
4653
return self.value
4754

55+
class Race(enum.Enum):
    """
    Closed set of race categories a record can carry.

    Every member's value is identical to its name, so the string form of a
    member is simply that upper-case name.
    """

    AMERICAN_INDIAN = "AMERICAN_INDIAN"  # American Indian / Alaska Native
    ASIAN = "ASIAN"
    BLACK = "BLACK"  # Black / African American
    HAWAIIAN = "HAWAIIAN"  # Hawaiian / Pacific Islander
    WHITE = "WHITE"
    OTHER = "OTHER"
    ASKED_UNKNOWN = "ASKED_UNKNOWN"  # asked, but still unknown
    UNKNOWN = "UNKNOWN"

    def __str__(self) -> str:
        """
        Render the member as its plain string value.
        """
        return str(self.value)
74+
75+
class Gender(enum.Enum):
    """
    Closed set of gender identities a record can carry.

    Every member's value is identical to its name, so the string form of a
    member is simply that upper-case name.
    """

    FEMALE = "FEMALE"
    MALE = "MALE"
    NON_BINARY = "NON_BINARY"
    ASKED_DECLINED = "ASKED_DECLINED"  # asked, but declined to answer
    UNKNOWN = "UNKNOWN"

    def __str__(self) -> str:
        """
        Render the member as its plain string value.
        """
        return str(self.value)
91+
4892

4993
class Name(pydantic.BaseModel):
5094
"""
@@ -57,7 +101,7 @@ class Name(pydantic.BaseModel):
57101
given: typing.List[str] = []
58102
use: typing.Optional[str] = None
59103
prefix: typing.List[str] = [] # future use
60-
suffix: typing.List[str] = [] # future use
104+
suffix: typing.List[str] = []
61105

62106

63107
class Address(pydantic.BaseModel):
@@ -76,7 +120,7 @@ class Address(pydantic.BaseModel):
76120
"postal_code", "postalcode", "postalCode", "zip_code", "zipcode", "zipCode", "zip"
77121
),
78122
)
79-
county: typing.Optional[str] = None # future use
123+
county: typing.Optional[str] = None
80124
country: typing.Optional[str] = None
81125
latitude: typing.Optional[float] = None
82126
longitude: typing.Optional[float] = None
@@ -110,6 +154,9 @@ class PIIRecord(pydantic.BaseModel):
110154
address: typing.List[Address] = []
111155
name: typing.List[Name] = []
112156
telecom: typing.List[Telecom] = []
157+
ssn: typing.Optional[str] = None
158+
race: typing.Optional[Race] = None
159+
gender: typing.Optional[Gender] = None
113160

114161
@classmethod
115162
def model_construct(cls, _fields_set: set[str] | None = None, **values: typing.Any) -> typing.Self:
@@ -154,8 +201,71 @@ def parse_sex(cls, value):
154201
elif val in ["f", "female"]:
155202
return Sex.FEMALE
156203
return Sex.UNKNOWN
204+
205+
@pydantic.field_validator("ssn", mode="before")
def parse_ssn(cls, value):
    """
    Normalize a Social Security Number to the XXX-XX-XXXX format.

    Two spellings are accepted after surrounding whitespace is stripped: the
    dashed form (XXX-XX-XXXX) and nine bare digits. Only ASCII digits count;
    other Unicode digit characters (full-width digits, superscripts, ...) are
    rejected instead of being passed through as if they were a valid SSN.

    :param value: The raw SSN value; anything falsy means "not provided".
    :return: The SSN formatted as XXX-XX-XXXX, or None when the value is
        missing or does not look like an SSN.
    """
    if not value:
        return None

    val = str(value).strip()

    # Already in the canonical dashed form.
    if re.fullmatch(r"[0-9]{3}-[0-9]{2}-[0-9]{4}", val):
        return val

    # Nine bare digits: insert the dashes.
    if re.fullmatch(r"[0-9]{9}", val):
        return f"{val[:3]}-{val[3:5]}-{val[5:]}"

    return None
222+
223+
@pydantic.field_validator("race", mode="before")
def parse_race(cls, value):
    """
    Parse the race string into a Race enum.

    Matching is case-insensitive and keyword based. Underscores and hyphens
    are treated as spaces so that the enum's own values (e.g.
    "AMERICAN_INDIAN", "ASKED_UNKNOWN") map back to themselves, and a keyword
    must start on a word boundary so that, for example, "caucasian" is not
    mistaken for "asian".

    :param value: The raw race value; anything falsy means "not provided".
    :return: The matching Race member, Race.OTHER when no keyword matches,
        or None when no value was provided.
    """
    if not value:
        return None

    # Order matters: the "asked ... unknown" keywords must be tested before
    # the bare "unknown" keyword, which they contain.
    race_mapping = (
        (("american indian", "alaska native"), Race.AMERICAN_INDIAN),
        (("asian",), Race.ASIAN),
        (("black", "african american"), Race.BLACK),
        (("white",), Race.WHITE),
        (("hawaiian", "pacific islander"), Race.HAWAIIAN),
        (("asked unknown", "asked but unknown"), Race.ASKED_UNKNOWN),
        (("unknown",), Race.UNKNOWN),
    )

    # Collapse whitespace, underscores and hyphens into single spaces.
    val = re.sub(r"[\s_-]+", " ", str(value).lower()).strip()
    for keywords, race in race_mapping:
        if any(re.search(rf"\b{re.escape(keyword)}", val) for keyword in keywords):
            return race
    return Race.OTHER
245+
246+
247+
248+
@pydantic.field_validator("gender", mode="before")
def parse_gender(cls, value):
    """
    Parse the gender string into a Gender enum.

    The value is first compared, case-insensitively and with spaces/hyphens
    treated as underscores, against the enum's own values (so "non-binary",
    "Non Binary" and "NON_BINARY" all resolve to Gender.NON_BINARY). If that
    fails, keyword matching is used as a fallback.

    :param value: The raw gender value; anything falsy means "not provided".
    :return: The matching Gender member, Gender.UNKNOWN when nothing
        matches, or None when no value was provided.
    """
    if not value:
        return None

    val = str(value).lower().strip()

    # The enum values are upper-case, so the lookup must be done on an
    # upper-cased, underscore-separated token to ever succeed.
    try:
        return Gender(re.sub(r"[\s-]+", "_", val).upper())
    except ValueError:
        pass

    # "female" contains "male", so it has to be tested first.
    if "female" in val:
        return Gender.FEMALE
    if "male" in val:
        return Gender.MALE
    # Ignore separators so "non-binary" / "non binary" match as well.
    if "nonbinary" in re.sub(r"[\s_-]+", "", val):
        return Gender.NON_BINARY
    if "declined" in val or "asked" in val:
        return Gender.ASKED_DECLINED
    return Gender.UNKNOWN
267+
268+
def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
159269
"""
160270
Given a field name, return an iterator of all string values for that field.
161271
Empty strings are not included in the iterator.
@@ -200,6 +310,28 @@ def field_iter(self, feature: Feature) -> typing.Iterator[str]:
200310
for name in self.name:
201311
if name.family:
202312
yield name.family
313+
elif feature == Feature.SSN:
314+
if self.ssn:
315+
yield self.ssn
316+
elif feature == Feature.RACE:
317+
if self.race:
318+
yield str(self.race)
319+
elif feature == Feature.GENDER:
320+
if self.gender:
321+
yield str(self.gender)
322+
elif feature == Feature.TELEPHONE:
323+
for telecom in self.telecom:
324+
if telecom.value:
325+
yield telecom.value
326+
elif feature == Feature.SUFFIX:
327+
for name in self.name:
328+
for suffix in name.suffix:
329+
if suffix:
330+
yield suffix
331+
elif feature == Feature.COUNTY:
332+
for address in self.address:
333+
if address.county:
334+
yield address.county
203335

204336
def blocking_keys(self, key: models.BlockingKey) -> set[str]:
205337
"""
@@ -214,19 +346,19 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]:
214346

215347
if key == models.BlockingKey.BIRTHDATE:
216348
# NOTE: we could optimize here and remove the dashes from the date
217-
vals.update(self.field_iter(Feature.BIRTHDATE))
349+
vals.update(self.feature_iter(Feature.BIRTHDATE))
218350
elif key == models.BlockingKey.MRN:
219-
vals.update({x[-4:] for x in self.field_iter(Feature.MRN)})
351+
vals.update({x[-4:] for x in self.feature_iter(Feature.MRN)})
220352
elif key == models.BlockingKey.SEX:
221-
vals.update(self.field_iter(Feature.SEX))
353+
vals.update(self.feature_iter(Feature.SEX))
222354
elif key == models.BlockingKey.ZIP:
223-
vals.update(self.field_iter(Feature.ZIP))
355+
vals.update(self.feature_iter(Feature.ZIP))
224356
elif key == models.BlockingKey.FIRST_NAME:
225-
vals.update({x[:4] for x in self.field_iter(Feature.FIRST_NAME)})
357+
vals.update({x[:4] for x in self.feature_iter(Feature.FIRST_NAME)})
226358
elif key == models.BlockingKey.LAST_NAME:
227-
vals.update({x[:4] for x in self.field_iter(Feature.LAST_NAME)})
359+
vals.update({x[:4] for x in self.feature_iter(Feature.LAST_NAME)})
228360
elif key == models.BlockingKey.ADDRESS:
229-
vals.update({x[:4] for x in self.field_iter(Feature.ADDRESS)})
361+
vals.update({x[:4] for x in self.feature_iter(Feature.ADDRESS)})
230362

231363
# if any vals are longer than BLOCKING_VALUE_MAX_LENGTH, raise an error
232364
if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals):

0 commit comments

Comments
 (0)