Expand Feature Comparison Support (#88)

## Description
NBS has requested that we add additional fields for feature comparisons. We won't need to block on any of these features, but we do need to have the ability to compare them in the second half of the algorithm.

## Related Issues
closes #55

## Additional Notes
- [x] Add the following values to `recordlinker.models.pii.PIIRecord`
  - [x] SSN (should validate that it is 9 digits and conforms to the XXX-XX-XXXX format)
  - [x] Race (need an Enum with the [8 races identified here](https://www.hl7.org/fhir/us/core/ValueSet-omb-race-category.html))
  - [x] Gender (need an Enum with the [5 genders identified here](https://terminology.hl7.org/5.5.0/ValueSet-gender-identity.html))
- [x] Add the above 3 to `recordlinker.models.pii.Feature`, and additionally add 3 more
  - [x] Telephone
  - [x] Suffix
  - [x] County
- [x] Rename `recordlinker.models.pii.PIIRecord.field_iter` to `recordlinker.models.pii.PIIRecord.feature_iter`
- [x] Update `recordlinker.models.pii.PIIRecord.feature_iter` to extract the 6 features above
- [x] Update `recordlinker.linking.fhir_record_to_pii_record` to extract the 6 features from a FHIR payload
- [x] Add documentation to the `docs/` directory to explain the process of adding a new Feature

HL7 FHIR resources
- https://build.fhir.org/patient-examples-general.json.html
- https://hl7.org/fhir/R5/patient.html
- https://hl7.org/fhir/R5/datatypes-examples.html#HumanName
- https://www.hl7.org/fhir/us/core/StructureDefinition-us-core-race.html
- https://www.hl7.org/fhir/us/core/Patient-example.json.html
CDCgov · Oct 29, 2024 · 2533adb · 2533adb
1 parent 735fe58
commit 2533adb
Show file tree

Hide file tree

Showing 6 changed files with 413 additions and 37 deletions.
diff --git a/docs/process_for_adding_feature.md b/docs/process_for_adding_feature.md
@@ -0,0 +1,17 @@
+# Adding a New Feature
+
+### Add the New Feature to the `Feature` Class
+- In [src/recordlinker/schemas/pii.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py), add the new feature to the [Feature](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L12C7-L12C14) enum class.
+
+### Update the `PIIRecord` Schema
+- In the same file, modify the [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97) class to include the new feature as a field.
+- If the feature requires predefined values, create an enum to represent those values.
+
+### Modify the `PIIRecord.feature_iter` Method
+- Update the [PIIRecord.feature_iter](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/schemas/pii.py#L246) method to return the value of the new feature when it's used for comparison.
+
+### Extract the FHIR Field in `fhir_record_to_pii_record`
+- In [src/recordlinker/linking/link.py](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py), update the [fhir_record_to_pii_record](https://github.com/CDCgov/RecordLinker/blob/a672d2b6409cbd1a08f729d94fba5692f57f6fc6/src/recordlinker/linking/link.py#L26) function to map the relevant FHIR field to the new feature in [PIIRecord](https://github.com/CDCgov/RecordLinker/blob/c85f555e5da91d54eb8c51e3bdf0789d1e204b2f/src/recordlinker/schemas/pii.py#L97).
+
+### Update the Tests
+- Add or modify unit tests to verify that the new feature is properly extracted, mapped, and compared. 
diff --git a/src/recordlinker/linking/link.py b/src/recordlinker/linking/link.py
@@ -41,21 +41,38 @@ def fhir_record_to_pii_record(fhir_record: dict) -> schemas.PIIRecord:
         "birthDate": fhir_record.get("birthDate"),
         "sex": fhir_record.get("gender"),
         "address": fhir_record.get("address", []),
-        "phone": fhir_record.get("telecom", []),
         "mrn": None,
+        "ssn": None,
+        "race": None,
+        "gender": None,
+        "telecom": fhir_record.get("telecom", []),
     }
     for identifier in fhir_record.get("identifier", []):
         for coding in identifier.get("type", {}).get("coding", []):
             if coding.get("code") == "MR":
                 val["mrn"] = identifier.get("value")
+            elif coding.get("code") == "SS":
+                val["ssn"] = identifier.get("value")
     for address in val["address"]:
+        address["county"] = address.get("district", "")
         for extension in address.get("extension", []):
             if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
                 for coord in extension.get("extension", []):
                     if coord.get("url") == "latitude":
                         address["latitude"] = coord.get("valueDecimal")
                     elif coord.get("url") == "longitude":
                         address["longitude"] = coord.get("valueDecimal")
+    for extension in fhir_record.get("extension", []):
+        if extension.get("url") == "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race":
+            for ext in extension.get("extension", []):
+                if ext.get("url") == "ombCategory":
+                    val["race"] = ext.get("valueCoding", {}).get("display")
+        if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/individual-genderIdentity":
+            for ext in extension.get("extension", []):
+                if ext.get("url") == "value":
+                    for coding in ext.get("valueCodeableConcept", {}).get("coding", []):
+                        val["gender"] = coding.get("display")
+
     return schemas.PIIRecord(**val)
 
 

diff --git a/src/recordlinker/linking/matchers.py b/src/recordlinker/linking/matchers.py
@@ -95,10 +95,10 @@ def feature_match_any(
     :param key: The name of the column being evaluated (e.g. "city").
     :return: A float indicating whether the features are an exact match.
     """
-    rec_values = set(record.field_iter(key))
+    rec_values = set(record.feature_iter(key))
     if not rec_values:
         return 0
-    pat_values = set(patient.record.field_iter(key))
+    pat_values = set(patient.record.feature_iter(key))
     return float(bool(rec_values & pat_values))
 
 
@@ -114,10 +114,10 @@ def feature_match_exact(
     :param key: The name of the column being evaluated (e.g. "city").
     :return: A float indicating whether the features are an exact match.
     """
-    rec_values = set(record.field_iter(key))
+    rec_values = set(record.feature_iter(key))
     if not rec_values:
         return 0
-    pat_values = set(patient.record.field_iter(key))
+    pat_values = set(patient.record.feature_iter(key))
     return float(rec_values == pat_values)
 
 
@@ -138,8 +138,8 @@ def feature_match_fuzzy_string(
     """
     similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
     comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
-    for x in record.field_iter(key):
-        for y in patient.record.field_iter(key):
+    for x in record.feature_iter(key):
+        for y in patient.record.feature_iter(key):
             score = comp_func(x, y)
             if score >= threshold:
                 return 1
@@ -168,8 +168,8 @@ def feature_match_log_odds_fuzzy_compare(
     similarity_measure, threshold = _get_fuzzy_params(str(key), **kwargs)
     comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity
     max_score = 0.0
-    for x in patient.record.field_iter(key):
-        for y in record.field_iter(key):
+    for x in patient.record.feature_iter(key):
+        for y in record.feature_iter(key):
             # for each permutation of values, find the score and record it if its
             # larger than any previous score
             max_score = max(comp_func(x, y), max_score)

diff --git a/src/recordlinker/schemas/pii.py b/src/recordlinker/schemas/pii.py
@@ -1,5 +1,6 @@
 import datetime
 import enum
+import re
 import typing
 
 import dateutil.parser
@@ -22,6 +23,12 @@ class Feature(enum.Enum):
     CITY = "CITY"
     STATE = "STATE"
     ZIP = "ZIP"
+    SSN = "SSN"
+    RACE = "RACE"
+    GENDER = "GENDER"
+    TELEPHONE = "TELEPHONE"
+    SUFFIX = "SUFFIX"
+    COUNTY = "COUNTY"
 
     def __str__(self):
         """
@@ -45,6 +52,43 @@ def __str__(self):
         """
         return self.value
 
+class Race(enum.Enum):
+    """
+    Enum for the Race field.
+    """
+
+    AMERICAN_INDIAN = "AMERICAN_INDIAN"
+    ASIAN = "ASIAN"
+    BLACK = "BLACK"
+    HAWAIIAN = "HAWAIIAN"
+    WHITE = "WHITE"
+    OTHER = "OTHER"
+    ASKED_UNKNOWN = "ASKED_UNKNOWN"
+    UNKNOWN = "UNKNOWN"
+
+    def __str__(self):
+        """
+        Return the value of the enum as a string.
+        """
+        return self.value
+
+class Gender(enum.Enum):
+    """
+    Enum for the Gender field.
+    """
+
+    FEMALE = "FEMALE"
+    MALE = "MALE"
+    NON_BINARY = "NON_BINARY"
+    ASKED_DECLINED = "ASKED_DECLINED"
+    UNKNOWN = "UNKNOWN"
+
+    def __str__(self):
+        """
+        Return the value of the enum as a string.
+        """
+        return self.value
+
 
 class Name(pydantic.BaseModel):
     """
@@ -57,7 +101,7 @@ class Name(pydantic.BaseModel):
     given: typing.List[str] = []
     use: typing.Optional[str] = None
     prefix: typing.List[str] = []  # future use
-    suffix: typing.List[str] = []  # future use
+    suffix: typing.List[str] = []
 
 
 class Address(pydantic.BaseModel):
@@ -76,7 +120,7 @@ class Address(pydantic.BaseModel):
             "postal_code", "postalcode", "postalCode", "zip_code", "zipcode", "zipCode", "zip"
         ),
     )
-    county: typing.Optional[str] = None  # future use
+    county: typing.Optional[str] = None
     country: typing.Optional[str] = None
     latitude: typing.Optional[float] = None
     longitude: typing.Optional[float] = None
@@ -110,6 +154,9 @@ class PIIRecord(pydantic.BaseModel):
     address: typing.List[Address] = []
     name: typing.List[Name] = []
     telecom: typing.List[Telecom] = []
+    ssn: typing.Optional[str] = None
+    race: typing.Optional[Race] = None
+    gender: typing.Optional[Gender] = None
 
     @classmethod
     def model_construct(cls, _fields_set: set[str] | None = None, **values: typing.Any) -> typing.Self:
@@ -154,8 +201,71 @@ def parse_sex(cls, value):
             elif val in ["f", "female"]:
                 return Sex.FEMALE
             return Sex.UNKNOWN
+
+    @pydantic.field_validator("ssn", mode="before")
+    def parse_ssn(cls, value):
+        """
+        Parse the SSN string, normalizing it to XXX-XX-XXXX (None if it is not valid).
+        """
+        if value:
+            val = str(value).strip()
+
+            if re.match(r"^\d{3}-\d{2}-\d{4}$", val):
+                return val 
+
+            if len(val) != 9 or not val.isdigit():
+                return None
+
+            # Format back to the standard SSN format (XXX-XX-XXXX)
+            formatted_ssn = f"{val[:3]}-{val[3:5]}-{val[5:]}"
+            return formatted_ssn
+
+    @pydantic.field_validator("race", mode="before")
+    def parse_race(cls, value):
+        """
+        Parse the race string into a race enum
+        """
+
+        race_mapping = [
+        (["american indian", "alaska native"], Race.AMERICAN_INDIAN),
+        (["asian"], Race.ASIAN),
+        (["black", "african american"], Race.BLACK),
+        (["white"], Race.WHITE),
+        (["hawaiian", "pacific islander"], Race.HAWAIIAN),
+        (["asked unknown", "asked but unknown"], Race.ASKED_UNKNOWN),
+        (["unknown"], Race.UNKNOWN),
+    ]
 
-    def field_iter(self, feature: Feature) -> typing.Iterator[str]:
+        if value:
+            val = str(value).lower().strip()
+            for substrings, race in race_mapping:
+                if any(substring in val for substring in substrings):
+                    return race
+            return Race.OTHER
+
+
+
+    @pydantic.field_validator("gender", mode="before")
+    def parse_gender(cls, value):
+        """
+        Parse the gender string into a gender enum
+        """
+        if value:
+            val = str(value).lower().strip()
+            try:
+                return Gender(val)
+            except ValueError:
+                if "female" in val:
+                    return Gender.FEMALE
+                elif "male" in val:
+                    return Gender.MALE
+                elif "nonbinary" in val:
+                    return Gender.NON_BINARY
+                elif "declined" in val or "asked" in val:
+                    return Gender.ASKED_DECLINED
+                return Gender.UNKNOWN
+
+    def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
         """
         Given a field name, return an iterator of all string values for that field.
         Empty strings are not included in the iterator.
@@ -200,6 +310,28 @@ def field_iter(self, feature: Feature) -> typing.Iterator[str]:
             for name in self.name:
                 if name.family:
                     yield name.family
+        elif feature == Feature.SSN:
+            if self.ssn:
+                yield self.ssn
+        elif feature == Feature.RACE:
+            if self.race:
+                yield str(self.race)
+        elif feature == Feature.GENDER:
+            if self.gender:
+                yield str(self.gender)
+        elif feature == Feature.TELEPHONE:
+            for telecom in self.telecom:
+                if telecom.value:
+                    yield telecom.value
+        elif feature == Feature.SUFFIX:
+            for name in self.name:
+                for suffix in name.suffix:
+                    if suffix:
+                        yield suffix
+        elif feature == Feature.COUNTY:
+            for address in self.address:
+                if address.county:
+                    yield address.county
 
     def blocking_keys(self, key: models.BlockingKey) -> set[str]:
         """
@@ -214,19 +346,19 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]:
 
         if key == models.BlockingKey.BIRTHDATE:
             # NOTE: we could optimize here and remove the dashes from the date
-            vals.update(self.field_iter(Feature.BIRTHDATE))
+            vals.update(self.feature_iter(Feature.BIRTHDATE))
         elif key == models.BlockingKey.MRN:
-            vals.update({x[-4:] for x in self.field_iter(Feature.MRN)})
+            vals.update({x[-4:] for x in self.feature_iter(Feature.MRN)})
         elif key == models.BlockingKey.SEX:
-            vals.update(self.field_iter(Feature.SEX))
+            vals.update(self.feature_iter(Feature.SEX))
         elif key == models.BlockingKey.ZIP:
-            vals.update(self.field_iter(Feature.ZIP))
+            vals.update(self.feature_iter(Feature.ZIP))
         elif key == models.BlockingKey.FIRST_NAME:
-            vals.update({x[:4] for x in self.field_iter(Feature.FIRST_NAME)})
+            vals.update({x[:4] for x in self.feature_iter(Feature.FIRST_NAME)})
         elif key == models.BlockingKey.LAST_NAME:
-            vals.update({x[:4] for x in self.field_iter(Feature.LAST_NAME)})
+            vals.update({x[:4] for x in self.feature_iter(Feature.LAST_NAME)})
         elif key == models.BlockingKey.ADDRESS:
-            vals.update({x[:4] for x in self.field_iter(Feature.ADDRESS)})
+            vals.update({x[:4] for x in self.feature_iter(Feature.ADDRESS)})
 
         # if any vals are longer than the BLOCKING_KEY_MAX_LENGTH, raise an error
         if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals):