diff --git a/.gitignore b/.gitignore index 70747423..5cf4c3d7 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,6 @@ __pycache__/ # Databases *.sqlite3 + +# Test result files +output.csv diff --git a/compose.yml b/compose.yml index 208919df..5bb4ec7d 100644 --- a/compose.yml +++ b/compose.yml @@ -76,3 +76,25 @@ services: depends_on: api: condition: service_healthy + + algo-test-runner: + build: + context: tests/algorithm + dockerfile: Dockerfile.algo + env_file: + - tests/algorithm/algo.env + environment: + DB_URI: "postgresql+psycopg2://postgres:pw@db:5432/postgres" + API_URL: "http://api:8080" + volumes: + - ./tests/algorithm/scripts:/app/scripts + - ./tests/algorithm/data:/app/data + - ./tests/algorithm/results:/app/results + - ./tests/algorithm/configurations:/app/configurations + depends_on: + db: + condition: service_healthy + api: + condition: service_healthy + profiles: + - algo-test \ No newline at end of file diff --git a/tests/algorithm/Dockerfile.algo b/tests/algorithm/Dockerfile.algo new file mode 100644 index 00000000..d9ce073d --- /dev/null +++ b/tests/algorithm/Dockerfile.algo @@ -0,0 +1,12 @@ +# Use the official Python 3.11 slim image as the base +FROM python:3.12-slim + +# Set the working directory +WORKDIR /app + +# Copy the scripts and data directories into the image +COPY scripts /app/scripts +COPY data /app/data + +# Install Python dependencies +RUN pip install --no-cache-dir requests \ No newline at end of file diff --git a/tests/algorithm/README.md b/tests/algorithm/README.md new file mode 100644 index 00000000..861a1ccf --- /dev/null +++ b/tests/algorithm/README.md @@ -0,0 +1,104 @@ +# Record Linkage Algorithm Testing + +This repository contains a project to evaluate the match accuracy performance of the RecordLinker algorithm. 
+ +## Prerequisites + +Before getting started, ensure you have the following installed: + +- [Docker](https://docs.docker.com/engine/install/) +- [Docker Compose](https://docs.docker.com/compose/install/) + +## Directory Structure + +- `/`: Contains the `.env` file and `Dockerfile` to build +- `configurations/`: Contains the configuration `.json` file that will be used for the test +- `data/`: Contains the data `.csv` files used for the algorithm test (seed file and test file) +- `results/`: Contains the results `.csv` file after running the test +- `scripts/`: Contains the scripts to run the test + +## Setup + +1. Build the Docker images: + + ```bash + docker compose --profile algo-test build + ``` + +2. Add seed and test data files + You can use the sample data files provided in the `data` directory or add your own data files. + The format of the input files should be a CSV file with the same column headers as shown in the sample files. + + `/data/sample_seed_data.csv` + + `/data/sample_test_data.csv` + + +3. Configure environment variables + + `/algo.env` + + Edit the environment variables in the file + +4. Edit the algorithm configuration file + + `/configurations/algorithm_configuration.json` + + Edit the configuration file to tune the algorithm parameters + +## Running Algorithm Tests + +1. Run the test + + ```bash + docker compose run --rm algo-test-runner scripts/run_test.py + ``` + +2. Analyze the results + + The results of the algorithm tests will be available in the `results/output.csv` file. + + The results will be in a CSV formatted file with the following columns: + `Test Case #`, `Expected Result`, `Match Result`, `Details` + +## Rerunning Algorithm Tests + +After you've run the algorithm tests, you may want to rerun the tests with different seed data, test data, or configurations. + +Edit the csv files and/or the configuration file as needed and then run the following commands to rerun the tests. + +1. 
Reset the MPI database + + ```bash + docker compose run --rm algo-test-runner python scripts/reset_db.py + ``` +2. Run the tests + + ```bash + docker compose run --rm algo-test-runner scripts/run_test.py + ``` + +## Environment Variables + +1. `env file`: The attributes that should be tuned for your particular algorithm test, + are located in the `algo.env` file. + +2. `environment`: The attributes that should likely remain static for all algorithm tests are located directly in the `compose.yml` file. + +### Algorithm Test Parameters + +The following environment variables can be tuned in the `algo.env` file: + +- `SEED_FILE`: The file containing person data to seed the MPI with +- `TEST_FILE`: The file containing patient data to test the algorithm with +- `ALGORITHM_CONFIGURATION`: The file containing the algorithm configuration json +- `ALGORITHM_NAME`: The name of the algorithm to use (either the name of your `ALGORITHM_CONFIGURATION` or can be the built in `dibbs-basic` or `dibbs-enhanced` algorithms) + + +## Cleanup + +After you've finished running algorithm tests and analyzing the results, you can stop and remove the Docker containers by running: + +```bash +docker compose --profile algo-test down +``` diff --git a/tests/algorithm/algo.env b/tests/algorithm/algo.env new file mode 100644 index 00000000..10016215 --- /dev/null +++ b/tests/algorithm/algo.env @@ -0,0 +1,4 @@ +SEED_FILE="data/sample_seed_data.csv" +TEST_FILE="data/sample_test_data.csv" +ALGORITHM_CONFIGURATION="configurations/algorithm_configuration.json" +ALGORITHM_NAME="test-config" diff --git a/tests/algorithm/configurations/algorithm_configuration.json b/tests/algorithm/configurations/algorithm_configuration.json new file mode 100644 index 00000000..6abde868 --- /dev/null +++ b/tests/algorithm/configurations/algorithm_configuration.json @@ -0,0 +1,66 @@ +{ + "label": "test-config", + "description": "test algorithm configuration", + "is_default": false, + "include_multiple_matches": 
true, + "belongingness_ratio": [0.75, 0.9], + "passes": [ + { + "blocking_keys": [ + "BIRTHDATE" + ], + "evaluators": [ + { + "feature": "FIRST_NAME", + "func": "func:recordlinker.linking.matchers.feature_match_fuzzy_string" + }, + { + "feature": "LAST_NAME", + "func": "func:recordlinker.linking.matchers.feature_match_exact" + } + ], + "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "cluster_ratio": 0.9, + "kwargs": { + "thresholds": { + "FIRST_NAME": 0.9, + "LAST_NAME": 0.9, + "BIRTHDATE": 0.95, + "ADDRESS": 0.9, + "CITY": 0.92, + "ZIP": 0.95 + } + } + }, + { + "blocking_keys": [ + "ZIP", + "FIRST_NAME", + "LAST_NAME", + "SEX" + ], + "evaluators": [ + { + "feature": "ADDRESS", + "func": "func:recordlinker.linking.matchers.feature_match_fuzzy_string" + }, + { + "feature": "BIRTHDATE", + "func": "func:recordlinker.linking.matchers.feature_match_exact" + } + ], + "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "cluster_ratio": 0.9, + "kwargs": { + "thresholds": { + "FIRST_NAME": 0.9, + "LAST_NAME": 0.9, + "BIRTHDATE": 0.95, + "ADDRESS": 0.9, + "CITY": 0.92, + "ZIP": 0.95 + } + } + } + ] +} \ No newline at end of file diff --git a/tests/algorithm/data/sample_seed_data.csv b/tests/algorithm/data/sample_seed_data.csv new file mode 100644 index 00000000..433c737c --- /dev/null +++ b/tests/algorithm/data/sample_seed_data.csv @@ -0,0 +1,6 @@ +Match Id,ID,BIRTHDATE,FIRST,LAST,SUFFIX,MAIDEN,RACE,ETHNICITY,GENDER,ADDRESS,CITY,STATE,COUNTY,ZIP,SSN, +1,3020167,1951-06-02,Linda,Nash,Sr,Gutierrez,Asian,Hispanic,F,968 Gonzalez Mount,South Emilybury,GU,North Kennethburgh County,93236,675-79-1449, +2,9488697,1942-08-03,Jose,Singleton,Sr,Ingram,Asian,Hispanic,M,631 Fowler Causeway,Port Williamfurt,IN,Wardburgh County,90637,587-60-3668, +3,1805504,1963-01-29,Ryan,Lawrence,IV,Armstrong,Black,Non-Hispanic,M,5256 Lisa Light,Port Monica,GA,South Christine County,51813,371-33-0433, +4,1792678,1950-08-10,Thomas,Brady,II,Cobb,White,Unknown,M,944 Hayes 
Port,Jonesville,FM,Jonesview County,6015,272-78-9905, +5,1332302,1972-08-26,Angie,Murphy,Sr,Mcmahon,Black,Non-Hispanic,F,60015 Edward Vista Suite 518,Lake Andreaview,UT,North Rodney County,46540,740-16-5170, diff --git a/tests/algorithm/data/sample_test_data.csv b/tests/algorithm/data/sample_test_data.csv new file mode 100644 index 00000000..0f2cd2ea --- /dev/null +++ b/tests/algorithm/data/sample_test_data.csv @@ -0,0 +1,7 @@ +Test Case #,Match Id,ID,BIRTHDATE,FIRST,LAST,SUFFIX,MAIDEN,RACE,ETHNICITY,GENDER,ADDRESS,CITY,STATE,COUNTY,ZIP,SSN,Expected Result +1,1,3020167,1951-06-02,Linda,Nash,Jr,Gutierrez,Asian,Hispanic,F,968 Gonzalez Mount,South Emilybury,GU,North Kennethburgh County,93236,675-79-1449,Should be a Match +2,2,9488697,1942-08-03,Singleton,Jose,Sr,Ingram,Asian,Hispanic,M,631 Fowler Causeway,Port Williamfurt,IN,Wardburgh County,90637,587-60-3668,Should be a Match +3,3,1805504,1963-01-29,Ryan,Law-rence,IV,Armstrong,Black,Non-Hispanic,M,5256 Lisa Light,Port Monica,GA,South Christine County,51813,371-33-0433,Should be a Match +4,4,1792678,1950-08-10,Tho-mas,Brady,II,Cobb,White,Unknown,M,944 Hayes Port,Jonesville,FM,Jonesview County,6015,272-78-9905,Should be a Match +5,4,1792678,1950-08-10,ThoMas,Brady,II,Cobb,White,Unknown,M,944 Hayes Port,Jonesville,FM,Jonesview County,6015,272-78-9905,Should be a Match +6,0,1792679,1950-18-10,ThoMas,Brady,II,Cobb,White,Unknown,M,944 Hayes Port,Jonesville,FM,Jonesview County,6015,272-78-9905,Should fail diff --git a/tests/algorithm/scripts/helpers.py b/tests/algorithm/scripts/helpers.py new file mode 100644 index 00000000..50821c5c --- /dev/null +++ b/tests/algorithm/scripts/helpers.py @@ -0,0 +1,43 @@ +import json + + +def dict_to_pii(record_data) -> dict | None: + # convert row to a pii_record + pii_record = { + "external_id": record_data.get('ID', None), + "birth_date": record_data.get("BIRTHDATE", None), + "sex": record_data.get("GENDER", None), + "address": [ + { + "line": [record_data.get("ADDRESS", None)], + 
"city": record_data.get("CITY", None), + "state": record_data.get("STATE", None), + "county": record_data.get("COUNTY", None), + "postal_code": str(record_data.get("ZIP", "")) + } + ], + "name": [ + { + "given": [record_data.get("FIRST", None)], + "family": record_data.get("LAST", None), + "suffix": [record_data.get("SUFFIX", None)] + } + ], + "ssn": record_data.get("SSN", None), + "race": record_data.get("RACE", None) + } + + return pii_record + + +def load_json(file_path: str) -> dict | None: + """ + Load JSON data from a file. + """ + with open(file_path, "rb") as fobj: + try: + content = json.load(fobj) + return content + except json.JSONDecodeError as exc: + print(f"Error loading JSON file: {exc}") + return None \ No newline at end of file diff --git a/tests/algorithm/scripts/reset_db.py b/tests/algorithm/scripts/reset_db.py new file mode 100644 index 00000000..404bb602 --- /dev/null +++ b/tests/algorithm/scripts/reset_db.py @@ -0,0 +1,18 @@ +import os + +import requests + + +def reset_db(api_url): + print("Resetting the database...") + try: + response = requests.delete(f"{api_url}/seed") + response.raise_for_status() # Raise an error for bad status codes + print("Database reset successfully") + except requests.exceptions.RequestException as e: + print(f"Failed to reset the database: {e}") + + +if __name__ == "__main__": + api_url = os.getenv("API_URL") + reset_db(api_url) diff --git a/tests/algorithm/scripts/run_test.py b/tests/algorithm/scripts/run_test.py new file mode 100755 index 00000000..6e826c2b --- /dev/null +++ b/tests/algorithm/scripts/run_test.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import os + +from helpers import load_json +from seed_db import seed_database +from send_test_records import send_test_records +from set_configuration import add_configuration +from set_configuration import check_if_config_already_exists +from set_configuration import update_configuration + + +def main(): + # Get the environment variables + api_url = 
def seed_database(csv_file, api_url):
    """Seed the MPI database from a CSV file of person records.

    Reads rows with csv.DictReader, converts each into a PII record, wraps it
    in a single-record cluster, and POSTs clusters to the /seed endpoint in
    batches of MAX_CLUSTERS.

    Args:
        csv_file: path to the seed CSV (columns as in sample_seed_data.csv).
        api_url: base URL of the record-linkage API.
    """
    MAX_CLUSTERS = 100  # batch size per /seed request
    cluster_group = []

    print("Seeding the database...")

    # Read the CSV file using the csv module
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)

        for row in reader:
            # Normalize missing/placeholder values to empty strings
            record_data = {k: ("" if v in (None, "NaN") else v) for k, v in row.items()}

            # convert dict to a pii_record
            pii_record = dict_to_pii(record_data)

            # The seeding API expects each record nested inside a cluster
            cluster_group.append({"records": [pii_record]})

            # Flush a full batch
            if len(cluster_group) == MAX_CLUSTERS:
                send_clusters_to_api(cluster_group, api_url)
                cluster_group = []

    # Flush any remaining partial batch
    if cluster_group:
        send_clusters_to_api(cluster_group, api_url)

    print("Finished seeding the database.")


def send_clusters_to_api(cluster_group, api_url):
    """Helper function to send a batch of clusters to the API.

    Best-effort: a failed batch is reported but does not abort the seeding run.
    """
    try:
        response = requests.post(f"{api_url}/seed", json={"clusters": cluster_group})
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Failed to post batch: {e}")
def send_test_records(test_csv, algorithm_name, api_url):
    """Run every row of the test CSV through the /link endpoint and record results.

    Writes results/output.csv with the columns:
    Test Case #, Expected Result, Match Result, Details.

    Args:
        test_csv: path to the test CSV (columns as in sample_test_data.csv).
        algorithm_name: label of the algorithm configuration to link with.
        api_url: base URL of the record-linkage API.
    """
    output_data = []

    print("Sending test records to the API...")

    # Read the CSV file using the csv module
    with open(test_csv, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)

        for row in reader:
            # Replace None and "NaN" values with an empty string
            record_data = {k: ("" if v in (None, "NaN") else v) for k, v in row.items()}

            pii_record = dict_to_pii(record_data)

            # send_record only returns 200 or 422 responses; anything else exits
            response = send_record(pii_record, algorithm_name, api_url)
            response_json = response.json()

            if response.status_code == 422:
                # Record failed validation; surface the API's detail message
                match_result = "Invalid record"
                details = response_json["detail"]
            else:
                match_result = response_json['prediction']
                details = ""

            output_data.append({
                "Test Case #": record_data['Test Case #'],
                "Expected Result": record_data['Expected Result'],
                "Match Result": match_result,
                "Details": details,
            })

    # Save output data to the output file
    with open("results/output.csv", mode='w', newline='', encoding='utf-8') as file:
        fieldnames = ["Test Case #", "Expected Result", "Match Result", "Details"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(output_data)

    print("Results saved to results/output.csv")


def send_record(pii_record, algorithm_name, api_url):
    """Helper function to send record to the API to be linked.

    Returns the response only for the expected statuses (200 or 422); any
    other status or transport error aborts the whole test run.
    """
    try:
        response = requests.post(f"{api_url}/link", json={"record": pii_record, "algorithm": algorithm_name})
        if response.status_code not in (200, 422):
            raise requests.exceptions.HTTPError(f"Internal Server Error: {response.status_code}")
        return response
    except requests.exceptions.RequestException as e:
        print(f"Internal Server Error: {e}\nExiting test")
        sys.exit(1)
def add_configuration(algorithm_config, api_url):
    """Add a new algorithm configuration via POST /algorithm."""

    if algorithm_config is None:
        print("Failed to load configuration")
        return
    # Single-quoted subscripts inside the f-string: the original nested double
    # quotes are a SyntaxError on every Python before 3.12 (PEP 701).
    label = algorithm_config['label']
    try:
        response = requests.post(f"{api_url}/algorithm", json=algorithm_config)
        response.raise_for_status()  # Raise an error for bad status codes
        print(f"Successfully added algorithm configuration {label}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to add configuration: {e}")

def update_configuration(algorithm_config, api_url):
    """Update an existing algorithm configuration via PUT /algorithm/<label>."""

    if algorithm_config is None:
        print("Failed to load configuration")
        return
    label = algorithm_config['label']
    try:
        response = requests.put(f"{api_url}/algorithm/{label}", json=algorithm_config)
        response.raise_for_status()  # Raise an error for bad status codes
        print(f"Successfully updated algorithm configuration {label}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to update algorithm configuration: {e}")

def check_if_config_already_exists(algorithm_config, api_url):
    """Return True if a configuration with this label already exists.

    NOTE(review): any request failure (including connection errors, not just
    404) is treated as "does not exist" — callers then attempt an add.
    """
    try:
        response = requests.get(f"{api_url}/algorithm/{algorithm_config['label']}")
        response.raise_for_status()  # Raise an error for bad status codes
        return True
    except requests.exceptions.RequestException:
        return False