
Commit

Some coverage updates
dogversioning committed Dec 4, 2024
1 parent 2172d7b commit 773cf96
Showing 9 changed files with 125 additions and 28 deletions.
33 changes: 21 additions & 12 deletions src/shared/functions.py
@@ -140,7 +140,7 @@ def update_metadata(
             # Should only be hit if you add a new JSON dict and forget to add it
             # to this function
             case _:
-                raise OSError(f"{meta_type} does not have a handler for updates.")
+                raise ValueError(f"{meta_type} does not have a handler for updates.")
     data_version_metadata.update(extra_items)
     return metadata

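Swapping OSError for ValueError better matches the failure: an unrecognized meta_type is a bad argument, not an operating-system fault. A minimal sketch of the pattern (the handler names here are illustrative, not the module's actual cases):

    def dispatch_update(meta_type: str) -> str:
        match meta_type:
            case "transactions" | "column_types":
                return "handled"
            case _:
                # A bad argument value, not an I/O failure, so ValueError fits
                raise ValueError(f"{meta_type} does not have a handler for updates.")
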
@@ -182,14 +182,22 @@ def move_s3_file(s3_client, s3_bucket_name: str, old_key: str, new_key: str) ->
         raise S3UploadError
 
 
-def get_s3_keys(s3_client, s3_bucket_name: str, prefix: str, token: str | None = None) -> list:
+def get_s3_keys(
+    s3_client,
+    s3_bucket_name: str,
+    prefix: str,
+    token: str | None = None,
+    max_keys: int | None = None,
+) -> list[str]:
     """Gets the list of all keys in S3 starting with the prefix"""
+    if max_keys is None:
+        max_keys = 1000
     if token:
         res = s3_client.list_objects_v2(
-            Bucket=s3_bucket_name, Prefix=prefix, ContinuationToken=token
+            Bucket=s3_bucket_name, Prefix=prefix, ContinuationToken=token, MaxKeys=max_keys
         )
     else:
-        res = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=prefix)
+        res = s3_client.list_objects_v2(Bucket=s3_bucket_name, Prefix=prefix, MaxKeys=max_keys)
     if "Contents" not in res:
         return []
     contents = [record["Key"] for record in res["Contents"]]
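Note that max_keys caps the page size of each list_objects_v2 call, not the total result: when a listing is truncated, the function is expected to follow the NextContinuationToken (that continuation logic sits below the visible hunk and is assumed here). A rough usage sketch, assuming a boto3 client and an illustrative bucket name:

    import boto3

    s3_client = boto3.client("s3")
    # Pages of at most two keys, but continuation should still surface
    # every key under the prefix, so both calls return the same list.
    paged = get_s3_keys(s3_client, "example-bucket", "aggregates/", max_keys=2)
    full = get_s3_keys(s3_client, "example-bucket", "aggregates/")
    assert paged == full

The new test_get_s3_keys further down relies on exactly this contract when it asserts the full ITEM_COUNT even with max_keys=2.
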
@@ -225,14 +233,15 @@ def get_latest_data_package_version(bucket, prefix):
     prefix = prefix + "/"
     s3_res = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
     highest_ver = None
-    for item in s3_res["Contents"]:
-        ver_str = item["Key"].replace(prefix, "").split("/")[0]
-        if ver_str.isdigit():
-            if highest_ver is None:
-                highest_ver = ver_str
-            else:
-                if int(highest_ver) < int(ver_str):
-                    highest_ver = ver_str
-    if highest_ver is None:
+    if "Contents" in s3_res:
+        for item in s3_res["Contents"]:
+            ver_str = item["Key"].replace(prefix, "").split("/")[1].split("__")[2]
+            if ver_str.isdigit():
+                if highest_ver is None:
+                    highest_ver = ver_str
+                else:
+                    if int(highest_ver) < int(ver_str):
+                        highest_ver = ver_str
+    if "Contents" not in s3_res or highest_ver is None:
         logging.error("No data package versions found for %s", prefix)
     return highest_ver
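The reworked loop also guards against a missing "Contents" key and now pulls the version out of the data package directory name rather than the first path segment. A worked example of the new expression, using illustrative names:

    prefix = "aggregates/example_study/"  # after the function appends "/"
    key = (
        "aggregates/example_study/example_study__dp/"
        "example_study__dp__002/example_study__dp__aggregate.parquet"
    )
    ver_str = key.replace(prefix, "").split("/")[1].split("__")[2]
    assert ver_str == "002"  # version segment of "example_study__dp__002"
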
25 changes: 11 additions & 14 deletions src/site_upload/cache_api/cache_api.py
@@ -37,20 +37,17 @@ def cache_api_data(s3_client, s3_bucket_name: str, db: str, target: str) -> None
             "study": dp.split("__")[0],
             "name": dp.split("__")[1],
         }
-        try:
-            versions = column_types[dp_detail["study"]][dp_detail["name"]]
-            for version in versions:
-                dp_dict = {
-                    **dp_detail,
-                    **versions[version],
-                    "version": version,
-                    "id": f"{dp_detail['study']}__{dp_detail['name']}__{version}",
-                }
-                if "__flat" in dp:
-                    dp_dict["type"] = "flat"
-                dp_details.append(dp_dict)
-        except KeyError as e:
-            raise e
+        versions = column_types[dp_detail["study"]][dp_detail["name"]]
+        for version in versions:
+            dp_dict = {
+                **dp_detail,
+                **versions[version],
+                "version": version,
+                "id": f"{dp_detail['study']}__{dp_detail['name']}__{version}",
+            }
+            if "__flat" in dp:
+                dp_dict["type"] = "flat"
+            dp_details.append(dp_dict)
     s3_client.put_object(
         Bucket=s3_bucket_name,
         Key=f"{enums.BucketPath.CACHE.value}/{enums.JsonFilename.DATA_PACKAGES.value}.json",
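The deleted try/except was a no-op: catching KeyError only to re-raise it unchanged behaves effectively the same as having no handler at all. A small sketch of the equivalence (function names are illustrative):

    def lookup_wrapped(column_types: dict, study: str):
        try:
            return column_types[study]
        except KeyError as e:
            raise e  # re-raising unchanged adds nothing

    def lookup_plain(column_types: dict, study: str):
        return column_types[study]  # the KeyError propagates on its own

Both raise an identical KeyError for a missing study, so dropping the wrapper simplifies the code without changing behavior.
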
16 changes: 16 additions & 0 deletions tests/conftest.py
@@ -40,6 +40,8 @@ def _init_mock_data(s3_client, bucket, study, data_package, version):
     The following items are added:
     - Aggregates, with a site of plainsboro, in parquet and csv, for the
       study provided
+    - Flat tables, with a site of plainsboro, in parquet and csv, for the
+      study provided
     - a data_package cache for api testing
     - credentials for the 3 unit test hospitals (princeton, elsewhere, hope)
@@ -58,6 +60,20 @@ def _init_mock_data(s3_client, bucket, study, data_package, version):
         f"{enums.BucketPath.CSVAGGREGATE.value}/{study}/"
         f"{study}__{data_package}/{version}/{study}__{data_package}__aggregate.csv",
     )
+    s3_client.upload_file(
+        "./tests/test_data/flat_synthea_q_date_recent.parquet",
+        bucket,
+        f"{enums.BucketPath.FLAT.value}/{study}/{mock_utils.EXISTING_SITE}/"
+        f"{study}__{data_package}__{version}/"
+        f"{study}__{data_package}__flat.parquet",
+    )
+    s3_client.upload_file(
+        "./tests/test_data/flat_synthea_q_date_recent.csv",
+        bucket,
+        f"{enums.BucketPath.CSVFLAT.value}/{study}/{mock_utils.EXISTING_SITE}/"
+        f"{study}__{data_package}__{version}/"
+        f"{study}__{data_package}__flat.csv",
+    )
     s3_client.upload_file(
         "./tests/test_data/data_packages_cache.json",
         bucket,
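Assuming the f-string segments join with the separators shown, the new flat fixtures land under keys shaped like this (study, package, version, and the literal enum values are illustrative, since BucketPath.FLAT and BucketPath.CSVFLAT are not expanded in this diff):

    flat/example_study/princeton_plainsboro_teaching_hospital/example_study__dp__001/example_study__dp__flat.parquet
    csv_flat/example_study/princeton_plainsboro_teaching_hospital/example_study__dp__001/example_study__dp__flat.csv
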
2 changes: 1 addition & 1 deletion tests/mock_utils.py
@@ -6,7 +6,7 @@
 TEST_PROCESS_COUNTS_ARN = "arn:aws:sns:us-east-1:123456789012:test-counts"
 TEST_PROCESS_STUDY_META_ARN = "arn:aws:sns:us-east-1:123456789012:test-meta"
 TEST_CACHE_API_ARN = "arn:aws:sns:us-east-1:123456789012:test-cache"
-ITEM_COUNT = 9
+ITEM_COUNT = 13
 DATA_PACKAGE_COUNT = 3
 
 EXISTING_SITE = "princeton_plainsboro_teaching_hospital"
54 changes: 53 additions & 1 deletion tests/shared/test_functions.py
@@ -1,10 +1,21 @@
-"""Unit tests for shared functions"""
+"""Unit tests for shared functions.
+As of this writing, since a lot of this was historically covered by other tests,
+this file does not contain a 1-1 set of tests to the source module,
+instead focusing only on edge case scenarios (though in those cases, tests
+should be comprehensive). 1-1 coverage is a desirable long term goal.
+"""
 
 from contextlib import nullcontext as does_not_raise
+from unittest import mock
 
+import boto3
 import pandas
 import pytest
 
-from src.shared import functions, pandas_functions
+from src.shared import enums, functions, pandas_functions
+from tests import mock_utils
 

@pytest.mark.parametrize(
@@ -60,3 +71,44 @@ def test_column_datatypes():
         "bool": "boolean",
         "string": "string",
     }
+
+
+def test_update_metadata_error(mock_bucket):
+    with pytest.raises(ValueError):
+        enums.JsonFilename.FOO = "foo"
+        functions.update_metadata(
+            metadata={}, study="", data_package="", version="", target="", meta_type="foo"
+        )
+
+
+def test_get_s3_keys(mock_bucket):
+    s3_client = boto3.client("s3")
+    res = functions.get_s3_keys(s3_client, mock_utils.TEST_BUCKET, "")
+    assert len(res) == mock_utils.ITEM_COUNT
+    res = functions.get_s3_keys(s3_client, mock_utils.TEST_BUCKET, "", max_keys=2)
+    assert len(res) == mock_utils.ITEM_COUNT
+    res = functions.get_s3_keys(s3_client, mock_utils.TEST_BUCKET, "cache")
+    assert res == ["cache/data_packages.json"]
+
+
+def test_latest_data_package_version(mock_bucket):
+    version = functions.get_latest_data_package_version(
+        mock_utils.TEST_BUCKET, f"{enums.BucketPath.AGGREGATE.value}/{mock_utils.EXISTING_STUDY}"
+    )
+    assert version == mock_utils.EXISTING_VERSION
+    s3_client = boto3.client("s3")
+    s3_client.upload_file(
+        "./tests/test_data/count_synthea_patient_agg.parquet",
+        mock_utils.TEST_BUCKET,
+        f"{enums.BucketPath.AGGREGATE.value}/{mock_utils.EXISTING_STUDY}/"
+        f"{mock_utils.EXISTING_STUDY}__{mock_utils.EXISTING_DATA_P}/"
+        f"{mock_utils.EXISTING_STUDY}__{mock_utils.EXISTING_DATA_P}__{mock_utils.NEW_VERSION}/"
+        f"{mock_utils.EXISTING_STUDY}__{mock_utils.EXISTING_DATA_P}__aggregate.parquet",
+    )
+    version = functions.get_latest_data_package_version(
+        mock_utils.TEST_BUCKET, f"{enums.BucketPath.AGGREGATE.value}/{mock_utils.EXISTING_STUDY}"
+    )
+    assert version == mock_utils.NEW_VERSION
+    version = functions.get_latest_data_package_version(
+        mock_utils.TEST_BUCKET, f"{enums.BucketPath.AGGREGATE.value}/not_a_study"
+    )
+    assert version is None
2 changes: 2 additions & 0 deletions tests/site_upload/test_powerset_merge.py
@@ -237,6 +237,8 @@ def test_powerset_merge_single_upload(
             or item["Key"].startswith(enums.BucketPath.ERROR.value)
             or item["Key"].startswith(enums.BucketPath.ADMIN.value)
             or item["Key"].startswith(enums.BucketPath.CACHE.value)
+            or item["Key"].startswith(enums.BucketPath.FLAT.value)
+            or item["Key"].startswith(enums.BucketPath.CSVFLAT.value)
             or item["Key"].endswith("study_periods.json")
         )
     if archives:
2 changes: 2 additions & 0 deletions tests/site_upload/test_process_upload.py
@@ -167,6 +167,8 @@ def test_process_upload(
             or item["Key"].startswith(enums.BucketPath.ERROR.value)
             or item["Key"].startswith(enums.BucketPath.ADMIN.value)
             or item["Key"].startswith(enums.BucketPath.CACHE.value)
+            or item["Key"].startswith(enums.BucketPath.FLAT.value)
+            or item["Key"].startswith(enums.BucketPath.CSVFLAT.value)
             or item["Key"].endswith("study_periods.json")
             or item["Key"].endswith("column_types.json")
         )
19 changes: 19 additions & 0 deletions tests/test_data/flat_synthea_q_date_recent.csv
@@ -0,0 +1,19 @@
+resource,subgroup,numerator,denominator,percentage
+Procedure,,2,5,40.00
+Procedure,cumulus__all,2,5,40.00
+Observation,,0,0,0.00
+Observation,cumulus__all,0,0,0.00
+MedicationRequest,,0,0,0.00
+MedicationRequest,cumulus__all,0,0,0.00
+Immunization,,0,0,0.00
+Immunization,cumulus__all,0,0,0.00
+Encounter,,1,4,25.00
+Encounter,cumulus__all,1,4,25.00
+DocumentReference,,0,0,0.00
+DocumentReference,cumulus__all,0,0,0.00
+DiagnosticReport,,0,0,0.00
+DiagnosticReport,cumulus__all,0,0,0.00
+Condition,,2,4,50.00
+Condition,cumulus__all,2,4,50.00
+AllergyIntolerance,,0,0,0.00
+AllergyIntolerance,cumulus__all,0,0,0.00
Binary file added: tests/test_data/flat_synthea_q_date_recent.parquet (not shown).
