From f40cfc22c1eac2a69ea0df14f88fdbaa982522fe Mon Sep 17 00:00:00 2001
From: Michael Terry <michael.terry@childrens.harvard.edu>
Date: Thu, 3 Oct 2024 16:05:29 -0400
Subject: [PATCH] upload: be more cautious about grabbing files for upload

Instead of looking for study name in the file, try harder to make sure
we are looking at a study *prefix* in the filename.
---
 .gitignore                                    |  13 ++-
 cumulus_library/actions/uploader.py           |   7 +-
 tests/test_actions.py                         |  93 +++++++++++++-----
 tests/test_cli.py                             |   8 +-
 ....csv => upload__count_synthea_patient.csv} |   0
 ... => upload__count_synthea_patient.parquet} | Bin
 .../test_data/upload/upload__meta_version.csv |   2 +
 .../upload/upload__meta_version.parquet       | Bin 0 -> 643 bytes
 8 files changed, 95 insertions(+), 28 deletions(-)
 rename tests/test_data/upload/{count_synthea_patient.csv => upload__count_synthea_patient.csv} (100%)
 rename tests/test_data/upload/{count_synthea_patient.parquet => upload__count_synthea_patient.parquet} (100%)
 create mode 100644 tests/test_data/upload/upload__meta_version.csv
 create mode 100644 tests/test_data/upload/upload__meta_version.parquet

diff --git a/.gitignore b/.gitignore
index aded8ded..9b0ad068 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,9 +10,20 @@ output.sql
 MRCONSO.RRF
 *.zip
 coverage.xml
-*.parquet
 valueset_data/
 
+# parquet and csv files show up during an export, which devs might do a fair bit in odd places.
+# So for safety, ignore them both here, except in the tests dir where we use them for test data.
+*.csv
+*.parquet
+!/tests/**/*.csv
+!/tests/**/*.parquet
+# test artifacts (TODO: have the tests work in a tmpdir instead)
+/cumulus_library/builders/valueset/lookup_drug_from_ingredient_rules.parquet
+/tests/test_data/study_static_file/bsvs/ICD10CM_2023AA.parquet
+/tests/test_data/study_static_file/bsvs/ICD10PCS_2023AA.parquet
+/tests/test_data/valueset/rules_file.parquet
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/cumulus_library/actions/uploader.py b/cumulus_library/actions/uploader.py
index ad684e66..b7aef83f 100644
--- a/cumulus_library/actions/uploader.py
+++ b/cumulus_library/actions/uploader.py
@@ -14,7 +14,7 @@ def upload_data(
     progress: Progress,
     file_upload_progress: TaskID,
     file_path: Path,
-    version: int,
+    version: str,
     args: dict,
 ):
     """Fetches presigned url and uploads file to aggregator"""
@@ -71,7 +71,10 @@ def upload_files(args: dict):
     if args["target"]:
         filtered_paths = []
         for path in file_paths:
-            if any(study in str(path) for study in args["target"]):
+            if any(
+                path.parent.name == study and path.name.startswith(f"{study}__")
+                for study in args["target"]
+            ):
                 filtered_paths.append(path)
         file_paths = filtered_paths
 
diff --git a/tests/test_actions.py b/tests/test_actions.py
index bb8ff43a..166d0082 100644
--- a/tests/test_actions.py
+++ b/tests/test_actions.py
@@ -1,4 +1,5 @@
 import builtins
+import contextlib
 import datetime
 import pathlib
 import shutil
@@ -11,6 +12,7 @@
 import requests
 import responses
 from freezegun import freeze_time
+from responses import matchers
 
 from cumulus_library import base_utils, enums, errors, log_utils, study_manifest
 from cumulus_library.actions import (
@@ -410,6 +412,52 @@ def test_import_study(tmp_path, mock_db_config):
         )
 
 
+def do_upload(
+    *,
+    login_error: bool = False,
+    user: str = "user",
+    id_token: str = "id",
+    preview: bool = True,
+    raises: contextlib.AbstractContextManager = does_not_raise(),
+    status: int = 204,
+    version: str = "12345.0",
+    data_path: pathlib.Path | None = pathlib.Path.cwd() / "tests/test_data/upload/",
+):
+    with raises:
+        if login_error:
+            responses.add(responses.POST, "https://upload.url.test/upload/", status=401)
+        else:
+            responses.add(
+                responses.POST,
+                "https://upload.url.test/upload/",
+                match=[
+                    matchers.json_params_matcher(
+                        {
+                            "study": "upload",
+                            "data_package": "upload__count_synthea_patient",
+                            "data_package_version": version,
+                            "filename": f"{user}_upload__count_synthea_patient.parquet",
+                        }
+                    )
+                ],
+                json={"url": "https://presigned.url.test", "fields": {"a": "b"}},
+            )
+        args = {
+            "data_path": data_path,
+            "id": id_token,
+            "preview": preview,
+            "target": ["upload"],
+            "url": "https://upload.url.test/upload/",
+            "user": user,
+        }
+        responses.add(responses.POST, "https://presigned.url.test", status=status)
+        uploader.upload_files(args)
+        if preview:
+            responses.assert_call_count("https://upload.url.test/upload/", 1)
+        elif raises == does_not_raise():
+            responses.assert_call_count("https://upload.url.test/upload/", 2)
+
+
 @pytest.mark.parametrize(
     "user,id_token,status,login_error,preview,raises",
     [
@@ -451,29 +499,28 @@ def test_import_study(tmp_path, mock_db_config):
 )
 @responses.activate
 def test_upload_data(user, id_token, status, preview, login_error, raises):
-    with raises:
-        if login_error:
-            responses.add(responses.POST, "https://upload.url.test/upload/", status=401)
-        else:
-            responses.add(
-                responses.POST,
-                "https://upload.url.test/upload/",
-                json={"url": "https://presigned.url.test", "fields": {"a": "b"}},
-            )
-        args = {
-            "data_path": pathlib.Path.cwd() / "tests/test_data/upload/",
-            "id": id_token,
-            "preview": preview,
-            "target": "core",
-            "url": "https://upload.url.test/upload/",
-            "user": user,
-        }
-        responses.add(responses.POST, "https://presigned.url.test", status=status)
-        uploader.upload_files(args)
-        if preview:
-            responses.assert_call_count("https://upload.url.test/upload/", 1)
-        elif raises == does_not_raise():
-            responses.assert_call_count("https://upload.url.test/upload/", 2)
+    do_upload(
+        user=user,
+        id_token=id_token,
+        status=status,
+        preview=preview,
+        login_error=login_error,
+        raises=raises,
+    )
+
+
+def test_upload_data_no_path():
+    with pytest.raises(SystemExit):
+        do_upload(data_path=None)
+
+
+@responses.activate
+def test_upload_data_no_version(tmp_path):
+    src = pathlib.Path.cwd() / "tests/test_data/upload/upload__count_synthea_patient.parquet"
+    dest = pathlib.Path(tmp_path) / "upload"
+    dest.mkdir()
+    shutil.copy(src, dest)
+    do_upload(data_path=dest, version="0")
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b9b0763c..79deeca0 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -601,8 +601,12 @@ def test_cli_upload_studies(mock_glob, args, status, login_error, raises):
 def test_cli_upload_filter(mock_upload_data, mock_glob, args, calls):
     mock_glob.side_effect = [
         [
-            Path(str(Path(__file__).parent) + "/test_data/count_synthea_patient.parquet"),
-            Path(str(Path(__file__).parent) + "/other_data/count_synthea_patient.parquet"),
+            Path(
+                str(Path(__file__).parent) + "/test_data/test_data__count_synthea_patient.parquet"
+            ),
+            Path(
+                str(Path(__file__).parent) + "/other_data/other_data__count_synthea_patient.parquet"
+            ),
         ],
     ]
     cli.main(cli_args=args)
diff --git a/tests/test_data/upload/count_synthea_patient.csv b/tests/test_data/upload/upload__count_synthea_patient.csv
similarity index 100%
rename from tests/test_data/upload/count_synthea_patient.csv
rename to tests/test_data/upload/upload__count_synthea_patient.csv
diff --git a/tests/test_data/upload/count_synthea_patient.parquet b/tests/test_data/upload/upload__count_synthea_patient.parquet
similarity index 100%
rename from tests/test_data/upload/count_synthea_patient.parquet
rename to tests/test_data/upload/upload__count_synthea_patient.parquet
diff --git a/tests/test_data/upload/upload__meta_version.csv b/tests/test_data/upload/upload__meta_version.csv
new file mode 100644
index 00000000..d9846cae
--- /dev/null
+++ b/tests/test_data/upload/upload__meta_version.csv
@@ -0,0 +1,2 @@
+"data_package_version"
+12345
diff --git a/tests/test_data/upload/upload__meta_version.parquet b/tests/test_data/upload/upload__meta_version.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..7625038524579c27aad3e31b5273f0c3b40c4371
GIT binary patch
literal 643
zcmcgq!A`<J5S?yFQ;CO$uuV7FLoYTS00qSuPi`wlqef^Vf)`RtZ4Dx|AjF%$;~#kB
z$Y(iQAYQn5@g3&PzM0*b$xc&vRhpwZEnZVXp#Y$9Kz#DWDFs@fMhSJDs#N8swvPOI
z2JCK7F>lKvXo;c_uonuac~%EiXJ)bQPyAlu2LnF}d($u-KaXB1F^KW47<0uwwza<$
zVkPVzC;>D&5&+GyQXNcn9K_+!my!Gq*<2P3!Z_pq+n=11*c#4S1Yq3U7M^!|f3j@X
zoGW`0hmk<Eq(NK>F%wQC&V`7Cn}vkq2yr8Wt`L`!@w{d{HKcSW4SL9tyni9zL9}G@
z5{<Jg)6MkE&2&eWb_U&!7k3VvxLcdV54HAFf7t#AYA@4A>;d1ydR@s&tQbry@zb~W
ga8eGEr0l2ZXlAYc*{U8@s+E1l<&J$R08L!KA5_kahyVZp

literal 0
HcmV?d00001