From f40cfc22c1eac2a69ea0df14f88fdbaa982522fe Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Thu, 3 Oct 2024 16:05:29 -0400 Subject: [PATCH] upload: be more cautious about grabbing files for upload Instead of looking for study name in the file, try harder to make sure we are looking at a study *prefix* in the filename. --- .gitignore | 13 ++- cumulus_library/actions/uploader.py | 7 +- tests/test_actions.py | 93 +++++++++++++----- tests/test_cli.py | 8 +- ....csv => upload__count_synthea_patient.csv} | 0 ... => upload__count_synthea_patient.parquet} | Bin .../test_data/upload/upload__meta_version.csv | 2 + .../upload/upload__meta_version.parquet | Bin 0 -> 643 bytes 8 files changed, 95 insertions(+), 28 deletions(-) rename tests/test_data/upload/{count_synthea_patient.csv => upload__count_synthea_patient.csv} (100%) rename tests/test_data/upload/{count_synthea_patient.parquet => upload__count_synthea_patient.parquet} (100%) create mode 100644 tests/test_data/upload/upload__meta_version.csv create mode 100644 tests/test_data/upload/upload__meta_version.parquet diff --git a/.gitignore b/.gitignore index aded8ded..9b0ad068 100644 --- a/.gitignore +++ b/.gitignore @@ -10,9 +10,20 @@ output.sql MRCONSO.RRF *.zip coverage.xml -*.parquet valueset_data/ +# parquet and csv files show up during an export, which devs might do a fair bit in odd places. +# So for safety, ignore them both here, except in the tests dir where we use them for test data. +*.csv +*.parquet +!/tests/**/*.csv +!/tests/**/*.parquet +# test artifacts (TODO: have the tests work in a tmpdir instead) +/cumulus_library/builders/valueset/lookup_drug_from_ingredient_rules.parquet +/tests/test_data/study_static_file/bsvs/ICD10CM_2023AA.parquet +/tests/test_data/study_static_file/bsvs/ICD10PCS_2023AA.parquet +/tests/test_data/valueset/rules_file.parquet + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/cumulus_library/actions/uploader.py b/cumulus_library/actions/uploader.py index ad684e66..b7aef83f 100644 --- a/cumulus_library/actions/uploader.py +++ b/cumulus_library/actions/uploader.py @@ -14,7 +14,7 @@ def upload_data( progress: Progress, file_upload_progress: TaskID, file_path: Path, - version: int, + version: str, args: dict, ): """Fetches presigned url and uploads file to aggregator""" @@ -71,7 +71,10 @@ def upload_files(args: dict): if args["target"]: filtered_paths = [] for path in file_paths: - if any(study in str(path) for study in args["target"]): + if any( + path.parent.name == study and path.name.startswith(f"{study}__") + for study in args["target"] + ): filtered_paths.append(path) file_paths = filtered_paths diff --git a/tests/test_actions.py b/tests/test_actions.py index bb8ff43a..166d0082 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -1,4 +1,5 @@ import builtins +import contextlib import datetime import pathlib import shutil @@ -11,6 +12,7 @@ import requests import responses from freezegun import freeze_time +from responses import matchers from cumulus_library import base_utils, enums, errors, log_utils, study_manifest from cumulus_library.actions import ( @@ -410,6 +412,52 @@ def test_import_study(tmp_path, mock_db_config): ) +def do_upload( + *, + login_error: bool = False, + user: str = "user", + id_token: str = "id", + preview: bool = True, + raises: contextlib.AbstractContextManager = does_not_raise(), + status: int = 204, + version: str = "12345.0", + data_path: pathlib.Path | None = pathlib.Path.cwd() / "tests/test_data/upload/", +): + with raises: + if login_error: + responses.add(responses.POST, "https://upload.url.test/upload/", status=401) + else: + responses.add( + responses.POST, + "https://upload.url.test/upload/", + match=[ + matchers.json_params_matcher( + { + "study": "upload", + "data_package": "upload__count_synthea_patient", + "data_package_version": version, + "filename": f"{user}_upload__count_synthea_patient.parquet", + } + ) + ], + json={"url": "https://presigned.url.test", "fields": {"a": "b"}}, + ) + args = { + "data_path": data_path, + "id": id_token, + "preview": preview, + "target": ["upload"], + "url": "https://upload.url.test/upload/", + "user": user, + } + responses.add(responses.POST, "https://presigned.url.test", status=status) + uploader.upload_files(args) + if preview: + responses.assert_call_count("https://upload.url.test/upload/", 1) + elif raises == does_not_raise(): + responses.assert_call_count("https://upload.url.test/upload/", 2) + + @pytest.mark.parametrize( "user,id_token,status,login_error,preview,raises", [ @@ -451,29 +499,28 @@ def test_import_study(tmp_path, mock_db_config): ) @responses.activate def test_upload_data(user, id_token, status, preview, login_error, raises): - with raises: - if login_error: - responses.add(responses.POST, "https://upload.url.test/upload/", status=401) - else: - responses.add( - responses.POST, - "https://upload.url.test/upload/", - json={"url": "https://presigned.url.test", "fields": {"a": "b"}}, - ) - args = { - "data_path": pathlib.Path.cwd() / "tests/test_data/upload/", - "id": id_token, - "preview": preview, - "target": "core", - "url": "https://upload.url.test/upload/", - "user": user, - } - responses.add(responses.POST, "https://presigned.url.test", status=status) - uploader.upload_files(args) - if preview: - responses.assert_call_count("https://upload.url.test/upload/", 1) - elif raises == does_not_raise(): - responses.assert_call_count("https://upload.url.test/upload/", 2) + do_upload( + user=user, + id_token=id_token, + status=status, + preview=preview, + login_error=login_error, + raises=raises, + ) + + +def test_upload_data_no_path(): + with pytest.raises(SystemExit): + do_upload(data_path=None) + + +@responses.activate +def test_upload_data_no_version(tmp_path): + src = pathlib.Path.cwd() / "tests/test_data/upload/upload__count_synthea_patient.parquet" + dest = pathlib.Path(tmp_path) / "upload" + dest.mkdir() + shutil.copy(src, dest) + do_upload(data_path=dest, version="0") @pytest.mark.parametrize( diff --git a/tests/test_cli.py b/tests/test_cli.py index b9b0763c..79deeca0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -601,8 +601,12 @@ def test_cli_upload_studies(mock_glob, args, status, login_error, raises): def test_cli_upload_filter(mock_upload_data, mock_glob, args, calls): mock_glob.side_effect = [ [ - Path(str(Path(__file__).parent) + "/test_data/count_synthea_patient.parquet"), - Path(str(Path(__file__).parent) + "/other_data/count_synthea_patient.parquet"), + Path( + str(Path(__file__).parent) + "/test_data/test_data__count_synthea_patient.parquet" + ), + Path( + str(Path(__file__).parent) + "/other_data/other_data__count_synthea_patient.parquet" + ), ], ] cli.main(cli_args=args) diff --git a/tests/test_data/upload/count_synthea_patient.csv b/tests/test_data/upload/upload__count_synthea_patient.csv similarity index 100% rename from tests/test_data/upload/count_synthea_patient.csv rename to tests/test_data/upload/upload__count_synthea_patient.csv diff --git a/tests/test_data/upload/count_synthea_patient.parquet b/tests/test_data/upload/upload__count_synthea_patient.parquet similarity index 100% rename from tests/test_data/upload/count_synthea_patient.parquet rename to tests/test_data/upload/upload__count_synthea_patient.parquet diff --git a/tests/test_data/upload/upload__meta_version.csv b/tests/test_data/upload/upload__meta_version.csv new file mode 100644 index 00000000..d9846cae --- /dev/null +++ b/tests/test_data/upload/upload__meta_version.csv @@ -0,0 +1,2 @@ +"data_package_version" +12345 diff --git a/tests/test_data/upload/upload__meta_version.parquet b/tests/test_data/upload/upload__meta_version.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7625038524579c27aad3e31b5273f0c3b40c4371 GIT binary patch literal 643 zcmcgq!A`lKvXo;c_uonuac~%EiXJ)bQPyAlu2LnF}d($u-KaXB1F^KW47<0uwwza<$ zVkPVzC;>D&5&+GyQXNcn9K_+!my!Gq*<2P3!Z_pq+n=11*c#4S1Yq3U7M^!|f3j@X zoGW`0hmkF%wQC&V`7Cn}vkq2yr8Wt`L`!@w{d{HKcSW4SL9tyni9zL9}G@ z5{;d1ydR@s&tQbry@zb~W ga8eGEr0l2ZXlAYc*{U8@s+E1l<&J$R08L!KA5_kahyVZp literal 0 HcmV?d00001