-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
upload: be more cautious about grabbing files for upload #305
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ def upload_data( | |
progress: Progress, | ||
file_upload_progress: TaskID, | ||
file_path: Path, | ||
version: int, | ||
version: str, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is always passed a string, I think this type hint was just a typo. |
||
args: dict, | ||
): | ||
"""Fetches presigned url and uploads file to aggregator""" | ||
|
@@ -71,7 +71,10 @@ def upload_files(args: dict): | |
if args["target"]: | ||
filtered_paths = [] | ||
for path in file_paths: | ||
if any(study in str(path) for study in args["target"]): | ||
if any( | ||
path.parent.name == study and path.name.startswith(f"{study}__") | ||
for study in args["target"] | ||
): | ||
Comment on lines
-74
to
+77
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the actual code change here. Before, this was a simple substring check. Which for my use case (upload site/ -t core) was hitting files like site/data_metrics/data_metrics__count_c_us_core_v4_observation.parquet because it has core in its filename, even though it's not a core study file. So... I've changed this to check that both the leaf filename starts with study__ and it is inside a directory named for the study (a requirement from part of the uploading code that takes that directory name as the study name). This is pretty strict, but also rules that these files ought to follow? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep, makes sense - also keeps some hand moved data from leaking into the aggregator and creating new studies accidentally. |
||
filtered_paths.append(path) | ||
file_paths = filtered_paths | ||
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test modifications to allow testing more scenarios easily (in order to satisfy coverage tax) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import builtins | ||
import contextlib | ||
import datetime | ||
import pathlib | ||
import shutil | ||
|
@@ -11,6 +12,7 @@ | |
import requests | ||
import responses | ||
from freezegun import freeze_time | ||
from responses import matchers | ||
|
||
from cumulus_library import base_utils, enums, errors, log_utils, study_manifest | ||
from cumulus_library.actions import ( | ||
|
@@ -410,6 +412,52 @@ def test_import_study(tmp_path, mock_db_config): | |
) | ||
|
||
|
||
def do_upload( | ||
*, | ||
login_error: bool = False, | ||
user: str = "user", | ||
id_token: str = "id", | ||
preview: bool = True, | ||
raises: contextlib.AbstractContextManager = does_not_raise(), | ||
status: int = 204, | ||
version: str = "12345.0", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This |
||
data_path: pathlib.Path | None = pathlib.Path.cwd() / "tests/test_data/upload/", | ||
): | ||
with raises: | ||
if login_error: | ||
responses.add(responses.POST, "https://upload.url.test/upload/", status=401) | ||
else: | ||
responses.add( | ||
responses.POST, | ||
"https://upload.url.test/upload/", | ||
match=[ | ||
matchers.json_params_matcher( | ||
{ | ||
"study": "upload", | ||
"data_package": "upload__count_synthea_patient", | ||
"data_package_version": version, | ||
"filename": f"{user}_upload__count_synthea_patient.parquet", | ||
} | ||
) | ||
], | ||
json={"url": "https://presigned.url.test", "fields": {"a": "b"}}, | ||
) | ||
args = { | ||
"data_path": data_path, | ||
"id": id_token, | ||
"preview": preview, | ||
"target": ["upload"], | ||
"url": "https://upload.url.test/upload/", | ||
"user": user, | ||
} | ||
responses.add(responses.POST, "https://presigned.url.test", status=status) | ||
uploader.upload_files(args) | ||
if preview: | ||
responses.assert_call_count("https://upload.url.test/upload/", 1) | ||
elif raises == does_not_raise(): | ||
responses.assert_call_count("https://upload.url.test/upload/", 2) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"user,id_token,status,login_error,preview,raises", | ||
[ | ||
|
@@ -451,29 +499,28 @@ def test_import_study(tmp_path, mock_db_config): | |
) | ||
@responses.activate | ||
def test_upload_data(user, id_token, status, preview, login_error, raises): | ||
with raises: | ||
if login_error: | ||
responses.add(responses.POST, "https://upload.url.test/upload/", status=401) | ||
else: | ||
responses.add( | ||
responses.POST, | ||
"https://upload.url.test/upload/", | ||
json={"url": "https://presigned.url.test", "fields": {"a": "b"}}, | ||
) | ||
args = { | ||
"data_path": pathlib.Path.cwd() / "tests/test_data/upload/", | ||
"id": id_token, | ||
"preview": preview, | ||
"target": "core", | ||
"url": "https://upload.url.test/upload/", | ||
"user": user, | ||
} | ||
responses.add(responses.POST, "https://presigned.url.test", status=status) | ||
uploader.upload_files(args) | ||
if preview: | ||
responses.assert_call_count("https://upload.url.test/upload/", 1) | ||
elif raises == does_not_raise(): | ||
responses.assert_call_count("https://upload.url.test/upload/", 2) | ||
do_upload( | ||
user=user, | ||
id_token=id_token, | ||
status=status, | ||
preview=preview, | ||
login_error=login_error, | ||
raises=raises, | ||
) | ||
|
||
|
||
def test_upload_data_no_path(): | ||
with pytest.raises(SystemExit): | ||
do_upload(data_path=None) | ||
|
||
|
||
@responses.activate | ||
def test_upload_data_no_version(tmp_path): | ||
src = pathlib.Path.cwd() / "tests/test_data/upload/upload__count_synthea_patient.parquet" | ||
dest = pathlib.Path(tmp_path) / "upload" | ||
dest.mkdir() | ||
shutil.copy(src, dest) | ||
do_upload(data_path=dest, version="0") | ||
|
||
|
||
@pytest.mark.parametrize( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"data_package_version" | ||
12345 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are parquet files checked into the repo in the test_data folder, so I modified this rule.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i would prefer to keep this - currently, export defaults to the project root if no arg is given, which means that exporting data is easy to accidentally commit - that the original origin of this rule. I've been explicitly commiting files when needed.
I agree the specific parquet files highlighted below should be fixed to run in tmpdirs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If that's the thinking, shouldn't we also ignore
*.csv
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could have nested ignores - a global ignore and then turn that off for the tests/ dir, probably. For both csv and parquet.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Eh? Updated to be a little more complicated, but it seems to work
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
very clever!