Skip to content
This repository was archived by the owner on Mar 31, 2026. It is now read-only.

Commit c5735c3

Browse files
authored
feat(storage): support returning skipped items as UserWarning in download_many_to_path (#1773)
This PR updates `download_many_to_path` return type in transfer_manager.py It now returns `List[None|Exception|UserWarning]` - `None` for successful download - `UserWarning` for - file skipped because `skip_if_exists=True` was provided and file exists. - Resolve path in either invalid or skips destination_directory (because of `..`) - `Exception` if error occurred during download.
1 parent 14cfd61 commit c5735c3

File tree

3 files changed

+184
-19
lines changed

3 files changed

+184
-19
lines changed

google/cloud/storage/transfer_manager.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -805,43 +805,60 @@ def download_many_to_path(
805805
806806
:raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
807807
808-
:rtype: list
808+
:rtype: List[None|Exception|UserWarning]
809809
:returns: A list of results corresponding to, in order, each item in the
810-
input list. If an exception was received, it will be the result
811-
for that operation. Otherwise, the return value from the successful
812-
download method is used (which will be None).
810+
input list. If an exception was received or a download was skipped
811+
(e.g., due to existing file or path traversal), it will be the result
812+
for that operation (as an Exception or UserWarning, respectively).
813+
Otherwise, the result will be None for a successful download.
813814
"""
815+
results = [None] * len(blob_names)
814816
blob_file_pairs = []
817+
indices_to_process = []
815818

816-
for blob_name in blob_names:
819+
for i, blob_name in enumerate(blob_names):
817820
full_blob_name = blob_name_prefix + blob_name
818821
resolved_path = _resolve_path(destination_directory, blob_name)
819822
if not resolved_path.parent.is_relative_to(
820823
Path(destination_directory).resolve()
821824
):
822-
warnings.warn(
825+
msg = (
823826
f"The blob {blob_name} will **NOT** be downloaded. "
824827
f"The resolved destination_directory - {resolved_path.parent} - is either invalid or "
825828
f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`"
826829
)
830+
warnings.warn(msg)
831+
results[i] = UserWarning(msg)
827832
continue
828833

829834
resolved_path = str(resolved_path)
835+
if skip_if_exists and os.path.isfile(resolved_path):
836+
msg = f"The blob {blob_name} is skipped because destination file already exists"
837+
results[i] = UserWarning(msg)
838+
continue
839+
830840
if create_directories:
831841
directory, _ = os.path.split(resolved_path)
832842
os.makedirs(directory, exist_ok=True)
833843
blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path))
844+
indices_to_process.append(i)
834845

835-
return download_many(
846+
many_results = download_many(
836847
blob_file_pairs,
837848
download_kwargs=download_kwargs,
838849
deadline=deadline,
839850
raise_exception=raise_exception,
840851
worker_type=worker_type,
841852
max_workers=max_workers,
842-
skip_if_exists=skip_if_exists,
853+
skip_if_exists=False, # skip_if_exists is handled in the loop above
843854
)
844855

856+
for meta_index, result in zip(indices_to_process, many_results):
857+
results[meta_index] = result
858+
859+
return results
860+
861+
845862

846863
def download_chunks_concurrently(
847864
blob,

tests/system/test_transfer_manager.py

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,9 @@ def test_download_many_to_path_skips_download(
187187
[str(warning.message) for warning in w]
188188
)
189189

190-
# 1 total - 1 skipped = 0 results
191-
assert len(results) == 0
190+
# 1 total - 1 skipped = 1 result (containing Warning)
191+
assert len(results) == 1
192+
assert isinstance(results[0], UserWarning)
192193

193194

194195
@pytest.mark.parametrize(
@@ -266,6 +267,87 @@ def test_download_many_to_path_downloads_within_dest_dir(
266267
assert downloaded_contents == source_contents
267268

268269

270+
271+
def test_download_many_to_path_mixed_results(
272+
shared_bucket, file_data, blobs_to_delete
273+
):
274+
"""
275+
Test download_many_to_path with successful downloads, skip_if_exists skips, and path traversal skips.
276+
"""
277+
PREFIX = "mixed_results/"
278+
BLOBNAMES = [
279+
"success1.txt",
280+
"success2.txt",
281+
"exists.txt",
282+
"../escape.txt"
283+
]
284+
285+
FILE_BLOB_PAIRS = [
286+
(
287+
file_data["simple"]["path"],
288+
shared_bucket.blob(PREFIX + name),
289+
)
290+
for name in BLOBNAMES
291+
]
292+
293+
results = transfer_manager.upload_many(
294+
FILE_BLOB_PAIRS,
295+
skip_if_exists=True,
296+
deadline=DEADLINE,
297+
)
298+
for result in results:
299+
assert result is None
300+
301+
blobs = list(shared_bucket.list_blobs(prefix=PREFIX))
302+
blobs_to_delete.extend(blobs)
303+
assert len(blobs) == 4
304+
305+
# Actual Test
306+
with tempfile.TemporaryDirectory() as tempdir:
307+
existing_file_path = os.path.join(tempdir, "exists.txt")
308+
with open(existing_file_path, "w") as f:
309+
f.write("already here")
310+
311+
import warnings
312+
with warnings.catch_warnings(record=True) as w:
313+
warnings.simplefilter("always")
314+
results = transfer_manager.download_many_to_path(
315+
shared_bucket,
316+
BLOBNAMES,
317+
destination_directory=tempdir,
318+
blob_name_prefix=PREFIX,
319+
deadline=DEADLINE,
320+
create_directories=True,
321+
skip_if_exists=True,
322+
)
323+
324+
assert len(results) == 4
325+
326+
path_traversal_warnings = [
327+
warning
328+
for warning in w
329+
if str(warning.message).startswith("The blob ")
330+
and "will **NOT** be downloaded. The resolved destination_directory"
331+
in str(warning.message)
332+
]
333+
assert len(path_traversal_warnings) == 1, "---".join(
334+
[str(warning.message) for warning in w]
335+
)
336+
337+
assert results[0] is None
338+
assert results[1] is None
339+
assert isinstance(results[2], UserWarning)
340+
assert "skipped because destination file already exists" in str(results[2])
341+
assert isinstance(results[3], UserWarning)
342+
assert "will **NOT** be downloaded" in str(results[3])
343+
344+
assert os.path.exists(os.path.join(tempdir, "success1.txt"))
345+
assert os.path.exists(os.path.join(tempdir, "success2.txt"))
346+
347+
with open(existing_file_path, "r") as f:
348+
assert f.read() == "already here"
349+
350+
269351
def test_download_many(listable_bucket):
270352
blobs = list(listable_bucket.list_blobs())
271353
with tempfile.TemporaryDirectory() as tempdir:

tests/unit/test_transfer_manager.py

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -530,9 +530,10 @@ def test_download_many_to_path():
530530
]
531531

532532
with mock.patch(
533-
"google.cloud.storage.transfer_manager.download_many"
533+
"google.cloud.storage.transfer_manager.download_many",
534+
return_value=[FAKE_RESULT] * len(BLOBNAMES),
534535
) as mock_download_many:
535-
transfer_manager.download_many_to_path(
536+
results = transfer_manager.download_many_to_path(
536537
bucket,
537538
BLOBNAMES,
538539
destination_directory=PATH_ROOT,
@@ -553,11 +554,71 @@ def test_download_many_to_path():
553554
raise_exception=True,
554555
max_workers=MAX_WORKERS,
555556
worker_type=WORKER_TYPE,
556-
skip_if_exists=True,
557+
skip_if_exists=False,
557558
)
559+
assert results == [FAKE_RESULT] * len(BLOBNAMES)
558560
for blobname in BLOBNAMES:
559561
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)
560562

563+
def test_download_many_to_path_with_skip_if_exists():
564+
bucket = mock.Mock()
565+
566+
BLOBNAMES = ["file_a.txt", "file_b.txt", "dir_a/file_c.txt"]
567+
PATH_ROOT = "mypath/"
568+
BLOB_NAME_PREFIX = "myprefix/"
569+
DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"}
570+
MAX_WORKERS = 7
571+
DEADLINE = 10
572+
WORKER_TYPE = transfer_manager.THREAD
573+
574+
from google.cloud.storage.transfer_manager import _resolve_path
575+
576+
existing_file = str(_resolve_path(PATH_ROOT, "file_a.txt"))
577+
578+
def isfile_side_effect(path):
579+
return path == existing_file
580+
581+
EXPECTED_BLOB_FILE_PAIRS = [
582+
(mock.ANY, str(_resolve_path(PATH_ROOT, "file_b.txt"))),
583+
(mock.ANY, str(_resolve_path(PATH_ROOT, "dir_a/file_c.txt"))),
584+
]
585+
586+
with mock.patch("os.path.isfile", side_effect=isfile_side_effect):
587+
with mock.patch(
588+
"google.cloud.storage.transfer_manager.download_many",
589+
return_value=[FAKE_RESULT, FAKE_RESULT],
590+
) as mock_download_many:
591+
results = transfer_manager.download_many_to_path(
592+
bucket,
593+
BLOBNAMES,
594+
destination_directory=PATH_ROOT,
595+
blob_name_prefix=BLOB_NAME_PREFIX,
596+
download_kwargs=DOWNLOAD_KWARGS,
597+
deadline=DEADLINE,
598+
create_directories=False,
599+
raise_exception=True,
600+
max_workers=MAX_WORKERS,
601+
worker_type=WORKER_TYPE,
602+
skip_if_exists=True,
603+
)
604+
605+
mock_download_many.assert_called_once_with(
606+
EXPECTED_BLOB_FILE_PAIRS,
607+
download_kwargs=DOWNLOAD_KWARGS,
608+
deadline=DEADLINE,
609+
raise_exception=True,
610+
max_workers=MAX_WORKERS,
611+
worker_type=WORKER_TYPE,
612+
skip_if_exists=False,
613+
)
614+
615+
assert len(results) == 3
616+
assert isinstance(results[0], UserWarning)
617+
assert str(results[0]) == "The blob file_a.txt is skipped because destination file already exists"
618+
assert results[1] == FAKE_RESULT
619+
assert results[2] == FAKE_RESULT
620+
621+
561622

562623
@pytest.mark.parametrize(
563624
"blobname",
@@ -584,9 +645,10 @@ def test_download_many_to_path_skips_download(blobname):
584645
with warnings.catch_warnings(record=True) as w:
585646
warnings.simplefilter("always")
586647
with mock.patch(
587-
"google.cloud.storage.transfer_manager.download_many"
648+
"google.cloud.storage.transfer_manager.download_many",
649+
return_value=[],
588650
) as mock_download_many:
589-
transfer_manager.download_many_to_path(
651+
results = transfer_manager.download_many_to_path(
590652
bucket,
591653
BLOBNAMES,
592654
destination_directory=PATH_ROOT,
@@ -614,8 +676,10 @@ def test_download_many_to_path_skips_download(blobname):
614676
raise_exception=True,
615677
max_workers=MAX_WORKERS,
616678
worker_type=WORKER_TYPE,
617-
skip_if_exists=True,
679+
skip_if_exists=False,
618680
)
681+
assert len(results) == 1
682+
assert isinstance(results[0], UserWarning)
619683

620684

621685
@pytest.mark.parametrize(
@@ -649,9 +713,10 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
649713
]
650714

651715
with mock.patch(
652-
"google.cloud.storage.transfer_manager.download_many"
716+
"google.cloud.storage.transfer_manager.download_many",
717+
return_value=[FAKE_RESULT],
653718
) as mock_download_many:
654-
transfer_manager.download_many_to_path(
719+
results = transfer_manager.download_many_to_path(
655720
bucket,
656721
BLOBNAMES,
657722
destination_directory=PATH_ROOT,
@@ -672,8 +737,9 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
672737
raise_exception=True,
673738
max_workers=MAX_WORKERS,
674739
worker_type=WORKER_TYPE,
675-
skip_if_exists=True,
740+
skip_if_exists=False,
676741
)
742+
assert results == [FAKE_RESULT]
677743
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)
678744

679745

0 commit comments

Comments
 (0)