forked from inveniosoftware/invenio-s3
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Implementation of multipart upload, as described in RFC 0072 * See inveniosoftware/rfcs#91 Co-authored-by: Mirek Simek <[email protected]>
- Loading branch information
1 parent
a18ae9e
commit 5402d6b
Showing
5 changed files
with
304 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,3 +60,5 @@ target/ | |
|
||
# Vim swapfiles | ||
.*.sw? | ||
|
||
.vscode/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2024 Miroslav Simek | ||
# | ||
# Invenio-S3 is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
|
||
"""Low level client for S3 multipart uploads.""" | ||
|
||
import datetime | ||
|
||
# WARNING: low-level code. The underlying s3fs currently does not have support | ||
# for multipart uploads without keeping the S3File instance in memory between requests. | ||
# To overcome this limitation, we have to use the low-level API directly separated in the | ||
# LowLevelS3File class. | ||
|
||
|
||
class MultipartS3File:
    """Low level client for S3 multipart uploads.

    WARNING: low-level code. The underlying s3fs currently does not have
    support for multipart uploads without keeping the S3File instance in
    memory between requests, so this class calls the boto3 S3 client
    directly.
    """

    def __init__(self, fs, path, upload_id=None):
        """Initialize the low level client.

        :param fs: S3FS instance
        :param path: The path of the file (with bucket and version)
        :param upload_id: The upload ID of the multipart upload,
            can be None to start a new upload.
        """
        self.fs = fs
        self.path = path
        self.bucket, self.key, self.version_id = fs.split_path(path)
        self.s3_client = fs.s3
        # Propagate the ACL configured on the filesystem, if any.
        self.acl = fs.s3_additional_kwargs.get("ACL", "")
        self.upload_id = upload_id

    def create_multipart_upload(self):
        """Create a new multipart upload and remember its upload ID.

        :returns: The upload ID of the multipart upload.
        """
        mpu = self.s3_client.create_multipart_upload(
            Bucket=self.bucket, Key=self.key, ACL=self.acl
        )
        # TODO: error handling here
        self.upload_id = mpu["UploadId"]
        return self.upload_id

    def get_parts(self, max_parts):
        """List the parts of the multipart upload.

        :param max_parts: The maximum number of parts to list.
        :returns: The list of parts, including checksums and etags.
        """
        ret = self.s3_client.list_parts(
            Bucket=self.bucket,
            Key=self.key,
            UploadId=self.upload_id,
            MaxParts=max_parts,
            PartNumberMarker=0,
        )
        return ret.get("Parts", [])

    def upload_part(self, part_number, data):
        """Upload a part of the multipart upload. Will be used only in tests.

        :param part_number: The (1-based) part number.
        :param data: The data to upload.
        :returns: The raw ``upload_part`` response (contains the ETag).
        """
        return self.s3_client.upload_part(
            Bucket=self.bucket,
            Key=self.key,
            UploadId=self.upload_id,
            PartNumber=part_number,
            Body=data,
        )

    def _complete_operation_part_parameters(self, part):
        """Keep only the part fields accepted by ``complete_multipart_upload``.

        ``list_parts`` entries carry extra fields (e.g. Size, LastModified)
        that the complete call rejects, so they are filtered out here.
        """
        allowed = (
            "PartNumber",
            "ETag",
            "ChecksumCRC32",
            "ChecksumCRC32C",
            "ChecksumSHA1",
            "ChecksumSHA256",
        )
        return {key: part[key] for key in allowed if key in part}

    def get_part_links(self, max_parts, url_expiration):
        """Generate pre-signed URLs for the parts of the multipart upload.

        :param max_parts: The maximum number of parts to list.
        :param url_expiration: The expiration time of the URLs in seconds.
        :returns: The list of parts with pre-signed URLs and expiration times.
        """
        # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC
        # timestamp instead and keep the previous "...Z" wire format.
        expiration = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(
            seconds=url_expiration
        )
        expiration = (
            expiration.replace(microsecond=0).isoformat().replace("+00:00", "Z")
        )

        return {
            "parts": [
                {
                    "part": part + 1,  # S3 part numbers are 1-based
                    "url": self.s3_client.generate_presigned_url(
                        "upload_part",
                        Params={
                            "Bucket": self.bucket,
                            "Key": self.key,
                            "UploadId": self.upload_id,
                            "PartNumber": part + 1,
                        },
                        ExpiresIn=url_expiration,
                    ),
                    "expiration": expiration,
                }
                for part in range(max_parts)
            ]
        }

    def complete_multipart_upload(self, parts):
        """Complete the multipart upload.

        :param parts: The list of parts (as from :meth:`get_parts`),
            including checksums and etags.
        :returns: The raw ``complete_multipart_upload`` response.
        """
        return self.s3_client.complete_multipart_upload(
            Bucket=self.bucket,
            Key=self.key,
            UploadId=self.upload_id,
            MultipartUpload={
                "Parts": [
                    self._complete_operation_part_parameters(part) for part in parts
                ]
            },
        )

    def abort_multipart_upload(self):
        """Abort the multipart upload."""
        return self.s3_client.abort_multipart_upload(
            Bucket=self.bucket, Key=self.key, UploadId=self.upload_id
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import pytest | ||
|
||
MB = 2**20 | ||
|
||
|
||
def test_multipart_flow(base_app, s3fs):
    """End-to-end multipart flow: initialize, links, upload parts, commit, read back."""
    part_size = 7 * MB
    last_part_size = 5 * MB

    # initialize the upload
    upload_metadata = dict(
        parts=2, part_size=part_size, size=part_size + last_part_size
    )
    upload_metadata |= s3fs.multipart_initialize_upload(**upload_metadata) or {}

    # can not commit just now because no parts were uploaded
    with pytest.raises(ValueError):
        s3fs.multipart_commit_upload(**upload_metadata)

    # check that pre-signed links are generated for both parts
    links = s3fs.multipart_links(**upload_metadata)["parts"]
    assert len(links) == 2
    assert links[0]["part"] == 1
    assert "url" in links[0]
    assert links[1]["part"] == 2
    assert "url" in links[1]

    # upload the first part manually through the low-level file wrapper
    multipart_file = s3fs.multipart_file(upload_metadata["uploadId"])
    multipart_file.upload_part(1, b"0" * part_size)
    assert len(multipart_file.get_parts(2)) == 1

    # still can not commit because not all parts were uploaded
    with pytest.raises(ValueError):
        s3fs.multipart_commit_upload(**upload_metadata)

    # upload the second part
    multipart_file.upload_part(2, b"1" * last_part_size)
    assert len(multipart_file.get_parts(2)) == 2

    s3fs.multipart_commit_upload(**upload_metadata)

    # Bug fix: close the handle deterministically instead of leaking it
    # (the original called s3fs.open("rb").read() without closing).
    with s3fs.open("rb") as uploaded:
        assert uploaded.read() == b"0" * part_size + b"1" * last_part_size
|
||
|
||
def test_multipart_abort(base_app, s3fs):
    """Aborting a freshly initialized multipart upload succeeds."""
    first_size = 7 * MB
    second_size = 5 * MB

    # Start a new multipart upload and merge in whatever metadata it returns.
    upload_metadata = dict(
        parts=2, part_size=first_size, size=first_size + second_size
    )
    upload_metadata |= s3fs.multipart_initialize_upload(**upload_metadata) or {}

    s3fs.multipart_abort_upload(**upload_metadata)
|
||
|
||
def test_set_content_not_supported(base_app, s3fs):
    """Setting part content directly is not implemented for the S3 backend."""
    first_size = 7 * MB
    second_size = 5 * MB

    # Start a new multipart upload and merge in whatever metadata it returns.
    upload_metadata = dict(
        parts=2, part_size=first_size, size=first_size + second_size
    )
    upload_metadata |= s3fs.multipart_initialize_upload(**upload_metadata) or {}

    with pytest.raises(NotImplementedError):
        s3fs.multipart_set_content(
            1, b"0" * first_size, first_size, **upload_metadata
        )