Skip to content

Commit a0657e1

Browse files
ArrayRecord Team authored and
copybara-github committed
Update ArrayRecord Reader to allow reading from GCS directly through Riegeli
PiperOrigin-RevId: 797434847
1 parent d0ed18b commit a0657e1

File tree

3 files changed

+30
-10
lines changed

3 files changed

+30
-10
lines changed

MODULE.bazel

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,17 @@ module(
2121
repo_name = "com_google_array_record",
2222
)
2323

24-
bazel_dep(name = "rules_proto", version = "7.0.2")
25-
bazel_dep(name = "rules_python", version = "1.0.0")
24+
bazel_dep(name = "rules_proto", version = "7.1.0")
25+
bazel_dep(name = "rules_python", version = "1.4.1")
2626
bazel_dep(name = "platforms", version = "0.0.11")
2727
bazel_dep(name = "protobuf", version = "31.1")
2828
bazel_dep(name = "googletest", version = "1.15.2")
29-
bazel_dep(name = "abseil-cpp", version = "20250127.0")
29+
bazel_dep(name = "abseil-cpp", version = "20250127.1")
3030
bazel_dep(name = "abseil-py", version = "2.1.0")
3131
bazel_dep(name = "eigen", version = "3.4.0.bcr.3")
32-
bazel_dep(name = "riegeli", version = "0.0.0-20241218-3385e3c")
32+
bazel_dep(name = "riegeli", version = "0.0.0-20250717-5b2e77e")
3333
bazel_dep(name = "pybind11_bazel", version = "2.12.0")
34+
bazel_dep(name = "google_cloud_cpp", version = "3.0.0-rc0")
3435

3536
SUPPORTED_PYTHON_VERSIONS = [
3637
"3.10",

python/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ pybind_extension(
2020
"@riegeli//riegeli/base:initializer",
2121
"@riegeli//riegeli/bytes:fd_reader",
2222
"@riegeli//riegeli/bytes:fd_writer",
23+
"@riegeli//riegeli/gcs:gcs_object",
24+
"@riegeli//riegeli/gcs:gcs_reader",
2325
],
2426
)
2527

python/array_record_module.cc

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ limitations under the License.
2222
#include <vector>
2323

2424
#include "absl/status/status.h"
25+
#include "absl/strings/match.h"
2526
#include "absl/strings/str_format.h"
2627
#include "absl/strings/string_view.h"
2728
#include "cpp/array_record_reader.h"
@@ -34,6 +35,8 @@ limitations under the License.
3435
#include "riegeli/base/maker.h"
3536
#include "riegeli/bytes/fd_reader.h"
3637
#include "riegeli/bytes/fd_writer.h"
38+
#include "riegeli/gcs/gcs_object.h"
39+
#include "riegeli/gcs/gcs_reader.h"
3740

3841
namespace py = pybind11;
3942

@@ -50,10 +53,13 @@ PYBIND11_MODULE(array_record_module, m) {
5053
throw py::value_error(
5154
std::string(status_or_option.status().message()));
5255
}
56+
riegeli::FdWriterBase::Options file_writer_options;
57+
file_writer_options.set_buffer_size(size_t{16} << 20);
5358
// Release the GIL because IO is time consuming.
5459
py::gil_scoped_release scoped_release;
5560
return new array_record::ArrayRecordWriter(
56-
riegeli::Maker<riegeli::FdWriter>(path),
61+
riegeli::Maker<riegeli::FdWriter>(
62+
path, std::move(file_writer_options)),
5763
status_or_option.value());
5864
}),
5965
py::arg("path"), py::arg("options") = "")
@@ -84,18 +90,29 @@ PYBIND11_MODULE(array_record_module, m) {
8490
std::string(status_or_option.status().message()));
8591
}
8692
riegeli::FdReaderBase::Options file_reader_options;
93+
riegeli::GcsReader::Options gcs_reader_options;
8794
if (kwargs.contains("file_reader_buffer_size")) {
8895
auto file_reader_buffer_size =
8996
kwargs["file_reader_buffer_size"].cast<int64_t>();
9097
file_reader_options.set_buffer_size(file_reader_buffer_size);
98+
gcs_reader_options.set_buffer_size(file_reader_buffer_size);
9199
}
92100
// Release the GIL because IO is time consuming.
93101
py::gil_scoped_release scoped_release;
94-
return new array_record::ArrayRecordReader(
95-
riegeli::Maker<riegeli::FdReader>(
96-
path, std::move(file_reader_options)),
97-
status_or_option.value(),
98-
array_record::ArrayRecordGlobalPool());
102+
if (absl::StartsWith(path, "gs://")) {
103+
return new array_record::ArrayRecordReader(
104+
riegeli::Maker<riegeli::GcsReader>(
105+
google::cloud::storage::Client(),
106+
riegeli::GcsObject(path), std::move(gcs_reader_options)),
107+
status_or_option.value(),
108+
array_record::ArrayRecordGlobalPool());
109+
} else {
110+
return new array_record::ArrayRecordReader(
111+
riegeli::Maker<riegeli::FdReader>(
112+
path, std::move(file_reader_options)),
113+
status_or_option.value(),
114+
array_record::ArrayRecordGlobalPool());
115+
}
99116
}),
100117
py::arg("path"), py::arg("options") = "", R"(
101118
ArrayRecordReader for fast sequential or random access.

0 commit comments

Comments
 (0)