man-group · phoebusm · Nov 22, 2025 · Oct 30, 2025 · Nov 21, 2025 · phoebusm
@@ -555,6 +555,7 @@ set(arcticdb_srcs
         version/version_utils.cpp
         version/symbol_list.cpp
         version/version_map_batch_methods.cpp
+        version/version_tasks.cpp
         storage/s3/ec2_utils.cpp
 )
 

@@ -24,6 +24,27 @@ namespace arcticdb {
 
 using OutputFrame = std::variant<pipelines::PandasOutputFrame, ArrowOutputFrame>;
 
+/// One node frame returned alongside a ReadResult: the symbol it was read for, the frame as an OutputFrame
+/// (Pandas or Arrow), and the normalization metadata of that frame.
+/// NOTE(review): ARCTICDB_MOVE_ONLY_DEFAULT presumably defaults the move operations and deletes copying - confirm.
+struct ARCTICDB_VISIBILITY_HIDDEN NodeReadResult {
+    /// @param symbol     Copied into symbol_.
+    /// @param frame_data Moved into frame_data_.
+    /// @param norm_meta  Moved into norm_meta_.
+    NodeReadResult(
+            const StreamId& symbol, OutputFrame&& frame_data,
+            arcticdb::proto::descriptors::NormalizationMetadata&& norm_meta
+    ) :
+        symbol_(symbol),
+        frame_data_(std::move(frame_data)),
+        norm_meta_(std::move(norm_meta)) {}
+    StreamId symbol_;
+    OutputFrame frame_data_;
+    arcticdb::proto::descriptors::NormalizationMetadata norm_meta_;
+
+    ARCTICDB_MOVE_ONLY_DEFAULT(NodeReadResult)
+};
+
 struct ARCTICDB_VISIBILITY_HIDDEN ReadResult {
     ReadResult(
             const std::variant<VersionedItem, std::vector<VersionedItem>>& versioned_item, OutputFrame&& frame_data,
@@ -32,15 +47,15 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult {
                     arcticdb::proto::descriptors::UserDefinedMetadata,
                     std::vector<arcticdb::proto::descriptors::UserDefinedMetadata>>& user_meta,
             const arcticdb::proto::descriptors::UserDefinedMetadata& multi_key_meta,
-            std::vector<entity::AtomKey>&& multi_keys
+            std::vector<NodeReadResult>&& node_results = {}
     ) :
         item(versioned_item),
         frame_data(std::move(frame_data)),
         output_format(output_format),
         norm_meta(norm_meta),
         user_meta(user_meta),
         multi_key_meta(multi_key_meta),
-        multi_keys(std::move(multi_keys)) {}
+        node_results(std::move(node_results)) {}
     std::variant<VersionedItem, std::vector<VersionedItem>> item;
     OutputFrame frame_data;
     OutputFormat output_format;
@@ -50,71 +65,53 @@ struct ARCTICDB_VISIBILITY_HIDDEN ReadResult {
             std::vector<arcticdb::proto::descriptors::UserDefinedMetadata>>
             user_meta;
     arcticdb::proto::descriptors::UserDefinedMetadata multi_key_meta;
-    std::vector<entity::AtomKey> multi_keys;
+    std::vector<NodeReadResult> node_results;
 
     ARCTICDB_MOVE_ONLY_DEFAULT(ReadResult)
 };
 
-inline ReadResult create_python_read_result(
-        const std::variant<VersionedItem, std::vector<VersionedItem>>& version, OutputFormat output_format,
-        FrameAndDescriptor&& fd,
-        std::optional<std::vector<arcticdb::proto::descriptors::UserDefinedMetadata>>&& user_meta = std::nullopt
-) {
-    auto result = std::move(fd);
-
-    // If version is a vector then this was a multi-symbol join, so the user_meta vector should have a value
-    // Otherwise, there is a single piece of metadata on the frame descriptor
-    util::check(
-            std::holds_alternative<VersionedItem>(version) ^ user_meta.has_value(),
-            "Unexpected argument combination to create_python_read_result"
-    );
-
-    // Very old (pre Nov-2020) PandasIndex protobuf messages had no "start" or "step" fields. If is_physically_stored
-    // (renamed from is_not_range_index) was false, the index was always RangeIndex(num_rows, 1)
-    // This used to be handled in the Python layer by passing None to the DataFrame index parameter, which would then
-    // default to RangeIndex(num_rows, 1). However, the empty index also has is_physically_stored as false, and because
-    // integer protobuf fields default to zero if they are not present on the wire, it is impossible to tell from
-    // the normalization metadata alone if the data was written with an empty index, or with a very old range index.
-    // We therefore patch the normalization metadata here in this case
-    auto norm_meta = result.desc_.mutable_proto().mutable_normalization();
-    if (norm_meta->has_df() || norm_meta->has_series()) {
-        auto common = norm_meta->has_df() ? norm_meta->mutable_df()->mutable_common()
-                                          : norm_meta->mutable_series()->mutable_common();
-        if (common->has_index()) {
-            auto index = common->mutable_index();
-            if (result.desc_.index().type() == IndexDescriptor::Type::ROWCOUNT && !index->is_physically_stored() &&
-                index->start() == 0 && index->step() == 0) {
-                index->set_step(1);
-            }
-        }
-    }
-
-    auto python_frame = [&]() -> OutputFrame {
-        if (output_format == OutputFormat::ARROW) {
-            return ArrowOutputFrame{segment_to_arrow_data(result.frame_)};
-        } else {
-            return pipelines::PandasOutputFrame{result.frame_};
-        }
-    }();
-    util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__);
-
-    const auto& desc_proto = result.desc_.proto();
-    std::variant<
-            arcticdb::proto::descriptors::UserDefinedMetadata,
-            std::vector<arcticdb::proto::descriptors::UserDefinedMetadata>>
-            metadata;
-    if (user_meta.has_value()) {
-        metadata = std::move(*user_meta);
-    } else {
-        metadata = std::move(desc_proto.user_meta());
-    }
-    return {version,
-            std::move(python_frame),
-            output_format,
-            desc_proto.normalization(),
-            metadata,
-            desc_proto.multi_key_meta(),
-            std::move(result.keys_)};
-}
+namespace version_store {
+
+struct SymbolProcessingResult { // Aggregate: one versioned item with its user metadata, output schema and entity ids
+    VersionedItem versioned_item_;
+    proto::descriptors::UserDefinedMetadata metadata_;
+    OutputSchema output_schema_;
+    std::vector<EntityId> entity_ids_; // NOTE(review): presumably ids of entities produced for this symbol - confirm
+};
+
+struct ReadVersionOutput { // Owns one VersionedItem paired with a FrameAndDescriptor
+    ReadVersionOutput() = delete; // both members must be supplied through the two-argument constructor
+    ReadVersionOutput(VersionedItem&& versioned_item, FrameAndDescriptor&& frame_and_descriptor) :
+        versioned_item_(std::move(versioned_item)),
+        frame_and_descriptor_(std::move(frame_and_descriptor)) {}
+
+    ARCTICDB_MOVE_ONLY_DEFAULT(ReadVersionOutput) // presumably defaults moves and deletes copies - confirm
+
+    VersionedItem versioned_item_;
+    FrameAndDescriptor frame_and_descriptor_;
+};
+
+struct ReadVersionWithNodesOutput { // One root ReadVersionOutput plus zero or more node outputs
+    ReadVersionOutput root_;
+    std::vector<ReadVersionOutput> nodes_;
+};
+
+struct MultiSymbolReadOutput { // Several versioned items and metadatas together with a single frame and descriptor
+    MultiSymbolReadOutput() = delete; // all three members must be supplied through the constructor below
+    MultiSymbolReadOutput(
+            std::vector<VersionedItem>&& versioned_items,
+            std::vector<proto::descriptors::UserDefinedMetadata>&& metadatas, FrameAndDescriptor&& frame_and_descriptor
+    ) :
+        versioned_items_(std::move(versioned_items)),
+        metadatas_(std::move(metadatas)),
+        frame_and_descriptor_(std::move(frame_and_descriptor)) {}
+
+    ARCTICDB_MOVE_ONLY_DEFAULT(MultiSymbolReadOutput) // presumably defaults moves and deletes copies - confirm
+
+    std::vector<VersionedItem> versioned_items_;
+    std::vector<proto::descriptors::UserDefinedMetadata> metadatas_; // presumably parallel to versioned_items_ - confirm
+    FrameAndDescriptor frame_and_descriptor_;
+};
+} // namespace version_store
 
 } // namespace arcticdb
@@ -47,6 +47,97 @@ inline void apply_type_handlers(SegmentInMemory seg, std::any& handler_data, Out
     }
 }
 
+/// Builds the ReadResult handed to the Python bindings from a frame and its descriptor.
+///
+/// @param version       A single VersionedItem, or a vector of them when the frame is a multi-symbol join.
+/// @param output_format ARROW produces an ArrowOutputFrame; any other value produces a PandasOutputFrame.
+/// @param fd            Root frame and descriptor. Its normalization metadata may be patched (see below).
+/// @param user_meta     Must hold a value exactly when `version` holds a vector; otherwise the user metadata on
+///                      the frame descriptor is used.
+/// @param node_outputs  Extra frames; each one is converted into a NodeReadResult on the returned value.
+/// @return ReadResult with the converted root frame, its metadata and the converted node frames.
+inline ReadResult create_python_read_result(
+        const std::variant<VersionedItem, std::vector<VersionedItem>>& version, OutputFormat output_format,
+        FrameAndDescriptor&& fd,
+        std::optional<std::vector<arcticdb::proto::descriptors::UserDefinedMetadata>>&& user_meta = std::nullopt,
+        std::vector<version_store::ReadVersionOutput>&& node_outputs = {}
+) {
+    auto result = std::move(fd);
+
+    // If version is a vector then this was a multi-symbol join, so the user_meta vector should have a value
+    // Otherwise, there is a single piece of metadata on the frame descriptor
+    util::check(
+            std::holds_alternative<VersionedItem>(version) != user_meta.has_value(),
+            "Unexpected argument combination to create_python_read_result"
+    );
+
+    // Very old (pre Nov-2020) PandasIndex protobuf messages had no "start" or "step" fields. If is_physically_stored
+    // (renamed from is_not_range_index) was false, the index was always RangeIndex(num_rows, 1)
+    // This used to be handled in the Python layer by passing None to the DataFrame index parameter, which would then
+    // default to RangeIndex(num_rows, 1). However, the empty index also has is_physically_stored as false, and because
+    // integer protobuf fields default to zero if they are not present on the wire, it is impossible to tell from
+    // the normalization metadata alone if the data was written with an empty index, or with a very old range index.
+    // We therefore patch the normalization metadata here in this case
+    // The patch is applied to the root and to every node frame; it is idempotent (step is no longer 0 afterwards).
+    const auto patch_legacy_range_index = [](auto& frame_and_desc) {
+        auto norm_meta = frame_and_desc.desc_.mutable_proto().mutable_normalization();
+        if (!norm_meta->has_df() && !norm_meta->has_series()) {
+            return;
+        }
+        auto common = norm_meta->has_df() ? norm_meta->mutable_df()->mutable_common()
+                                          : norm_meta->mutable_series()->mutable_common();
+        if (!common->has_index()) {
+            return;
+        }
+        auto index = common->mutable_index();
+        const bool is_row_count = frame_and_desc.desc_.index().type() == IndexDescriptor::Type::ROWCOUNT;
+        if (is_row_count && !index->is_physically_stored() && index->start() == 0 && index->step() == 0) {
+            index->set_step(1);
+        }
+    };
+    patch_legacy_range_index(result);
+
+    const auto get_python_frame = [output_format](auto& frame_and_desc) -> OutputFrame {
+        if (output_format == OutputFormat::ARROW) {
+            return ArrowOutputFrame{segment_to_arrow_data(frame_and_desc.frame_)};
+        }
+        return pipelines::PandasOutputFrame{frame_and_desc.frame_};
+    };
+    auto python_frame = get_python_frame(result);
+    util::print_total_mem_usage(__FILE__, __LINE__, __FUNCTION__);
+
+    const auto& desc_proto = result.desc_.proto();
+    std::variant<
+            arcticdb::proto::descriptors::UserDefinedMetadata,
+            std::vector<arcticdb::proto::descriptors::UserDefinedMetadata>>
+            metadata;
+    if (user_meta.has_value()) {
+        metadata = std::move(*user_meta);
+    } else {
+        // desc_proto is a const reference, so this is a copy; wrapping it in std::move would be a silent no-op.
+        metadata = desc_proto.user_meta();
+    }
+
+    std::vector<NodeReadResult> node_results;
+    node_results.reserve(node_outputs.size());
+    for (auto& node_output : node_outputs) {
+        auto& node_fd = node_output.frame_and_descriptor_;
+        patch_legacy_range_index(node_fd);
+        auto node_python_frame = get_python_frame(node_fd);
+        auto node_norm_meta = node_fd.desc_.proto().normalization();
+        node_results.emplace_back(
+                node_output.versioned_item_.symbol(), std::move(node_python_frame), std::move(node_norm_meta)
+        );
+    }
+    return {version,
+            std::move(python_frame),
+            output_format,
+            desc_proto.normalization(),
+            metadata,
+            desc_proto.multi_key_meta(),
+            std::move(node_results)};
+}
+
 inline ReadResult read_result_from_single_frame(
         FrameAndDescriptor& frame_and_desc, const AtomKey& key, std::any& handler_data, OutputFormat output_format
 ) {

@@ -34,7 +34,10 @@ inline py::tuple adapt_read_df(ReadResult&& ret, std::any* const handler_data) {
             }
     );
     auto multi_key_meta = python_util::pb_to_python(ret.multi_key_meta);
-    return py::make_tuple(ret.item, std::move(ret.frame_data), pynorm, pyuser_meta, multi_key_meta, ret.multi_keys);
+    auto node_results = python_util::node_results_to_python_list(std::move(ret.node_results));
+    return py::make_tuple(
+            ret.item, std::move(ret.frame_data), pynorm, pyuser_meta, multi_key_meta, std::move(node_results)
+    );
 };
 
 } // namespace arcticdb
@@ -224,6 +224,18 @@ class PyTimestampRange {
     timestamp end_;
 };
 
+/// Builds a Python list holding one (symbol, frame, normalization metadata) tuple per node result.
+/// Each frame is moved out of `node_results`, so the elements must not be read again by the caller.
+inline py::list node_results_to_python_list(std::vector<NodeReadResult>&& node_results) {
+    py::list py_nodes;
+    for (auto& node : node_results) {
+        auto py_norm_meta = pb_to_python(node.norm_meta_);
+        auto py_node = py::make_tuple(node.symbol_, std::move(node.frame_data_), std::move(py_norm_meta));
+        py_nodes.append(std::move(py_node));
+    }
+    return py_nodes;
+}
+
 inline py::list adapt_read_dfs(std::vector<std::variant<ReadResult, DataError>>&& r, std::any* const handler) {
     auto ret = std::move(r);
     py::list lst;
@@ -232,22 +242,22 @@ inline py::list adapt_read_dfs(std::vector<std::variant<ReadResult, DataError>>&
         util::variant_match(
                 res,
                 [&lst, &output_format](ReadResult& read_result) {
-                    auto pynorm = python_util::pb_to_python(read_result.norm_meta);
+                    auto pynorm = pb_to_python(read_result.norm_meta);
                     util::check(
                             std::holds_alternative<proto::descriptors::UserDefinedMetadata>(read_result.user_meta),
                             "Expected single user metadata in adapt_read_dfs, received vector"
                     );
-                    auto pyuser_meta = python_util::pb_to_python(
-                            std::get<proto::descriptors::UserDefinedMetadata>(read_result.user_meta)
-                    );
-                    auto multi_key_meta = python_util::pb_to_python(read_result.multi_key_meta);
+                    auto pyuser_meta =
+                            pb_to_python(std::get<proto::descriptors::UserDefinedMetadata>(read_result.user_meta));
+                    auto multi_key_meta = pb_to_python(read_result.multi_key_meta);
+                    auto node_results = node_results_to_python_list(std::move(read_result.node_results));
                     lst.append(py::make_tuple(
                             read_result.item,
                             std::move(read_result.frame_data),
                             pynorm,
                             pyuser_meta,
                             multi_key_meta,
-                            read_result.multi_keys
+                            std::move(node_results)
                     ));
                     util::check(
                             !output_format.has_value() || output_format.value() == read_result.output_format,