Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions be/src/storage/segment/variant/variant_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1205,6 +1205,10 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, ColumnMetaAcce
_statistics->subcolumns_non_null_size.emplace(relative_path.get_path(),
column_pb.none_null_size());
}
// 3.1.2 may store a flat JSON key like {"a.b": 1} as a single PathInData part.
// New compaction schema and query path expect a dot-split multi-part shape.
// Normalize here before inserting into the meta tree to match that shape.
relative_path.normalize_legacy_flat_dot_key();
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

relative_path = PathInData(relative_path.get_path()); 这样可以吗

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

会丢掉typed

_subcolumns_meta_info->add(
relative_path,
SubcolumnMeta {
Expand Down
30 changes: 30 additions & 0 deletions be/src/util/json/path_in_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,36 @@ PathInData PathInData::copy_pop_back() const {
return new_path;
}

void PathInData::normalize_legacy_flat_dot_key() {
// Single pass: bail out on any part carrying non-default metadata
// (is_nested / anonymous_array_level > 0) -- those would be lost by
// the rewrite below. Also note whether any key still has a dot.
bool has_dot_in_key = false;
for (const Part& part : parts) {
if (part.is_nested || part.anonymous_array_level != 0) {
return;
}
if (!has_dot_in_key && part.key.find('.') != std::string_view::npos) {
has_dot_in_key = true;
}
}
if (!has_dot_in_key) {
return; // already well-formed
}

// re-splitting it yields the dot-normalized parts.
parts.clear();
const char* begin = path.data();
const char* end = path.data() + path.size();
for (const char* it = begin; it != end; ++it) {
if (*it == '.') {
parts.emplace_back(std::string_view {begin, static_cast<size_t>(it - begin)}, false, 0);
begin = it + 1;
}
}
parts.emplace_back(std::string_view {begin, static_cast<size_t>(end - begin)}, false, 0);
}

PathInData PathInData::copy_pop_nfront(size_t n) const {
if (n >= parts.size()) {
return {};
Expand Down
4 changes: 4 additions & 0 deletions be/src/util/json/path_in_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ class PathInData {
PathInData copy_pop_front() const;
PathInData copy_pop_nfront(size_t n) const;
PathInData copy_pop_back() const;

/// Re-split `path` on dots to flatten legacy parts whose key still contains a dot.
/// E.g. 3.1.2 stored `{"a.b":1}` as one part `"a.b"`, this method rewrites it into two parts `"a"` / `"b"`.
void normalize_legacy_flat_dot_key();
static bool try_strip_prefix(const std::string& name, const std::string& prefix_dot,
std::string* out);
static PathInData append(const PathInData& base, std::string_view suffix);
Expand Down
266 changes: 266 additions & 0 deletions be/test/storage/segment/variant_column_writer_reader_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,272 @@ static std::vector<std::string> normalize_json_rows(const std::vector<std::strin
return normalized;
}

// Direct unit tests for PathInData::normalize_legacy_flat_dot_key().
// Covers every shape the legacy cloud-4.1.2 JSON parser / PathInDataBuilder
// could produce for dot-containing keys.
TEST_F(VariantColumnWriterReaderTest, test_normalize_legacy_flat_dot_key_shapes) {
// Helper: build a PathInData from a list of (key, is_nested, anon_level).
auto make_path = [](std::vector<std::tuple<std::string_view, bool, UInt8>> spec,
bool is_typed = false) {
PathInData::Parts parts;
for (auto& [key, nested, level] : spec) {
parts.emplace_back(key, nested, level);
}
PathInData p(parts);
if (is_typed) {
// Round-trip via string_view constructor preserves is_typed flag.
p = PathInData(p.get_path(), true);
// Re-apply metadata we want (none for the typed test case).
}
return p;
};

// 1. {"a.b": 1} -> single part "a.b" -> split into [a, b]
{
auto p = make_path({{"a.b", false, 0}});
ASSERT_EQ(p.get_parts().size(), 1U);
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 2U);
EXPECT_EQ(p.get_parts()[0].key, "a");
EXPECT_EQ(p.get_parts()[1].key, "b");
EXPECT_EQ(p.get_path(), "a.b"); // path string unchanged
}

// 2. {"a": {"b.c": 1}} -> parts [("a"), ("b.c")] -> split into [a, b, c]
{
auto p = make_path({{"a", false, 0}, {"b.c", false, 0}});
ASSERT_EQ(p.get_parts().size(), 2U);
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 3U);
EXPECT_EQ(p.get_parts()[0].key, "a");
EXPECT_EQ(p.get_parts()[1].key, "b");
EXPECT_EQ(p.get_parts()[2].key, "c");
}

// 3. {"a.b": {"c.d": 1}} -> parts [("a.b"), ("c.d")] -> split into [a, b, c, d]
{
auto p = make_path({{"a.b", false, 0}, {"c.d", false, 0}});
ASSERT_EQ(p.get_parts().size(), 2U);
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 4U);
EXPECT_EQ(p.get_parts()[0].key, "a");
EXPECT_EQ(p.get_parts()[1].key, "b");
EXPECT_EQ(p.get_parts()[2].key, "c");
EXPECT_EQ(p.get_parts()[3].key, "d");
}

// 4. {"x": {"y": {"a.b": 1}}} -> parts [("x"), ("y"), ("a.b")] -> [x, y, a, b]
{
auto p = make_path({{"x", false, 0}, {"y", false, 0}, {"a.b", false, 0}});
ASSERT_EQ(p.get_parts().size(), 3U);
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 4U);
EXPECT_EQ(p.get_parts()[0].key, "x");
EXPECT_EQ(p.get_parts()[1].key, "y");
EXPECT_EQ(p.get_parts()[2].key, "a");
EXPECT_EQ(p.get_parts()[3].key, "b");
}

// 5. {"a.b.c.d": 1} -> single part "a.b.c.d" -> [a, b, c, d]
{
auto p = make_path({{"a.b.c.d", false, 0}});
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 4U);
EXPECT_EQ(p.get_parts()[0].key, "a");
EXPECT_EQ(p.get_parts()[1].key, "b");
EXPECT_EQ(p.get_parts()[2].key, "c");
EXPECT_EQ(p.get_parts()[3].key, "d");
}

// 6. Bail-out: a part marked is_nested=true -> untouched
{
auto p = make_path({{"a.b", true, 0}});
ASSERT_EQ(p.get_parts().size(), 1U);
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 1U);
EXPECT_EQ(p.get_parts()[0].key, "a.b");
EXPECT_TRUE(p.get_parts()[0].is_nested);
}

// 7. Bail-out: a part with anonymous_array_level > 0 -> untouched
{
auto p = make_path({{"a.b", false, 2}});
ASSERT_EQ(p.get_parts().size(), 1U);
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 1U);
EXPECT_EQ(p.get_parts()[0].key, "a.b");
EXPECT_EQ(p.get_parts()[0].anonymous_array_level, 2);
}

// 8. Bail-out: nested on inner part with dot on outer -> untouched
{
auto p = make_path({{"a.b", false, 0}, {"c", true, 0}});
auto parts_before = p.get_parts().size();
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), parts_before);
EXPECT_EQ(p.get_parts()[0].key, "a.b");
EXPECT_EQ(p.get_parts()[1].key, "c");
}

// 9. No-op: regular multi-part without dots in keys -> unchanged
{
auto p = make_path({{"a", false, 0}, {"b", false, 0}});
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 2U);
EXPECT_EQ(p.get_parts()[0].key, "a");
EXPECT_EQ(p.get_parts()[1].key, "b");
}

// 10. No-op: single part without dot -> unchanged
{
auto p = make_path({{"hello", false, 0}});
p.normalize_legacy_flat_dot_key();
ASSERT_EQ(p.get_parts().size(), 1U);
EXPECT_EQ(p.get_parts()[0].key, "hello");
}

// 11. is_typed is preserved across normalization
{
PathInData typed(std::string_view("a.b"), /*is_typed=*/true);
ASSERT_TRUE(typed.get_is_typed());
// typed is already 2 parts (from string_view ctor), but normalize
// must still preserve is_typed.
typed.normalize_legacy_flat_dot_key();
EXPECT_TRUE(typed.get_is_typed());
}
}

// Regression test for legacy flat-dot-key compatibility.
//
// Old versions (e.g. cloud-4.1.2 with variant_max_subcolumns_count=0) stored
// a flat JSON key like {"a.b": 1} as a single PathInData part "a.b" in the
// segment's ColumnPathInfo protobuf. New master compaction schema builds
// query paths by splitting on dots (3+ parts including root), which does not
// match the 1-part tree node and causes silent data loss during compaction.
//
// This test writes a normal variant segment via the writer, then *mutates*
// the resulting footer to turn a subcolumn's `column_path_info` into the
// legacy 1-part form, then calls `VariantColumnReader::init()` and verifies
// that the normalization inside init() rebuilds a multi-level tree that can
// be queried via both `get_subcolumn_meta_by_path` and prefix-path lookup.
TEST_F(VariantColumnWriterReaderTest, test_legacy_flat_dot_key_reader_init) {
// 1. create tablet_schema with a variant column that has nested subcolumns
TabletSchemaPB schema_pb;
schema_pb.set_keys_type(KeysType::DUP_KEYS);
construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", /*max_subcolumns=*/10);
_tablet_schema = std::make_shared<TabletSchema>();
_tablet_schema->init_from_pb(schema_pb);

// 2. create tablet
TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
_tablet_schema->set_external_segment_meta_used_default(false);
tablet_meta->_tablet_id = 20000;
_tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get());
EXPECT_TRUE(_tablet->init().ok());
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());

// 3. create file_writer
io::FileWriterPtr file_writer;
auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0);
auto st = io::global_local_filesystem()->create_file(file_path, &file_writer);
EXPECT_TRUE(st.ok()) << st.msg();

// 4. create column_writer
SegmentFooterPB footer;
ColumnWriterOptions opts;
opts.meta = footer.add_columns();
opts.compression_type = CompressionTypePB::LZ4;
opts.file_writer = file_writer.get();
opts.footer = &footer;
RowsetWriterContext rowset_ctx;
rowset_ctx.write_type = DataWriteType::TYPE_DIRECT;
opts.rowset_ctx = &rowset_ctx;
opts.rowset_ctx->tablet_schema = _tablet_schema;
TabletColumn column = _tablet_schema->column(0);
_init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4);

std::unique_ptr<ColumnWriter> writer;
EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok());
EXPECT_TRUE(writer->init().ok());

// 5. write nested json so the writer naturally creates a subcolumn "a.b"
// with a 2-part path ["a", "b"].
std::vector<std::string> jsons;
const int kNumRows = 8;
for (int i = 0; i < kNumRows; ++i) {
jsons.push_back(R"({"a": {"b": "v)" + std::to_string(i) + R"("}})");
}
EXPECT_TRUE(append_json_batch(writer.get(), jsons).ok());
EXPECT_TRUE(writer->finish().ok());
EXPECT_TRUE(writer->write_data().ok());
EXPECT_TRUE(writer->write_ordinal_index().ok());
EXPECT_TRUE(writer->write_zone_map().ok());
EXPECT_TRUE(file_writer->close().ok());
footer.set_num_rows(kNumRows);

// 6. Locate the "V1.a.b" subcolumn in the footer and mutate its
// column_path_info into the legacy 1-part form: pb.path = "V1.a.b" but
// path_part_infos = [{"V1"}, {"a.b"}]. This is exactly what cloud-4.1.2
// wrote for JSON key {"a.b": ...}.
int target_idx = -1;
for (int i = 1; i < footer.columns_size(); ++i) {
const auto& col_meta = footer.columns(i);
if (!col_meta.has_column_path_info()) {
continue;
}
if (col_meta.column_path_info().path() == "v1.a.b") {
target_idx = i;
break;
}
}
ASSERT_GT(target_idx, 0) << "failed to locate subcolumn V1.a.b in footer";

auto* target_path_info = footer.mutable_columns(target_idx)->mutable_column_path_info();
target_path_info->clear_path_part_infos();
auto* root_part = target_path_info->add_path_part_infos();
root_part->set_key("v1");
root_part->set_is_nested(false);
root_part->set_anonymous_array_level(0);
auto* legacy_part = target_path_info->add_path_part_infos();
legacy_part->set_key("a.b"); // single legacy part containing a dot
legacy_part->set_is_nested(false);
legacy_part->set_anonymous_array_level(0);
target_path_info->set_has_nested(false);

// 7. Now initialize a fresh VariantColumnReader with the mutated footer.
// The init() path calls _subcolumns_meta_info->add() for each subcolumn;
// our fix normalizes the legacy 1-part relative path "a.b" into a
// 2-part path ["a", "b"] so the tree has root -> "a" -> "b".
io::FileReaderSPtr file_reader;
st = io::global_local_filesystem()->open_file(file_path, &file_reader);
ASSERT_TRUE(st.ok()) << st.msg();

std::shared_ptr<segment_v2::ColumnReader> column_reader;
st = create_variant_root_reader(footer, file_reader, _tablet_schema, &column_reader);
ASSERT_TRUE(st.ok()) << st.msg();
auto* variant_reader = assert_cast<segment_v2::VariantColumnReader*>(column_reader.get());
ASSERT_NE(variant_reader, nullptr);

// 8. Verify that queries against the normalized tree succeed.
// - Leaf lookup "a.b" (PathInData splits into 2 parts) should hit.
// - Intermediate lookup "a" should return the TUPLE parent, which
// has exactly one child "b".
const auto* leaf_node = variant_reader->get_subcolumn_meta_by_path(PathInData("a.b"));
ASSERT_NE(leaf_node, nullptr)
<< "normalized tree should be able to find leaf 'a.b' via multi-part query";
EXPECT_TRUE(leaf_node->is_scalar());
EXPECT_GE(leaf_node->data.footer_ordinal, 0);

const auto* subtree = variant_reader->get_subcolumns_meta_info();
ASSERT_NE(subtree, nullptr);
const auto* intermediate = subtree->find_exact(PathInData("a"));
ASSERT_NE(intermediate, nullptr)
<< "normalized tree should expose intermediate node 'a' as a TUPLE";
EXPECT_FALSE(intermediate->is_scalar());
EXPECT_EQ(intermediate->children.size(), 1U);
}

TEST_F(VariantColumnWriterReaderTest, test_statics) {
// VariantStatisticsPB stats_pb;
// auto* subcolumns_stats = stats_pb.mutable_sparse_column_non_null_size();
Expand Down
Loading