From 3638ea433793c6d3a608fb7e95ff6fda42b725e3 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Thu, 9 Apr 2026 17:40:40 +0800 Subject: [PATCH 1/3] [fix](filecache) unify TTL expiration calculation and persist pending rowset timestamps The file cache TTL flow used inconsistent expiration rules across write, read, and warmup paths. Query paths clamped expired values to 0, while writer and some warmup paths could still produce non-zero absolute expiration timestamps. Pending rowsets also delayed persisting newest_write_timestamp, which let cache writes use a different time base from later reads and warmup tasks. This change adds a shared helper to calculate file cache expiration with validation, overflow protection, and expired-value clamping. It reuses the same logic in rowset read, rowset write, and cloud warmup paths so the same cache hash gets the same expiration_time consistently. This change also persists newest_write_timestamp into pending/prepared rowset meta during initialization when the context already provides a valid value. That keeps import/write, query, and warmup flows aligned on the same timestamp base and avoids generating different TTL cache directories for the same object. 
--- be/src/cloud/cloud_internal_service.cpp | 10 ++--- be/src/cloud/cloud_rowset_writer.cpp | 3 ++ be/src/cloud/cloud_tablet.cpp | 9 ++--- be/src/cloud/cloud_warm_up_manager.cpp | 10 ++--- be/src/io/cache/file_cache_expiration.h | 47 ++++++++++++++++++++++ be/src/olap/rowset/beta_rowset_reader.cpp | 10 ++--- be/src/olap/rowset/beta_rowset_writer.cpp | 3 ++ be/src/olap/rowset/rowset_writer_context.h | 6 +-- 8 files changed, 68 insertions(+), 30 deletions(-) create mode 100644 be/src/io/cache/file_cache_expiration.h diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index e453bc7ee443a2..48e8d6624de77f 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -30,6 +30,7 @@ #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" +#include "io/cache/file_cache_expiration.h" #include "runtime/thread_context.h" #include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" @@ -446,13 +447,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c << " us, tablet_id: " << rs_meta.tablet_id() << ", rowset_id: " << rowset_id.to_string(); } - int64_t expiration_time = - tablet_meta->ttl_seconds() == 0 || rs_meta.newest_write_timestamp() <= 0 - ? 
0 - : rs_meta.newest_write_timestamp() + tablet_meta->ttl_seconds(); - if (expiration_time <= UnixSeconds()) { - expiration_time = 0; - } + int64_t expiration_time = io::calc_file_cache_expiration_time( + rs_meta.newest_write_timestamp(), tablet_meta->ttl_seconds()); if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpTriggerSource::EVENT_DRIVEN)) { LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() diff --git a/be/src/cloud/cloud_rowset_writer.cpp b/be/src/cloud/cloud_rowset_writer.cpp index c5b58049ae42b4..de152ae8341391 100644 --- a/be/src/cloud/cloud_rowset_writer.cpp +++ b/be/src/cloud/cloud_rowset_writer.cpp @@ -65,6 +65,9 @@ Status CloudRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) if (_context.rowset_state == PREPARED || _context.rowset_state == COMMITTED) { _is_pending = true; _rowset_meta->set_load_id(_context.load_id); + if (_context.newest_write_timestamp > 0) { + _rowset_meta->set_newest_write_timestamp(_context.newest_write_timestamp); + } } else { // Rowset generated by compaction or schema change _rowset_meta->set_version(_context.version); diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 75f3218c5f3cda..99ecc4d8908830 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -47,6 +47,7 @@ #include "cpp/sync_point.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" +#include "io/cache/file_cache_expiration.h" #include "olap/base_tablet.h" #include "olap/compaction.h" #include "olap/cumulative_compaction_time_series_policy.h" @@ -445,12 +446,8 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ continue; } - int64_t expiration_time = - _tablet_meta->ttl_seconds() == 0 || - rowset_meta->newest_write_timestamp() <= 0 - ? 
0 - : rowset_meta->newest_write_timestamp() + - _tablet_meta->ttl_seconds(); + int64_t expiration_time = io::calc_file_cache_expiration_time( + rowset_meta->newest_write_timestamp(), _tablet_meta->ttl_seconds()); g_file_cache_cloud_tablet_submitted_segment_num << 1; if (rs->rowset_meta()->segment_file_size(seg_id) > 0) { g_file_cache_cloud_tablet_submitted_segment_size diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 85f350ef0b0cd6..dd2b678447c549 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -33,6 +33,7 @@ #include "common/cast_set.h" #include "common/logging.h" #include "io/cache/block_file_cache_downloader.h" +#include "io/cache/file_cache_expiration.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/tablet.h" @@ -237,13 +238,8 @@ void CloudWarmUpManager::handle_jobs() { continue; } - int64_t expiration_time = - tablet_meta->ttl_seconds() == 0 || rs->newest_write_timestamp() <= 0 - ? 0 - : rs->newest_write_timestamp() + tablet_meta->ttl_seconds(); - if (expiration_time <= UnixSeconds()) { - expiration_time = 0; - } + int64_t expiration_time = io::calc_file_cache_expiration_time( + rs->newest_write_timestamp(), tablet_meta->ttl_seconds()); if (!tablet->add_rowset_warmup_state(*rs, WarmUpTriggerSource::JOB)) { LOG(INFO) << "found duplicate warmup task for rowset " << rs->rowset_id() << ", skip it"; diff --git a/be/src/io/cache/file_cache_expiration.h b/be/src/io/cache/file_cache_expiration.h new file mode 100644 index 00000000000000..050658ecf77df7 --- /dev/null +++ b/be/src/io/cache/file_cache_expiration.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <limits> + +#include "util/time.h" + +namespace doris::io { + +// Calc absolute expiration timestamp for file cache TTL mode. +// +// Return 0 means treat it as non-TTL cache (NORMAL/INDEX/DISPOSABLE) and avoid +// putting data into TTL queues / TTL-path directories. +inline int64_t calc_file_cache_expiration_time(int64_t newest_write_timestamp, + int64_t ttl_seconds) { + if (ttl_seconds <= 0 || newest_write_timestamp <= 0) { + return 0; + } + + // Overflow protection. + if (newest_write_timestamp > std::numeric_limits<int64_t>::max() - ttl_seconds) { + return 0; + } + + const int64_t expiration_time = newest_write_timestamp + ttl_seconds; + // Clamp expired TTL to 0 to keep behavior consistent across read/write/warmup. + return expiration_time > UnixSeconds() ?
expiration_time : 0; +} + +} // namespace doris::io diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 0b44597ed95671..6a0bb53a48fcc9 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -30,6 +30,7 @@ #include "common/logging.h" #include "common/status.h" +#include "io/cache/file_cache_expiration.h" #include "io/io_common.h" #include "olap/block_column_predicate.h" #include "olap/column_predicate.h" @@ -224,13 +225,8 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context _read_context->runtime_state->query_options().disable_file_cache; } - _read_options.io_ctx.expiration_time = - read_context->ttl_seconds > 0 && _rowset->rowset_meta()->newest_write_timestamp() > 0 - ? _rowset->rowset_meta()->newest_write_timestamp() + read_context->ttl_seconds - : 0; - if (_read_options.io_ctx.expiration_time <= UnixSeconds()) { - _read_options.io_ctx.expiration_time = 0; - } + _read_options.io_ctx.expiration_time = io::calc_file_cache_expiration_time( + _rowset->rowset_meta()->newest_write_timestamp(), read_context->ttl_seconds); bool enable_segment_cache = true; auto* state = read_context->runtime_state; diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 8e0794ce5d3018..d3039182428947 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -315,6 +315,9 @@ Status BaseBetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte _is_pending = true; _rowset_meta->set_txn_id(_context.txn_id); _rowset_meta->set_load_id(_context.load_id); + if (_context.newest_write_timestamp > 0) { + _rowset_meta->set_newest_write_timestamp(_context.newest_write_timestamp); + } } else { _rowset_meta->set_version(_context.version); _rowset_meta->set_newest_write_timestamp(_context.newest_write_timestamp); diff --git 
a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index 727c445a61edb4..c57346bca163f2 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -27,6 +27,7 @@ #include "cloud/config.h" #include "common/status.h" +#include "io/cache/file_cache_expiration.h" #include "io/fs/encrypted_fs_factory.h" #include "io/fs/file_system.h" #include "io/fs/file_writer.h" @@ -249,9 +250,8 @@ struct RowsetWriterContext { return io::FileWriterOptions { .write_file_cache = should_write_cache, .is_cold_data = is_hot_data, - .file_cache_expiration = file_cache_ttl_sec > 0 && newest_write_timestamp > 0 - ? newest_write_timestamp + file_cache_ttl_sec - : 0, + .file_cache_expiration = io::calc_file_cache_expiration_time( + newest_write_timestamp, static_cast(file_cache_ttl_sec)), .approximate_bytes_to_write = approximate_bytes_to_write}; } }; From bef5cdbd772d217bd5abf1ef46517d1d7ea9ec69 Mon Sep 17 00:00:00 2001 From: zhengyu Date: Tue, 14 Apr 2026 16:32:43 +0800 Subject: [PATCH 2/3] [fix](filecache) anchor temporary TTL expiration to tablet creation time Keep the temporary file-cache TTL fix stable without changing newest_write_timestamp semantics. - compute file-cache expiration from tablet_meta->creation_time() across read, write, warmup, and sync_meta - stop persisting pending rowset newest_write_timestamp early and keep it dedicated to rowset freshness logic - update sync_meta to refresh both segment and inverted-index cache expiration when TTL anchor metadata changes - add UT coverage for expiration helper behavior and writer-side base timestamp propagation - add a cloud docker regression for the creation-time anchor semantics using SQL-generated data This is a temporary fix for branch-4.0/3.1. It intentionally changes file-cache TTL semantics from latest-write-based expiration to tablet-creation-based expiration, so late writes on old tablets no longer refresh TTL. 
--- be/src/cloud/cloud_internal_service.cpp | 4 +- be/src/cloud/cloud_rowset_writer.cpp | 3 - be/src/cloud/cloud_tablet.cpp | 36 ++- be/src/cloud/cloud_warm_up_manager.cpp | 2 +- be/src/io/cache/file_cache_expiration.h | 9 +- be/src/olap/rowset/beta_rowset_reader.cpp | 2 +- be/src/olap/rowset/beta_rowset_writer.cpp | 3 - be/src/olap/rowset/rowset_reader_context.h | 1 + be/src/olap/rowset/rowset_writer_context.h | 5 +- be/src/olap/tablet.cpp | 1 + be/src/olap/tablet_reader.cpp | 1 + .../io/cache/file_cache_expiration_test.cpp | 56 +++++ .../ttl/test_ttl_creation_time_anchor.groovy | 207 ++++++++++++++++++ ttl_fix.md | 207 ++++++++++++++++++ ttl_fix_final.md | 163 ++++++++++++++ ttl_fix_regression_plan.md | 66 ++++++ ttl_fix_ut_plan.md | 53 +++++ 17 files changed, 795 insertions(+), 24 deletions(-) create mode 100644 be/test/io/cache/file_cache_expiration_test.cpp create mode 100644 regression-test/suites/cloud_p0/cache/ttl/test_ttl_creation_time_anchor.groovy create mode 100644 ttl_fix.md create mode 100644 ttl_fix_final.md create mode 100644 ttl_fix_regression_plan.md create mode 100644 ttl_fix_ut_plan.md diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 48e8d6624de77f..4660ae3c01063a 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -447,8 +447,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c << " us, tablet_id: " << rs_meta.tablet_id() << ", rowset_id: " << rowset_id.to_string(); } - int64_t expiration_time = io::calc_file_cache_expiration_time( - rs_meta.newest_write_timestamp(), tablet_meta->ttl_seconds()); + int64_t expiration_time = io::calc_file_cache_expiration_time(tablet_meta->creation_time(), + tablet_meta->ttl_seconds()); if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpTriggerSource::EVENT_DRIVEN)) { LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() diff --git 
a/be/src/cloud/cloud_rowset_writer.cpp b/be/src/cloud/cloud_rowset_writer.cpp index de152ae8341391..c5b58049ae42b4 100644 --- a/be/src/cloud/cloud_rowset_writer.cpp +++ b/be/src/cloud/cloud_rowset_writer.cpp @@ -65,9 +65,6 @@ Status CloudRowsetWriter::init(const RowsetWriterContext& rowset_writer_context) if (_context.rowset_state == PREPARED || _context.rowset_state == COMMITTED) { _is_pending = true; _rowset_meta->set_load_id(_context.load_id); - if (_context.newest_write_timestamp > 0) { - _rowset_meta->set_newest_write_timestamp(_context.newest_write_timestamp); - } } else { // Rowset generated by compaction or schema change _rowset_meta->set_version(_context.version); diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 99ecc4d8908830..1624dfc9f547d5 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -45,6 +45,7 @@ #include "common/config.h" #include "common/logging.h" #include "cpp/sync_point.h" +#include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" #include "io/cache/file_cache_expiration.h" @@ -447,7 +448,7 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ } int64_t expiration_time = io::calc_file_cache_expiration_time( - rowset_meta->newest_write_timestamp(), _tablet_meta->ttl_seconds()); + _tablet_meta->creation_time(), _tablet_meta->ttl_seconds()); g_file_cache_cloud_tablet_submitted_segment_num << 1; if (rs->rowset_meta()->segment_file_size(seg_id) > 0) { g_file_cache_cloud_tablet_submitted_segment_size @@ -935,6 +936,7 @@ Result> CloudTablet::create_rowset_writer( context.tablet_id = tablet_id(); context.index_id = index_id(); context.partition_id = partition_id(); + context.file_cache_base_timestamp = tablet_meta()->creation_time(); context.enable_unique_key_merge_on_write = enable_unique_key_merge_on_write(); context.encrypt_algorithm = tablet_meta()->encryption_algorithm(); return 
RowsetFactory::create_rowset_writer(_engine, context, vertical); @@ -976,6 +978,7 @@ Result> CloudTablet::create_transient_rowset_write context.tablet_id = tablet_id(); context.index_id = index_id(); context.partition_id = partition_id(); + context.file_cache_base_timestamp = tablet_meta()->creation_time(); context.enable_unique_key_merge_on_write = enable_unique_key_merge_on_write(); context.txn_expiration = txn_expiration; context.encrypt_algorithm = tablet_meta()->encryption_algorithm(); @@ -1572,19 +1575,38 @@ Status CloudTablet::sync_meta() { return st; } + auto old_creation_time = _tablet_meta->creation_time(); + auto new_creation_time = tablet_meta->creation_time(); + bool creation_time_changed = old_creation_time != new_creation_time; + if (creation_time_changed) { + _tablet_meta->set_creation_time(new_creation_time); + } + + auto old_ttl_seconds = _tablet_meta->ttl_seconds(); auto new_ttl_seconds = tablet_meta->ttl_seconds(); - if (_tablet_meta->ttl_seconds() != new_ttl_seconds) { + bool ttl_changed = old_ttl_seconds != new_ttl_seconds; + if (ttl_changed) { _tablet_meta->set_ttl_seconds(new_ttl_seconds); - int64_t cur_time = UnixSeconds(); + } + + if (creation_time_changed || ttl_changed) { + int64_t new_expiration_time = + io::calc_file_cache_expiration_time(new_creation_time, new_ttl_seconds); std::shared_lock rlock(_meta_lock); for (auto& [_, rs] : _rs_version_map) { for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { - int64_t new_expiration_time = - new_ttl_seconds + rs->rowset_meta()->newest_write_timestamp(); - new_expiration_time = new_expiration_time > cur_time ? 
new_expiration_time : 0; auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id); auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key); - file_cache->modify_expiration_time(file_key, new_expiration_time); + if (file_cache != nullptr) { + file_cache->modify_expiration_time(file_key, new_expiration_time); + } + } + for (const auto& file_name : rs->get_index_file_names()) { + auto file_key = io::BlockFileCache::hash(file_name); + auto* file_cache = io::FileCacheFactory::instance()->get_by_path(file_key); + if (file_cache != nullptr) { + file_cache->modify_expiration_time(file_key, new_expiration_time); + } } } } diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index dd2b678447c549..02df3e9113739d 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -239,7 +239,7 @@ void CloudWarmUpManager::handle_jobs() { } int64_t expiration_time = io::calc_file_cache_expiration_time( - rs->newest_write_timestamp(), tablet_meta->ttl_seconds()); + tablet_meta->creation_time(), tablet_meta->ttl_seconds()); if (!tablet->add_rowset_warmup_state(*rs, WarmUpTriggerSource::JOB)) { LOG(INFO) << "found duplicate warmup task for rowset " << rs->rowset_id() << ", skip it"; diff --git a/be/src/io/cache/file_cache_expiration.h b/be/src/io/cache/file_cache_expiration.h index 050658ecf77df7..1a377e8e330652 100644 --- a/be/src/io/cache/file_cache_expiration.h +++ b/be/src/io/cache/file_cache_expiration.h @@ -28,18 +28,17 @@ namespace doris::io { // // Return 0 means treat it as non-TTL cache (NORMAL/INDEX/DISPOSABLE) and avoid // putting data into TTL queues / TTL-path directories. 
-inline int64_t calc_file_cache_expiration_time(int64_t newest_write_timestamp, - int64_t ttl_seconds) { - if (ttl_seconds <= 0 || newest_write_timestamp <= 0) { +inline int64_t calc_file_cache_expiration_time(int64_t base_timestamp, int64_t ttl_seconds) { + if (ttl_seconds <= 0 || base_timestamp <= 0) { return 0; } // Overflow protection. - if (newest_write_timestamp > std::numeric_limits<int64_t>::max() - ttl_seconds) { + if (base_timestamp > std::numeric_limits<int64_t>::max() - ttl_seconds) { return 0; } - const int64_t expiration_time = newest_write_timestamp + ttl_seconds; + const int64_t expiration_time = base_timestamp + ttl_seconds; // Clamp expired TTL to 0 to keep behavior consistent across read/write/warmup. return expiration_time > UnixSeconds() ? expiration_time : 0; } diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 6a0bb53a48fcc9..c55afdfd260e9c 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -226,7 +226,7 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context } _read_options.io_ctx.expiration_time = io::calc_file_cache_expiration_time( - _rowset->rowset_meta()->newest_write_timestamp(), read_context->ttl_seconds); + read_context->file_cache_base_timestamp, read_context->ttl_seconds); bool enable_segment_cache = true; auto* state = read_context->runtime_state; diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index d3039182428947..8e0794ce5d3018 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -315,9 +315,6 @@ Status BaseBetaRowsetWriter::init(const RowsetWriterContext& rowset_writer_conte _is_pending = true; _rowset_meta->set_txn_id(_context.txn_id); _rowset_meta->set_load_id(_context.load_id); - if (_context.newest_write_timestamp > 0) { - _rowset_meta->set_newest_write_timestamp(_context.newest_write_timestamp); - } }
else { _rowset_meta->set_version(_context.version); _rowset_meta->set_newest_write_timestamp(_context.newest_write_timestamp); diff --git a/be/src/olap/rowset/rowset_reader_context.h b/be/src/olap/rowset/rowset_reader_context.h index acf18cf86a4744..e6b47729196e7e 100644 --- a/be/src/olap/rowset/rowset_reader_context.h +++ b/be/src/olap/rowset/rowset_reader_context.h @@ -87,6 +87,7 @@ struct RowsetReaderContext { // slots that cast may be eliminated in storage layer std::map target_cast_type_for_variants; int64_t ttl_seconds = 0; + int64_t file_cache_base_timestamp = 0; std::map virtual_column_exprs; std::map vir_cid_to_idx_in_block; diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index c57346bca163f2..50b061e82999cf 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -109,6 +109,7 @@ struct RowsetWriterContext { bool write_file_cache = false; bool is_hot_data = false; uint64_t file_cache_ttl_sec = 0; + int64_t file_cache_base_timestamp = 0; uint64_t approximate_bytes_to_write = 0; // If true, compaction output only writes index files to file cache, not data files bool compaction_output_write_index_only = false; @@ -250,8 +251,8 @@ struct RowsetWriterContext { return io::FileWriterOptions { .write_file_cache = should_write_cache, .is_cold_data = is_hot_data, - .file_cache_expiration = io::calc_file_cache_expiration_time( - newest_write_timestamp, static_cast(file_cache_ttl_sec)), + .file_cache_expiration = static_cast(io::calc_file_cache_expiration_time( + file_cache_base_timestamp, static_cast(file_cache_ttl_sec))), .approximate_bytes_to_write = approximate_bytes_to_write}; } }; diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index d47f54dca38072..6ed2a188193842 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -1991,6 +1991,7 @@ void Tablet::_init_context_common_fields(RowsetWriterContext& context) { context.tablet_id = 
tablet_id(); context.partition_id = partition_id(); context.tablet_schema_hash = schema_hash(); + context.file_cache_base_timestamp = tablet_meta()->creation_time(); context.rowset_type = tablet_meta()->preferred_rowset_type(); // Alpha Rowset will be removed in the future, so that if the tablet's default rowset type is // alpha rowset, then set the newly created rowset to storage engine's default rowset. diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index 8028d18d4ed0c5..349bb6a6414d38 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -216,6 +216,7 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { _reader_context.output_columns = &read_params.output_columns; _reader_context.push_down_agg_type_opt = read_params.push_down_agg_type_opt; _reader_context.ttl_seconds = _tablet->ttl_seconds(); + _reader_context.file_cache_base_timestamp = _tablet->tablet_meta()->creation_time(); _reader_context.score_runtime = read_params.score_runtime; _reader_context.collection_statistics = read_params.collection_statistics; diff --git a/be/test/io/cache/file_cache_expiration_test.cpp b/be/test/io/cache/file_cache_expiration_test.cpp new file mode 100644 index 00000000000000..1d9c570a3ffcd3 --- /dev/null +++ b/be/test/io/cache/file_cache_expiration_test.cpp @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/cache/file_cache_expiration.h" + +#include <gtest/gtest.h> + +#include <cstdint> +#include <limits> + +#include "olap/rowset/rowset_writer_context.h" + +namespace doris::io { + +TEST(FileCacheExpirationTest, ReturnsZeroForInvalidOrExpiredBase) { + EXPECT_EQ(0, calc_file_cache_expiration_time(0, 60)); + EXPECT_EQ(0, calc_file_cache_expiration_time(-1, 60)); + EXPECT_EQ(0, calc_file_cache_expiration_time(UnixSeconds() - 10, 5)); + EXPECT_EQ(0, calc_file_cache_expiration_time(std::numeric_limits<int64_t>::max() - 1, 10)); +} + +TEST(FileCacheExpirationTest, UsesBaseTimestamp) { + const int64_t base_timestamp = UnixSeconds(); + const int64_t ttl_seconds = 120; + + EXPECT_EQ(base_timestamp + ttl_seconds, + calc_file_cache_expiration_time(base_timestamp, ttl_seconds)); +} + +TEST(FileCacheExpirationTest, RowsetWriterContextUsesFileCacheBaseTimestamp) { + doris::RowsetWriterContext context; + context.write_file_cache = true; + context.file_cache_ttl_sec = 60; + const int64_t base_timestamp = UnixSeconds(); + context.file_cache_base_timestamp = base_timestamp; + context.newest_write_timestamp = base_timestamp + 600; + + auto opts = context.get_file_writer_options(); + EXPECT_EQ(static_cast<uint64_t>(base_timestamp + 60), opts.file_cache_expiration); +} + +} // namespace doris::io diff --git a/regression-test/suites/cloud_p0/cache/ttl/test_ttl_creation_time_anchor.groovy b/regression-test/suites/cloud_p0/cache/ttl/test_ttl_creation_time_anchor.groovy new file mode 100644 index 00000000000000..58f9bde9a890ec --- /dev/null +++
b/regression-test/suites/cloud_p0/cache/ttl/test_ttl_creation_time_anchor.groovy @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions + +suite("test_ttl_creation_time_anchor", "docker") { + def options = new ClusterOptions() + options.cloudMode = true + options.beNum = 1 + options.feConfigs += [ + "cloud_cluster_check_interval_second=1", + ] + options.beConfigs += [ + "file_cache_enter_disk_resource_limit_mode_percent=99", + "enable_evict_file_cache_in_advance=false", + "file_cache_background_monitor_interval_ms=1000", + ] + + docker(options) { + String[][] backends = sql """ show backends """ + String backendId + def backendIdToBackendIP = [:] + def backendIdToBackendHttpPort = [:] + def backendIdToBackendBrpcPort = [:] + for (String[] backend in backends) { + if (backend[9].equals("true")) { + backendIdToBackendIP.put(backend[0], backend[1]) + backendIdToBackendHttpPort.put(backend[0], backend[4]) + backendIdToBackendBrpcPort.put(backend[0], backend[5]) + } + } + assertEquals(backendIdToBackendIP.size(), 1) + + backendId = backendIdToBackendIP.keySet()[0] + def clearUrl = backendIdToBackendIP.get(backendId) + ":" + + 
backendIdToBackendHttpPort.get(backendId) + + """/api/file_cache?op=clear&sync=true""" + + def clearFileCache = { check_func -> + httpTest { + endpoint "" + uri clearUrl + op "get" + body "" + check check_func + } + } + + def getCacheMetrics = { + long ttlCacheSize = 0L + long normalQueueCacheSize = 0L + httpTest { + endpoint backendIdToBackendIP.get(backendId) + ":" + + backendIdToBackendBrpcPort.get(backendId) + uri "/brpc_metrics" + op "get" + check { respCode, body -> + assertEquals("${respCode}".toString(), "200") + String out = "${body}".toString() + def strs = out.split('\n') + for (String line in strs) { + if (line.startsWith("#")) { + continue + } + if (line.contains("file_cache_ttl_cache_size")) { + def idx = line.indexOf(' ') + ttlCacheSize = line.substring(idx + 1).trim().toLong() + } + if (line.contains("file_cache_normal_queue_cache_size")) { + def idx = line.indexOf(' ') + normalQueueCacheSize = line.substring(idx + 1).trim().toLong() + } + } + } + } + return [ttlCacheSize, normalQueueCacheSize] + } + + def waitForCacheEmpty = { + int maxTry = 15 + while (maxTry-- > 0) { + def metrics = getCacheMetrics() + if (metrics[0] == 0L && metrics[1] == 0L) { + return + } + sleep(2000) + } + def metrics = getCacheMetrics() + assertEquals(metrics[0], 0L) + assertEquals(metrics[1], 0L) + } + + def waitForFreshTabletMetrics = { + int maxTry = 30 + while (maxTry-- > 0) { + def metrics = getCacheMetrics() + if (metrics[0] > 0L && metrics[1] == 0L) { + return + } + sleep(2000) + } + def metrics = getCacheMetrics() + assertTrue(metrics[0] > 0L) + assertEquals(metrics[1], 0L) + } + + def waitForOldTabletMetrics = { + int maxTry = 30 + while (maxTry-- > 0) { + def metrics = getCacheMetrics() + if (metrics[0] == 0L && metrics[1] > 0L) { + return + } + sleep(2000) + } + def metrics = getCacheMetrics() + assertEquals(metrics[0], 0L) + assertTrue(metrics[1] > 0L) + } + + def createTable = { String tableName, int ttlSeconds -> + sql """ DROP TABLE IF EXISTS ${tableName} """ 
+ sql """ + CREATE TABLE ${tableName} ( + id BIGINT, + payload STRING + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "file_cache_ttl_seconds" = "${ttlSeconds}", + "disable_auto_compaction" = "true" + ) + """ + } + + def insertRows = { String tableName -> + sql """ + INSERT INTO ${tableName} + SELECT + number, + concat(cast(number as string), '-', repeat(md5(cast(number as string)), 16)) + FROM numbers("number" = "100000") + """ + sql """ sync """ + } + + def scanTable = { String tableName -> + def result = sql """ select sum(length(payload)) from ${tableName} """ + assertTrue(result[0][0].toString().toLong() > 0L) + } + + sql """ set enable_file_cache = true """ + sql """ set disable_file_cache = false """ + + def freshTableName = "ttl_creation_time_fresh_" + Math.abs(UUID.randomUUID().hashCode()) + try { + clearFileCache.call() { + respCode, body -> {} + } + waitForCacheEmpty() + createTable(freshTableName, 180) + insertRows(freshTableName) + clearFileCache.call() { + respCode, body -> {} + } + waitForCacheEmpty() + scanTable(freshTableName) + waitForFreshTabletMetrics() + } finally { + sql """ drop table if exists ${freshTableName} """ + } + + def oldTableName = "ttl_creation_time_old_" + Math.abs(UUID.randomUUID().hashCode()) + try { + clearFileCache.call() { + respCode, body -> {} + } + waitForCacheEmpty() + createTable(oldTableName, 5) + sleep(7000) + insertRows(oldTableName) + clearFileCache.call() { + respCode, body -> {} + } + waitForCacheEmpty() + scanTable(oldTableName) + waitForOldTabletMetrics() + } finally { + sql """ drop table if exists ${oldTableName} """ + } + } +} diff --git a/ttl_fix.md b/ttl_fix.md new file mode 100644 index 00000000000000..0a7b2e1601e587 --- /dev/null +++ b/ttl_fix.md @@ -0,0 +1,207 @@ +# Doris BE FileCache TTL/Expiration 临时修复方案(切换到 Tablet Creation Time) + +本文基于当前临时修复分支上的实现与 review 结论,重新整理 FileCache TTL 的短期修复方案。目标不是恢复“理想 TTL 语义”,而是在正式 TTL 重构上线前,优先解决 **expiration_time 在 query / write / warmup / 
sync_meta 间不一致** 导致的 cache 漂移、rename 抖动和残留目录问题。 + +> 约定: +> - `ttl_seconds` 是 duration(秒数) +> - `expiration_time` 是绝对时间戳(Unix epoch seconds) +> - `base_timestamp` 是本次临时修复中用于计算 file cache expiration 的稳定时间基准 +> - “磁盘 TTL”指 `FSFileCacheStorage` 目录名中编码的 `.../_/...` + +## 0. 结论先行 + +临时修复不再尝试“前移并复用 `newest_write_timestamp`”,而是: + +1. 保留统一的 `expiration_time` helper,统一 clamp / overflow / invalid-base 规则; +2. 将 file cache TTL 的 `base_timestamp` 统一切换为 `tablet_meta->creation_time()`; +3. 明确 `newest_write_timestamp` 继续只承担 rowset freshness 语义,不再作为 file cache TTL 基准; +4. `query / write / warmup / sync_meta` 四条链路全部使用同一套 `creation_time + ttl_seconds` 计算; +5. `sync_meta` 在 TTL 属性变化时,不仅更新 segment data cache,也要同步更新 inverted index cache,避免只改一半。 + +## 1. 为什么放弃 `newest_write_timestamp` 方案 + +上一版思路是: + +- 保持 `expiration_time = newest_write_timestamp + ttl_seconds` +- 为了让 write/read/warmup 使用同一个时间基准,把 pending rowset 的 `newest_write_timestamp` 提前写入 rowset meta + +这个方向虽然能缓解 `t0/t1` 分裂,但 review 后确认有两个问题: + +1. **它改变了 `newest_write_timestamp` 的既有语义** + - cooldown、warmup_delta_data 的“最近写入”判定、cloud freshness fallback 等逻辑,都默认它表达 rowset 的写入新鲜度; + - 把它前移到 writer 初始化时刻,本质上会让 rowset“看起来更老”。 + +2. **它修的是 file cache TTL,不该顺带改其他功能的时间语义** + - 临时修复应该把影响面收敛在 TTL 自身; + - 如果为了修 TTL 而重定义 `newest_write_timestamp`,副作用范围会超出本次修复目标。 + +所以这次方案明确回退这部分改动:**撤掉 pending rowset 提前持久化 `newest_write_timestamp` 的实现。** + +## 2. 为什么接受 `tablet_meta->creation_time()` 作为 base + +TTL 当前的问题,本质上不是 “+ ttl” 这一步,而是 `base` 不稳定。 + +使用 `tablet_meta->creation_time()` 的理由: + +1. **稳定** + - tablet 创建后该值天然固定,不会在 import/build/commit/query/warmup 间漂移。 + +2. **天然是 tablet 级别** + - file cache TTL 的竞争对象实际是 tablet 下同一远端对象对应的 cache hash; + - 用 tablet 级时间基准比 rowset 级时间基准更容易保证全链路一致。 + +3. **不影响 `newest_write_timestamp` 现有语义** + - cooldown、warmup_delta_data 等依赖“最近写入时间”的逻辑可以保持不变; + - TTL 修复只影响 file cache expiration 的计算。 + +4. **符合当前业务接受范围** + - 这个临时方案主要面向 cloud + dynamic partition 场景; + - partition/tablet 会持续新建,TTL 按 tablet 创建时间计算通常是可接受的。 + +## 3. 
语义变化与接受范围 + +切到 `tablet_meta->creation_time()` 后,TTL 语义会从: + +- “相对最新写入时间过期” + +变为: + +- “相对 tablet 创建时间过期” + +这意味着: + +1. **晚到写入不会刷新 TTL** + - 老 tablet 在 TTL 窗口过去之后,即使有新写入,新的 cache block 也会按 `expiration_time = 0` 进入 normal queue。 + +2. **TTL 不再表达数据新鲜度,而表达 tablet 生命周期窗口** + - 这不是最终理想语义,但它是本次临时修复有意接受的 tradeoff。 + +3. **creation_time 非法时统一降级为非 TTL** + - 若 `creation_time <= 0`,则 `expiration_time = 0`; + - 这样不会制造新的多值竞争。 + +本次临时修复明确接受上述语义变化,优先保证稳定性、一致性和不影响其他逻辑。 + +## 4. 统一 helper:只保留一套计算规则 + +统一 helper 形态: + +```c++ +int64_t calc_file_cache_expiration_time(int64_t base_timestamp, int64_t ttl_seconds) { + if (ttl_seconds <= 0 || base_timestamp <= 0) { + return 0; + } + if (base_timestamp > std::numeric_limits::max() - ttl_seconds) { + return 0; + } + int64_t expiration_time = base_timestamp + ttl_seconds; + return expiration_time > UnixSeconds() ? expiration_time : 0; +} +``` + +规则统一为: + +1. `ttl_seconds <= 0` -> `0` +2. `base_timestamp <= 0` -> `0` +3. 溢出 -> `0` +4. 已过期 -> clamp 到 `0` + +这样可以避免 query / write / warmup / sync_meta 因为“是否 clamp”或“边界值不同”再次分叉。 + +## 5. 
代码落点 + +### 5.1 Write Path + +写路径不再使用 `newest_write_timestamp` 算 file-cache expiration,而是显式传递: + +- `RowsetWriterContext.file_cache_base_timestamp = tablet_meta->creation_time()` + +然后在 `RowsetWriterContext::get_file_writer_options()` 中统一调用 helper: + +- `calc_file_cache_expiration_time(file_cache_base_timestamp, file_cache_ttl_sec)` + +这样导入写 cache 时固化下来的 expiration,与后续 query/warmup/sync_meta 的稳定基准一致。 + +### 5.2 Read Path + +读路径通过 tablet reader 将稳定基准下传到 rowset reader: + +- `RowsetReaderContext.file_cache_base_timestamp = tablet->tablet_meta()->creation_time()` + +`BetaRowsetReader` 不再读取 rowset 的 `newest_write_timestamp` 来算 TTL。 + +### 5.3 Warmup Path + +三类 warmup 入口统一使用: + +- `calc_file_cache_expiration_time(tablet_meta->creation_time(), tablet_meta->ttl_seconds())` + +覆盖: + +- `cloud_tablet.cpp`(sync rowset warmup) +- `cloud_internal_service.cpp`(event-driven warmup) +- `cloud_warm_up_manager.cpp`(job warmup) + +### 5.4 Sync Meta + +当 `sync_meta` 发现 `tablet_meta->creation_time()` 或 `tablet_meta->ttl_seconds()` 发生变化时: + +1. 重新计算目标 expiration: + - `calc_file_cache_expiration_time(tablet_meta->creation_time(), new_ttl_seconds)` +2. 对当前 tablet 下所有 cached segment data 调 `modify_expiration_time` +3. **对当前 tablet 下所有 cached inverted index 也调 `modify_expiration_time`** + +最后一点很重要。若只改 segment,不改 index,会留下同一 rowset 内 data/index TTL 分叉的问题。 + +## 6. 不做的事情 + +本次临时修复不做: + +1. 不恢复“写入刷新 TTL”的旧语义 +2. 不尝试兼容频繁 `ALTER TABLE TTL` +3. 不做跨 clone / restore / rebuild 的更复杂时间语义修正 +4. 不保证这是最终最优解,正式 TTL 重构仍然是长期方案 + +## 7. 验收标准 + +### 7.1 行为一致性 + +同一个 tablet 内同一个远端对象,在以下路径中使用相同的 `base_timestamp`: + +- query +- write +- warmup +- sync_meta + +### 7.2 语义隔离 + +- file cache TTL 不再依赖 `newest_write_timestamp` +- `newest_write_timestamp` 语义保持不变,不影响 cooldown、warmup_delta_data 等逻辑 + +### 7.3 边界统一 + +- 非法 base、已过期、overflow 都统一得到 `expiration_time = 0` + +### 7.4 TTL 属性变更后的完整性 + +- `sync_meta` 修改 expiration 时,segment data 和 inverted index 都被覆盖 + +## 8. 建议验证 + +1. 
**UT** + - helper 对 invalid / expired / overflow 的 clamp 行为 + - writer-side `file_cache_base_timestamp` 传播 + +2. **Cloud docker regression** + - 新创建 tablet 立即写入,TTL queue 仍生效 + - 创建 table 后等待超过 TTL 再写入,cache 不再进入 TTL queue,而进入 normal queue + +3. **Code inspection** + - 确认没有剩余路径仍使用 `newest_write_timestamp` 计算 file-cache expiration + +## 9. 适用分支 + +- `branch-4.0` +- `branch-3.1` + +`master / branch-4.1 / cloud-26.1` 已有正式修复路径,和本临时方案方向不同,不应直接 pick 本补丁。 diff --git a/ttl_fix_final.md b/ttl_fix_final.md new file mode 100644 index 00000000000000..db52fed8596936 --- /dev/null +++ b/ttl_fix_final.md @@ -0,0 +1,163 @@ +目标说明 +TTL 重构已经可以解决所有 TTL 已知问题,但是由于重构上线周期很长,在这段时间里需要一个范围受控的临时修复方案。 + +我们要做的: +- 使用 ttl 后系统稳定,没有 cache 空间泄漏问题,没有 NOT FOUND miss 问题 +- 保证 query / write / warmup / sync_meta 对同一对象得到一致的 expiration_time +- 不改变 `newest_write_timestamp` 在 cooldown、warmup_delta_data 等现有逻辑里的语义 +在临时修复方案里,我们不做: +- 完美优雅的修复(重构已经做了这部分工作) +- 允许用户频繁 alter table ttl 属性 +- 保持“新写入会刷新 TTL”这类旧语义 + +问题 +内存中的 FileCacheKey.meta.expiration_time 和 磁盘目录路径里的 .../_/... 两者不一致。 + +这类不一致会表现为: +1. 同一个 hash 在不同入口(query/warmup/import)下反复触发 modify_expiration_time / rename; +2. 磁盘目录中出现“旧 expiration 目录残留”,但内存里已是“新 expiration”; +3. 导入后立即查询命中同 hash 时,出现 TTL 行为漂移(是否进入 TTL queue、是否被视为过期) 。 + +- ttl_seconds:持续时长(duration,秒)。 +- expiration_time:绝对时间戳(Unix seconds)。 +- “磁盘 TTL”指 FSFileCacheStorage 的目录名后缀:_。 +- base_timestamp:本次临时修复里用于计算 file cache TTL 的稳定基准时间,统一取 `tablet_meta->creation_time()`。 + +TTL 端到端传递路径分析 +1. 查询路径 +1. 元数据来源 + - TabletMeta::ttl_seconds() + - TabletMeta::creation_time() +2. 读取时计算 expiration + - be/src/olap/rowset/beta_rowset_reader.cpp + - 写入 _read_options.io_ctx.expiration_time +3. 进入 file cache + - CachedRemoteFileReader / BlockFileCache::get_or_set + - CacheContext.expiration_time -> FileCacheKey.meta.expiration_time +4. 落磁盘 + - FSFileCacheStorage::get_path_in_local_cache_v2(hash, expiration_time) + - 目录编码为 _ +结论:query 侧是“内存 key + 磁盘目录”共享同一个 `io_ctx.expiration_time` 的主路径。 + +2. 
Warmup 路径(Sync Rowset / Job / Event-driven) + +入口三类: +- be/src/cloud/cloud_tablet.cpp(add_rowsets(... warmup_delta_data)) +- be/src/cloud/cloud_warm_up_manager.cpp(周期/一次性 warmup) +- be/src/cloud/cloud_internal_service.cpp(事件驱动 warmup) +流程: + +1. 构造 DownloadFileMeta.ctx.expiration_time +2. BlockFileCacheDownloader 下载并写 cache +3. 进入 BlockFileCache、最终落盘到 _ +结论:warmup 是第二条“主动写 cache”的主路径,如果计算规则与 query/import 不一致,会直接制造同 hash 不同 expiration。 + +3. 导入写路径 +核心链路: +1. Rowset writer context 初始化 + - context.file_cache_base_timestamp + - context.file_cache_ttl_sec = tablet->ttl_seconds() +2. 生成 file writer options + - be/src/olap/rowset/rowset_writer_context.h + - FileWriterOptions.file_cache_expiration +3. 远端写 + 本地 cache + - S3FileWriter::appendv + - UploadFileBuffer::on_upload + - upload_to_local_file_cache +4. cache builder 把 expiration 带入 block key + - FileWriter::init_cache_builder + - FileCacheAllocatorBuilder::allocate_cache_holder + - CacheContext.expiration_time -> FileCacheKey.meta.expiration_time +5. 落盘路径 + - FSFileCacheStorage 目录名 _ +结论:导入路径会在“写入当下”把 expiration 固化到 cache block;如果 base_timestamp 不稳定,就会和 query/warmup 产生长期分叉。 + +可能导致“不一致”的主要场景 +以下按影响与复现概率排序。 +A. 读写 clamp 规则不一致(过期归零 vs 不归零) +触发条件: +- query 侧对 newest_write_timestamp + ttl <= now 做 clamp(变 0) +- import/warmup 某些路径仍写非 0 绝对时间 +结果: +- 同 hash 在 query 得到 expiration=0,在导入/预热得到 expiration>0 +- modify_expiration_time/rename 高频触发 +- 短期内出现“磁盘目录还是旧 expiration,内存已切新 expiration” + +B. 以 rowset write time 为基准不稳定(pending rowset t0/t1 分裂只是其中一种表现) +触发条件: +- 不同入口依赖 rowset 级别时间戳(尤其是 newest_write_timestamp) +- 同一对象在导入、build/commit、query、warmup 看到的“写入时间”并不天然一致 +结果: +- 导入写 cache 可能使用 t0 + ttl +- 查询/预热读取 rowset_meta 可能使用 t1 + ttl +- 未改 TTL property 也会出现一致性问题 +这是你观察到“没有 alter ttl 也不一致”的关键来源之一。 + +C. Warmup 路径自成一套计算规则 +触发条件: +- warmup 入口(sync_rowset/job/event-driven)未完全复用 query/write 的统一计算 +- 或某个入口未做 clamp +结果: +- warmup 成为第三个 expiration_time 来源 +- 同 hash 在 query/import/warmup 三端形成三值竞争 + +修复方案 +A. 
读侧 clamp(过期归零) vs 写侧不 clamp(仍写非 0 绝对时间) +抽象统一的 expiration 计算函数,并在写路径补齐 clamp +int64_t calc_file_cache_expiration(int64_t base_timestamp, int64_t ttl_seconds) { + if (ttl_seconds <= 0 || base_timestamp <= 0) return 0; + int64_t exp = base_timestamp + ttl_seconds; // 需溢出保护 + return exp > UnixSeconds() ? exp : 0; +} +- Query:be/src/olap/rowset/beta_rowset_reader.cpp +- Write:be/src/olap/rowset/rowset_writer_context.h +- Warmup:be/src/cloud/cloud_internal_service.cpp、be/src/cloud/cloud_tablet.cpp、be/src/cloud/cloud_warm_up_manager.cpp +- SyncMeta:be/src/cloud/cloud_tablet.cpp +修完后,写入 _ 与后续 query/warmup 的规则一致,消除“写非0、读0”的系统性差异。 + +B. 使用稳定的 tablet 级基准,避免复用 `newest_write_timestamp` +以 TabletMeta/TabletMetaSharedPtr 的 creation_time() 作为 file cache expiration 基准 +思路: +- 不再用 newest_write_timestamp + ttl; +- 改为 tablet_meta->creation_time() + ttl(并做 clamp); +- query/write/warmup/sync_meta 统一显式传递这个稳定基准。 +核心收益: +1. 规避 B 类 t0/t1 漂移; +2. 基准稳定,减少同 hash 的反复改 expiration; +3. 不需要修改 newest_write_timestamp 语义,避免影响 cooldown、warmup_delta_data 等其他逻辑。 +风险/语义变化(很重要): +1. TTL 语义从“相对最新写入”变为“相对 tablet 创建时间”; +2. 新写入不会延长缓存寿命; +3. clone/restore/rebuild 场景下 creation_time 语义可能不等价于“数据新鲜度”; +4. 历史 meta 若 creation_time 缺失(0)需定义 fallback,本次临时修复统一按非 TTL(expiration=0)处理; +5. 若目标是“随写入刷新 TTL”,该方案有意放弃这种语义,优先保证稳定性与一致性。 + +C. Warmup 路径的 expiration 计算规则不一致(尤其缺少 clamp) + +warmup 与 query/write 完全复用同一套 calc + clamp +把 warmup 入口统一改成: +- expiration_time = calc_file_cache_expiration(tablet_meta->creation_time(), ttl_seconds) +至少覆盖: +- be/src/cloud/cloud_internal_service.cpp +- be/src/cloud/cloud_tablet.cpp +- be/src/cloud/cloud_warm_up_manager.cpp + +D. `sync_meta` 发现 TTL anchor 相关元数据变化时,data/index 都要一起更新 + +当 `sync_meta` 发现 `tablet_meta->creation_time()` 或 `tablet_meta->ttl_seconds()` 变化时: +- 重新计算 `calc_file_cache_expiration(tablet_meta->creation_time(), new_ttl_seconds)` +- 用这个统一结果更新 segment data cache +- 同时更新 inverted index cache + +如果只更新 segment 而漏掉 index,仍会残留同一 rowset 内部的 TTL 视图不一致问题。 + +验收标准 +1. 
query/write/warmup/sync_meta 四条链路对同一 tablet 内对象使用相同的 base_timestamp。 +2. file cache TTL 计算不再依赖 newest_write_timestamp,因此不改变 cooldown、warmup_delta_data 等现有逻辑语义。 +3. 对于 old tablet 的 late write,TTL 不会被“刷新”;这属于本次临时修复引入且接受的语义变化。 +4. 已过期或非法 base_timestamp 统一落到 expiration_time=0,避免同 hash 多值竞争。 +5. `sync_meta` 在 TTL anchor 相关元数据变化时,segment data 和 inverted index 都会一起迁移到新的 expiration。 + +版本/分支 +master/branch-4.1/cloud-26.1 带了正式的修复,与临时修复冲突,所以不用也不能 pick +branch-4.0 + branch-3.1 版本可以 pick diff --git a/ttl_fix_regression_plan.md b/ttl_fix_regression_plan.md new file mode 100644 index 00000000000000..0b805a7e4d2ae5 --- /dev/null +++ b/ttl_fix_regression_plan.md @@ -0,0 +1,66 @@ +# Doris Regression Test Plan Template + +## 1. Summary of Change +- **Intent**: switch the temporary file-cache TTL anchor from `newest_write_timestamp` to `tablet_meta->creation_time()` so expiration is stable across write/read/warmup/sync_meta without affecting rowset freshness semantics. +- **Affected modules/files**: + - `be/src/io/cache/file_cache_expiration.h` + - `be/src/olap/rowset/rowset_writer_context.h` + - `be/src/olap/rowset/rowset_reader_context.h` + - `be/src/olap/rowset/beta_rowset_reader.cpp` + - `be/src/olap/tablet_reader.cpp` + - `be/src/cloud/cloud_internal_service.cpp` + - `be/src/cloud/cloud_tablet.cpp` + - `be/src/cloud/cloud_warm_up_manager.cpp` + - `regression-test/suites/cloud_p0/cache/ttl/...` +- **User-provided context**: this is a temporary fix for TTL inconsistency on branch-4.0/3.1. Stability and cache correctness are the priority; changing TTL semantics from latest-write based to tablet-creation based is acceptable for the target dynamic-partition-heavy workloads. + +## 2. Risk Analysis +- **Primary risk**: an old tablet may stop placing newly written data into the TTL queue because TTL is now anchored to tablet creation time. 
+- **Secondary risks**: + - one remaining path may still calculate expiration from rowset time and reintroduce rename/modify drift + - `sync_meta` may update segment data but miss inverted index cache entries + - metrics-based regression may be flaky if cache population is too small +- **Backward compatibility concerns**: file-cache TTL semantics change from “relative to latest rowset write” to “relative to tablet creation time”. + +## 3. Test Scope +- **Target suites**: `regression-test/suites/cloud_p0/cache/ttl` +- **Excluded areas**: multi-cluster warmup orchestration, restore/clone end-to-end, non-cloud suites +- **Required environment**: cloud docker regression environment with one alive BE in the target cluster + +## 4. Test Matrix +| Scenario | Setup | Action | Expected Result | Signal/Metric | +| --- | --- | --- | --- | --- | +| TTL on a fresh tablet still enters TTL queue | Create table with `file_cache_ttl_seconds`, load/query data soon after create | Load or query to populate file cache | TTL cache is populated | `ttl_cache_size > 0` | +| Late write on an old tablet does not refresh TTL | Create table with short TTL, wait past TTL before the first load on that table | Load/query after waiting | New cache entries are treated as non-TTL | `ttl_cache_size == 0`, `normal_queue_cache_size > 0` | + +## 5. Data & Setup +- **Tables/schemas**: simple single-bucket table in cloud cache suite +- **Test data**: generated rows via `INSERT INTO ... SELECT FROM numbers(...)` with a sufficiently wide string payload; avoid `BROKER LOAD` / external object-store dependencies +- **Config toggles**: + - `enable_evict_file_cache_in_advance=false` + - `file_cache_enter_disk_resource_limit_mode_percent=99` + +## 6. 
Assertions +- **Result correctness**: + - TTL queue is used for fresh tablets + - TTL queue is not used for late writes to old tablets +- **Metrics / logs / profiles**: + - validate `ttl_cache_size` + - validate `normal_queue_cache_size` +- **Negative cases**: + - no reliance on `newest_write_timestamp` refresh behavior + +## 7. Implementation Notes +- **Candidate test files**: + - add `regression-test/suites/cloud_p0/cache/ttl/test_ttl_creation_time_anchor.groovy` +- **Reuse existing utilities**: + - cache clear HTTP helper + - metrics scraping via `/brpc_metrics` + - docker `ClusterOptions` cloud-mode wrapper for a self-contained regression environment +- **Cleanup requirements**: + - clear file cache before the test + - drop created tables after the test + +## 8. Approval Checklist +- [x] Plan reviewed by user through the request to proceed end-to-end in this turn +- [x] User approved implementation diff --git a/ttl_fix_ut_plan.md b/ttl_fix_ut_plan.md new file mode 100644 index 00000000000000..a486acadc594ef --- /dev/null +++ b/ttl_fix_ut_plan.md @@ -0,0 +1,53 @@ +# Unit Test Plan + +## Context +- Change source: current branch `ttl-filecache-fix`, replacing the temporary `newest_write_timestamp` anchor with `tablet_meta->creation_time()` for file-cache TTL expiration. +- Intent: keep file-cache TTL expiration stable across write/read/warmup/sync_meta without changing `newest_write_timestamp` semantics. 
+ +## Scope summary +- Files/modules touched: + - `be/src/io/cache/file_cache_expiration.h` + - `be/src/olap/rowset/rowset_writer_context.h` + - `be/src/olap/rowset/rowset_reader_context.h` + - `be/src/olap/rowset/beta_rowset_reader.cpp` + - `be/src/olap/tablet_reader.cpp` + - `be/src/cloud/cloud_tablet.cpp` + - `be/src/cloud/cloud_internal_service.cpp` + - `be/src/cloud/cloud_warm_up_manager.cpp` + - `be/src/olap/tablet.cpp` + - `be/src/cloud/cloud_tablet.cpp` +- Behaviors changed: + - File-cache TTL expiration is now computed from a stable tablet-level base timestamp. + - `newest_write_timestamp` remains a rowset freshness field and is no longer repurposed for file-cache TTL. + - Pending rowset initialization no longer persists `newest_write_timestamp` early for TTL alignment. + +## Test cases +1. `calc_file_cache_expiration_time_returns_zero_for_invalid_or_expired_base` + - Purpose: verify clamp behavior is still correct for invalid base, expired base, and overflow. + - Setup/fixtures: direct helper test. + - Inputs: `base_timestamp <= 0`, expired `base_timestamp + ttl`, overflow edge. + - Expected outcomes: helper returns `0`. + - Mocks/stubs: none. +2. `calc_file_cache_expiration_time_uses_base_timestamp` + - Purpose: verify the helper preserves the stable anchor semantics. + - Setup/fixtures: direct helper test. + - Inputs: valid `base_timestamp`, positive `ttl_seconds`. + - Expected outcomes: helper returns `base_timestamp + ttl_seconds`. + - Mocks/stubs: none. +3. `rowset_writer_context_uses_file_cache_base_timestamp` + - Purpose: ensure writer-side cache expiration uses `file_cache_base_timestamp` rather than `newest_write_timestamp`. + - Setup/fixtures: instantiate `RowsetWriterContext`. + - Inputs: different values for `file_cache_base_timestamp` and `newest_write_timestamp`. + - Expected outcomes: `get_file_writer_options().file_cache_expiration` follows the base timestamp. + - Mocks/stubs: none. 
+ +## Coverage matrix (optional) +- Helper clamp/overflow -> test cases 1 and 2 +- Writer anchor propagation -> test case 3 + +## Risks & gaps +- Read/warmup/sync_meta integration is hard to unit-test directly without large fixtures; that coverage will be carried by regression tests and targeted code inspection. +- No direct unit coverage for clone/restore semantics in this patch. + +## Approvals +- [x] Plan approved by user through the request to proceed end-to-end in this turn From 1ac12b7246ce153194da4526cccdc98e1b42ca7f Mon Sep 17 00:00:00 2001 From: zhengyu Date: Tue, 14 Apr 2026 18:35:36 +0800 Subject: [PATCH 3/3] [fix](filecache) drop temporary TTL design docs from branch-4.0 PR Remove the temporary TTL design and test-planning markdown files from the branch-4.0 PR. These notes are useful for local analysis, but they should not be part of the backport patch itself. Deleted files: - ttl_fix.md - ttl_fix_final.md - ttl_fix_regression_plan.md - ttl_fix_ut_plan.md --- ttl_fix.md | 207 ------------------------------------- ttl_fix_final.md | 163 ----------------------------- ttl_fix_regression_plan.md | 66 ------------ ttl_fix_ut_plan.md | 53 ---------- 4 files changed, 489 deletions(-) delete mode 100644 ttl_fix.md delete mode 100644 ttl_fix_final.md delete mode 100644 ttl_fix_regression_plan.md delete mode 100644 ttl_fix_ut_plan.md diff --git a/ttl_fix.md b/ttl_fix.md deleted file mode 100644 index 0a7b2e1601e587..00000000000000 --- a/ttl_fix.md +++ /dev/null @@ -1,207 +0,0 @@ -# Doris BE FileCache TTL/Expiration 临时修复方案(切换到 Tablet Creation Time) - -本文基于当前临时修复分支上的实现与 review 结论,重新整理 FileCache TTL 的短期修复方案。目标不是恢复“理想 TTL 语义”,而是在正式 TTL 重构上线前,优先解决 **expiration_time 在 query / write / warmup / sync_meta 间不一致** 导致的 cache 漂移、rename 抖动和残留目录问题。 - -> 约定: -> - `ttl_seconds` 是 duration(秒数) -> - `expiration_time` 是绝对时间戳(Unix epoch seconds) -> - `base_timestamp` 是本次临时修复中用于计算 file cache expiration 的稳定时间基准 -> - “磁盘 TTL”指 `FSFileCacheStorage` 目录名中编码的 `.../_/...` - -## 0. 
结论先行 - -临时修复不再尝试“前移并复用 `newest_write_timestamp`”,而是: - -1. 保留统一的 `expiration_time` helper,统一 clamp / overflow / invalid-base 规则; -2. 将 file cache TTL 的 `base_timestamp` 统一切换为 `tablet_meta->creation_time()`; -3. 明确 `newest_write_timestamp` 继续只承担 rowset freshness 语义,不再作为 file cache TTL 基准; -4. `query / write / warmup / sync_meta` 四条链路全部使用同一套 `creation_time + ttl_seconds` 计算; -5. `sync_meta` 在 TTL 属性变化时,不仅更新 segment data cache,也要同步更新 inverted index cache,避免只改一半。 - -## 1. 为什么放弃 `newest_write_timestamp` 方案 - -上一版思路是: - -- 保持 `expiration_time = newest_write_timestamp + ttl_seconds` -- 为了让 write/read/warmup 使用同一个时间基准,把 pending rowset 的 `newest_write_timestamp` 提前写入 rowset meta - -这个方向虽然能缓解 `t0/t1` 分裂,但 review 后确认有两个问题: - -1. **它改变了 `newest_write_timestamp` 的既有语义** - - cooldown、warmup_delta_data 的“最近写入”判定、cloud freshness fallback 等逻辑,都默认它表达 rowset 的写入新鲜度; - - 把它前移到 writer 初始化时刻,本质上会让 rowset“看起来更老”。 - -2. **它修的是 file cache TTL,不该顺带改其他功能的时间语义** - - 临时修复应该把影响面收敛在 TTL 自身; - - 如果为了修 TTL 而重定义 `newest_write_timestamp`,副作用范围会超出本次修复目标。 - -所以这次方案明确回退这部分改动:**撤掉 pending rowset 提前持久化 `newest_write_timestamp` 的实现。** - -## 2. 为什么接受 `tablet_meta->creation_time()` 作为 base - -TTL 当前的问题,本质上不是 “+ ttl” 这一步,而是 `base` 不稳定。 - -使用 `tablet_meta->creation_time()` 的理由: - -1. **稳定** - - tablet 创建后该值天然固定,不会在 import/build/commit/query/warmup 间漂移。 - -2. **天然是 tablet 级别** - - file cache TTL 的竞争对象实际是 tablet 下同一远端对象对应的 cache hash; - - 用 tablet 级时间基准比 rowset 级时间基准更容易保证全链路一致。 - -3. **不影响 `newest_write_timestamp` 现有语义** - - cooldown、warmup_delta_data 等依赖“最近写入时间”的逻辑可以保持不变; - - TTL 修复只影响 file cache expiration 的计算。 - -4. **符合当前业务接受范围** - - 这个临时方案主要面向 cloud + dynamic partition 场景; - - partition/tablet 会持续新建,TTL 按 tablet 创建时间计算通常是可接受的。 - -## 3. 语义变化与接受范围 - -切到 `tablet_meta->creation_time()` 后,TTL 语义会从: - -- “相对最新写入时间过期” - -变为: - -- “相对 tablet 创建时间过期” - -这意味着: - -1. **晚到写入不会刷新 TTL** - - 老 tablet 在 TTL 窗口过去之后,即使有新写入,新的 cache block 也会按 `expiration_time = 0` 进入 normal queue。 - -2. 
**TTL 不再表达数据新鲜度,而表达 tablet 生命周期窗口** - - 这不是最终理想语义,但它是本次临时修复有意接受的 tradeoff。 - -3. **creation_time 非法时统一降级为非 TTL** - - 若 `creation_time <= 0`,则 `expiration_time = 0`; - - 这样不会制造新的多值竞争。 - -本次临时修复明确接受上述语义变化,优先保证稳定性、一致性和不影响其他逻辑。 - -## 4. 统一 helper:只保留一套计算规则 - -统一 helper 形态: - -```c++ -int64_t calc_file_cache_expiration_time(int64_t base_timestamp, int64_t ttl_seconds) { - if (ttl_seconds <= 0 || base_timestamp <= 0) { - return 0; - } - if (base_timestamp > std::numeric_limits::max() - ttl_seconds) { - return 0; - } - int64_t expiration_time = base_timestamp + ttl_seconds; - return expiration_time > UnixSeconds() ? expiration_time : 0; -} -``` - -规则统一为: - -1. `ttl_seconds <= 0` -> `0` -2. `base_timestamp <= 0` -> `0` -3. 溢出 -> `0` -4. 已过期 -> clamp 到 `0` - -这样可以避免 query / write / warmup / sync_meta 因为“是否 clamp”或“边界值不同”再次分叉。 - -## 5. 代码落点 - -### 5.1 Write Path - -写路径不再使用 `newest_write_timestamp` 算 file-cache expiration,而是显式传递: - -- `RowsetWriterContext.file_cache_base_timestamp = tablet_meta->creation_time()` - -然后在 `RowsetWriterContext::get_file_writer_options()` 中统一调用 helper: - -- `calc_file_cache_expiration_time(file_cache_base_timestamp, file_cache_ttl_sec)` - -这样导入写 cache 时固化下来的 expiration,与后续 query/warmup/sync_meta 的稳定基准一致。 - -### 5.2 Read Path - -读路径通过 tablet reader 将稳定基准下传到 rowset reader: - -- `RowsetReaderContext.file_cache_base_timestamp = tablet->tablet_meta()->creation_time()` - -`BetaRowsetReader` 不再读取 rowset 的 `newest_write_timestamp` 来算 TTL。 - -### 5.3 Warmup Path - -三类 warmup 入口统一使用: - -- `calc_file_cache_expiration_time(tablet_meta->creation_time(), tablet_meta->ttl_seconds())` - -覆盖: - -- `cloud_tablet.cpp`(sync rowset warmup) -- `cloud_internal_service.cpp`(event-driven warmup) -- `cloud_warm_up_manager.cpp`(job warmup) - -### 5.4 Sync Meta - -当 `sync_meta` 发现 `tablet_meta->creation_time()` 或 `tablet_meta->ttl_seconds()` 发生变化时: - -1. 重新计算目标 expiration: - - `calc_file_cache_expiration_time(tablet_meta->creation_time(), new_ttl_seconds)` -2. 
对当前 tablet 下所有 cached segment data 调 `modify_expiration_time` -3. **对当前 tablet 下所有 cached inverted index 也调 `modify_expiration_time`** - -最后一点很重要。若只改 segment,不改 index,会留下同一 rowset 内 data/index TTL 分叉的问题。 - -## 6. 不做的事情 - -本次临时修复不做: - -1. 不恢复“写入刷新 TTL”的旧语义 -2. 不尝试兼容频繁 `ALTER TABLE TTL` -3. 不做跨 clone / restore / rebuild 的更复杂时间语义修正 -4. 不保证这是最终最优解,正式 TTL 重构仍然是长期方案 - -## 7. 验收标准 - -### 7.1 行为一致性 - -同一个 tablet 内同一个远端对象,在以下路径中使用相同的 `base_timestamp`: - -- query -- write -- warmup -- sync_meta - -### 7.2 语义隔离 - -- file cache TTL 不再依赖 `newest_write_timestamp` -- `newest_write_timestamp` 语义保持不变,不影响 cooldown、warmup_delta_data 等逻辑 - -### 7.3 边界统一 - -- 非法 base、已过期、overflow 都统一得到 `expiration_time = 0` - -### 7.4 TTL 属性变更后的完整性 - -- `sync_meta` 修改 expiration 时,segment data 和 inverted index 都被覆盖 - -## 8. 建议验证 - -1. **UT** - - helper 对 invalid / expired / overflow 的 clamp 行为 - - writer-side `file_cache_base_timestamp` 传播 - -2. **Cloud docker regression** - - 新创建 tablet 立即写入,TTL queue 仍生效 - - 创建 table 后等待超过 TTL 再写入,cache 不再进入 TTL queue,而进入 normal queue - -3. **Code inspection** - - 确认没有剩余路径仍使用 `newest_write_timestamp` 计算 file-cache expiration - -## 9. 适用分支 - -- `branch-4.0` -- `branch-3.1` - -`master / branch-4.1 / cloud-26.1` 已有正式修复路径,和本临时方案方向不同,不应直接 pick 本补丁。 diff --git a/ttl_fix_final.md b/ttl_fix_final.md deleted file mode 100644 index db52fed8596936..00000000000000 --- a/ttl_fix_final.md +++ /dev/null @@ -1,163 +0,0 @@ -目标说明 -TTL 重构已经可以解决所有 TTL 已知问题,但是由于重构上线周期很长,在这段时间里需要一个范围受控的临时修复方案。 - -我们要做的: -- 使用 ttl 后系统稳定,没有 cache 空间泄漏问题,没有 NOT FOUND miss 问题 -- 保证 query / write / warmup / sync_meta 对同一对象得到一致的 expiration_time -- 不改变 `newest_write_timestamp` 在 cooldown、warmup_delta_data 等现有逻辑里的语义 -在临时修复方案里,我们不做: -- 完美优雅的修复(重构已经做了这部分工作) -- 允许用户频繁 alter table ttl 属性 -- 保持“新写入会刷新 TTL”这类旧语义 - -问题 -内存中的 FileCacheKey.meta.expiration_time 和 磁盘目录路径里的 .../_/... 两者不一致。 - -这类不一致会表现为: -1. 同一个 hash 在不同入口(query/warmup/import)下反复触发 modify_expiration_time / rename; -2. 
磁盘目录中出现“旧 expiration 目录残留”,但内存里已是“新 expiration”; -3. 导入后立即查询命中同 hash 时,出现 TTL 行为漂移(是否进入 TTL queue、是否被视为过期) 。 - -- ttl_seconds:持续时长(duration,秒)。 -- expiration_time:绝对时间戳(Unix seconds)。 -- “磁盘 TTL”指 FSFileCacheStorage 的目录名后缀:_。 -- base_timestamp:本次临时修复里用于计算 file cache TTL 的稳定基准时间,统一取 `tablet_meta->creation_time()`。 - -TTL 端到端传递路径分析 -1. 查询路径 -1. 元数据来源 - - TabletMeta::ttl_seconds() - - TabletMeta::creation_time() -2. 读取时计算 expiration - - be/src/olap/rowset/beta_rowset_reader.cpp - - 写入 _read_options.io_ctx.expiration_time -3. 进入 file cache - - CachedRemoteFileReader / BlockFileCache::get_or_set - - CacheContext.expiration_time -> FileCacheKey.meta.expiration_time -4. 落磁盘 - - FSFileCacheStorage::get_path_in_local_cache_v2(hash, expiration_time) - - 目录编码为 _ -结论:query 侧是“内存 key + 磁盘目录”共享同一个 `io_ctx.expiration_time` 的主路径。 - -2. Warmup 路径(Sync Rowset / Job / Event-driven) - -入口三类: -- be/src/cloud/cloud_tablet.cpp(add_rowsets(... warmup_delta_data)) -- be/src/cloud/cloud_warm_up_manager.cpp(周期/一次性 warmup) -- be/src/cloud/cloud_internal_service.cpp(事件驱动 warmup) -流程: - -1. 构造 DownloadFileMeta.ctx.expiration_time -2. BlockFileCacheDownloader 下载并写 cache -3. 进入 BlockFileCache、最终落盘到 _ -结论:warmup 是第二条“主动写 cache”的主路径,如果计算规则与 query/import 不一致,会直接制造同 hash 不同 expiration。 - -3. 导入写路径 -核心链路: -1. Rowset writer context 初始化 - - context.file_cache_base_timestamp - - context.file_cache_ttl_sec = tablet->ttl_seconds() -2. 生成 file writer options - - be/src/olap/rowset/rowset_writer_context.h - - FileWriterOptions.file_cache_expiration -3. 远端写 + 本地 cache - - S3FileWriter::appendv - - UploadFileBuffer::on_upload - - upload_to_local_file_cache -4. cache builder 把 expiration 带入 block key - - FileWriter::init_cache_builder - - FileCacheAllocatorBuilder::allocate_cache_holder - - CacheContext.expiration_time -> FileCacheKey.meta.expiration_time -5. 
落盘路径 - - FSFileCacheStorage 目录名 _ -结论:导入路径会在“写入当下”把 expiration 固化到 cache block;如果 base_timestamp 不稳定,就会和 query/warmup 产生长期分叉。 - -可能导致“不一致”的主要场景 -以下按影响与复现概率排序。 -A. 读写 clamp 规则不一致(过期归零 vs 不归零) -触发条件: -- query 侧对 newest_write_timestamp + ttl <= now 做 clamp(变 0) -- import/warmup 某些路径仍写非 0 绝对时间 -结果: -- 同 hash 在 query 得到 expiration=0,在导入/预热得到 expiration>0 -- modify_expiration_time/rename 高频触发 -- 短期内出现“磁盘目录还是旧 expiration,内存已切新 expiration” - -B. 以 rowset write time 为基准不稳定(pending rowset t0/t1 分裂只是其中一种表现) -触发条件: -- 不同入口依赖 rowset 级别时间戳(尤其是 newest_write_timestamp) -- 同一对象在导入、build/commit、query、warmup 看到的“写入时间”并不天然一致 -结果: -- 导入写 cache 可能使用 t0 + ttl -- 查询/预热读取 rowset_meta 可能使用 t1 + ttl -- 未改 TTL property 也会出现一致性问题 -这是你观察到“没有 alter ttl 也不一致”的关键来源之一。 - -C. Warmup 路径自成一套计算规则 -触发条件: -- warmup 入口(sync_rowset/job/event-driven)未完全复用 query/write 的统一计算 -- 或某个入口未做 clamp -结果: -- warmup 成为第三个 expiration_time 来源 -- 同 hash 在 query/import/warmup 三端形成三值竞争 - -修复方案 -A. 读侧 clamp(过期归零) vs 写侧不 clamp(仍写非 0 绝对时间) -抽象统一的 expiration 计算函数,并在写路径补齐 clamp -int64_t calc_file_cache_expiration(int64_t base_timestamp, int64_t ttl_seconds) { - if (ttl_seconds <= 0 || base_timestamp <= 0) return 0; - int64_t exp = base_timestamp + ttl_seconds; // 需溢出保护 - return exp > UnixSeconds() ? exp : 0; -} -- Query:be/src/olap/rowset/beta_rowset_reader.cpp -- Write:be/src/olap/rowset/rowset_writer_context.h -- Warmup:be/src/cloud/cloud_internal_service.cpp、be/src/cloud/cloud_tablet.cpp、be/src/cloud/cloud_warm_up_manager.cpp -- SyncMeta:be/src/cloud/cloud_tablet.cpp -修完后,写入 _ 与后续 query/warmup 的规则一致,消除“写非0、读0”的系统性差异。 - -B. 使用稳定的 tablet 级基准,避免复用 `newest_write_timestamp` -以 TabletMeta/TabletMetaSharedPtr 的 creation_time() 作为 file cache expiration 基准 -思路: -- 不再用 newest_write_timestamp + ttl; -- 改为 tablet_meta->creation_time() + ttl(并做 clamp); -- query/write/warmup/sync_meta 统一显式传递这个稳定基准。 -核心收益: -1. 规避 B 类 t0/t1 漂移; -2. 基准稳定,减少同 hash 的反复改 expiration; -3. 
不需要修改 newest_write_timestamp 语义,避免影响 cooldown、warmup_delta_data 等其他逻辑。 -风险/语义变化(很重要): -1. TTL 语义从“相对最新写入”变为“相对 tablet 创建时间”; -2. 新写入不会延长缓存寿命; -3. clone/restore/rebuild 场景下 creation_time 语义可能不等价于“数据新鲜度”; -4. 历史 meta 若 creation_time 缺失(0)需定义 fallback,本次临时修复统一按非 TTL(expiration=0)处理; -5. 若目标是“随写入刷新 TTL”,该方案有意放弃这种语义,优先保证稳定性与一致性。 - -C. Warmup 路径的 expiration 计算规则不一致(尤其缺少 clamp) - -warmup 与 query/write 完全复用同一套 calc + clamp -把 warmup 入口统一改成: -- expiration_time = calc_file_cache_expiration(tablet_meta->creation_time(), ttl_seconds) -至少覆盖: -- be/src/cloud/cloud_internal_service.cpp -- be/src/cloud/cloud_tablet.cpp -- be/src/cloud/cloud_warm_up_manager.cpp - -D. `sync_meta` 发现 TTL anchor 相关元数据变化时,data/index 都要一起更新 - -当 `sync_meta` 发现 `tablet_meta->creation_time()` 或 `tablet_meta->ttl_seconds()` 变化时: -- 重新计算 `calc_file_cache_expiration(tablet_meta->creation_time(), new_ttl_seconds)` -- 用这个统一结果更新 segment data cache -- 同时更新 inverted index cache - -如果只更新 segment 而漏掉 index,仍会残留同一 rowset 内部的 TTL 视图不一致问题。 - -验收标准 -1. query/write/warmup/sync_meta 四条链路对同一 tablet 内对象使用相同的 base_timestamp。 -2. file cache TTL 计算不再依赖 newest_write_timestamp,因此不改变 cooldown、warmup_delta_data 等现有逻辑语义。 -3. 对于 old tablet 的 late write,TTL 不会被“刷新”;这属于本次临时修复引入且接受的语义变化。 -4. 已过期或非法 base_timestamp 统一落到 expiration_time=0,避免同 hash 多值竞争。 -5. `sync_meta` 在 TTL anchor 相关元数据变化时,segment data 和 inverted index 都会一起迁移到新的 expiration。 - -版本/分支 -master/branch-4.1/cloud-26.1 带了正式的修复,与临时修复冲突,所以不用也不能 pick -branch-4.0 + branch-3.1 版本可以 pick diff --git a/ttl_fix_regression_plan.md b/ttl_fix_regression_plan.md deleted file mode 100644 index 0b805a7e4d2ae5..00000000000000 --- a/ttl_fix_regression_plan.md +++ /dev/null @@ -1,66 +0,0 @@ -# Doris Regression Test Plan Template - -## 1. Summary of Change -- **Intent**: switch the temporary file-cache TTL anchor from `newest_write_timestamp` to `tablet_meta->creation_time()` so expiration is stable across write/read/warmup/sync_meta without affecting rowset freshness semantics. 
-- **Affected modules/files**: - - `be/src/io/cache/file_cache_expiration.h` - - `be/src/olap/rowset/rowset_writer_context.h` - - `be/src/olap/rowset/rowset_reader_context.h` - - `be/src/olap/rowset/beta_rowset_reader.cpp` - - `be/src/olap/tablet_reader.cpp` - - `be/src/cloud/cloud_internal_service.cpp` - - `be/src/cloud/cloud_tablet.cpp` - - `be/src/cloud/cloud_warm_up_manager.cpp` - - `regression-test/suites/cloud_p0/cache/ttl/...` -- **User-provided context**: this is a temporary fix for TTL inconsistency on branch-4.0/3.1. Stability and cache correctness are the priority; changing TTL semantics from latest-write based to tablet-creation based is acceptable for the target dynamic-partition-heavy workloads. - -## 2. Risk Analysis -- **Primary risk**: an old tablet may stop placing newly written data into the TTL queue because TTL is now anchored to tablet creation time. -- **Secondary risks**: - - one remaining path may still calculate expiration from rowset time and reintroduce rename/modify drift - - `sync_meta` may update segment data but miss inverted index cache entries - - metrics-based regression may be flaky if cache population is too small -- **Backward compatibility concerns**: file-cache TTL semantics change from “relative to latest rowset write” to “relative to tablet creation time”. - -## 3. Test Scope -- **Target suites**: `regression-test/suites/cloud_p0/cache/ttl` -- **Excluded areas**: multi-cluster warmup orchestration, restore/clone end-to-end, non-cloud suites -- **Required environment**: cloud docker regression environment with one alive BE in the target cluster - -## 4. 
Test Matrix -| Scenario | Setup | Action | Expected Result | Signal/Metric | -| --- | --- | --- | --- | --- | -| TTL on a fresh tablet still enters TTL queue | Create table with `file_cache_ttl_seconds`, load/query data soon after create | Load or query to populate file cache | TTL cache is populated | `ttl_cache_size > 0` | -| Late write on an old tablet does not refresh TTL | Create table with short TTL, wait past TTL before the first load on that table | Load/query after waiting | New cache entries are treated as non-TTL | `ttl_cache_size == 0`, `normal_queue_cache_size > 0` | - -## 5. Data & Setup -- **Tables/schemas**: simple single-bucket table in cloud cache suite -- **Test data**: generated rows via `INSERT INTO ... SELECT FROM numbers(...)` with a sufficiently wide string payload; avoid `BROKER LOAD` / external object-store dependencies -- **Config toggles**: - - `enable_evict_file_cache_in_advance=false` - - `file_cache_enter_disk_resource_limit_mode_percent=99` - -## 6. Assertions -- **Result correctness**: - - TTL queue is used for fresh tablets - - TTL queue is not used for late writes to old tablets -- **Metrics / logs / profiles**: - - validate `ttl_cache_size` - - validate `normal_queue_cache_size` -- **Negative cases**: - - no reliance on `newest_write_timestamp` refresh behavior - -## 7. Implementation Notes -- **Candidate test files**: - - add `regression-test/suites/cloud_p0/cache/ttl/test_ttl_creation_time_anchor.groovy` -- **Reuse existing utilities**: - - cache clear HTTP helper - - metrics scraping via `/brpc_metrics` - - docker `ClusterOptions` cloud-mode wrapper for a self-contained regression environment -- **Cleanup requirements**: - - clear file cache before the test - - drop created tables after the test - -## 8. 
Approval Checklist -- [x] Plan reviewed by user through the request to proceed end-to-end in this turn -- [x] User approved implementation diff --git a/ttl_fix_ut_plan.md b/ttl_fix_ut_plan.md deleted file mode 100644 index a486acadc594ef..00000000000000 --- a/ttl_fix_ut_plan.md +++ /dev/null @@ -1,53 +0,0 @@ -# Unit Test Plan - -## Context -- Change source: current branch `ttl-filecache-fix`, replacing the temporary `newest_write_timestamp` anchor with `tablet_meta->creation_time()` for file-cache TTL expiration. -- Intent: keep file-cache TTL expiration stable across write/read/warmup/sync_meta without changing `newest_write_timestamp` semantics. - -## Scope summary -- Files/modules touched: - - `be/src/io/cache/file_cache_expiration.h` - - `be/src/olap/rowset/rowset_writer_context.h` - - `be/src/olap/rowset/rowset_reader_context.h` - - `be/src/olap/rowset/beta_rowset_reader.cpp` - - `be/src/olap/tablet_reader.cpp` - - `be/src/cloud/cloud_tablet.cpp` - - `be/src/cloud/cloud_internal_service.cpp` - - `be/src/cloud/cloud_warm_up_manager.cpp` - - `be/src/olap/tablet.cpp` - - `be/src/cloud/cloud_tablet.cpp` -- Behaviors changed: - - File-cache TTL expiration is now computed from a stable tablet-level base timestamp. - - `newest_write_timestamp` remains a rowset freshness field and is no longer repurposed for file-cache TTL. - - Pending rowset initialization no longer persists `newest_write_timestamp` early for TTL alignment. - -## Test cases -1. `calc_file_cache_expiration_time_returns_zero_for_invalid_or_expired_base` - - Purpose: verify clamp behavior is still correct for invalid base, expired base, and overflow. - - Setup/fixtures: direct helper test. - - Inputs: `base_timestamp <= 0`, expired `base_timestamp + ttl`, overflow edge. - - Expected outcomes: helper returns `0`. - - Mocks/stubs: none. -2. `calc_file_cache_expiration_time_uses_base_timestamp` - - Purpose: verify the helper preserves the stable anchor semantics. - - Setup/fixtures: direct helper test. 
- - Inputs: valid `base_timestamp`, positive `ttl_seconds`. - - Expected outcomes: helper returns `base_timestamp + ttl_seconds`. - - Mocks/stubs: none. -3. `rowset_writer_context_uses_file_cache_base_timestamp` - - Purpose: ensure writer-side cache expiration uses `file_cache_base_timestamp` rather than `newest_write_timestamp`. - - Setup/fixtures: instantiate `RowsetWriterContext`. - - Inputs: different values for `file_cache_base_timestamp` and `newest_write_timestamp`. - - Expected outcomes: `get_file_writer_options().file_cache_expiration` follows the base timestamp. - - Mocks/stubs: none. - -## Coverage matrix (optional) -- Helper clamp/overflow -> test case 1; stable base-anchor semantics -> test case 2 -- Writer anchor propagation -> test case 3 - -## Risks & gaps -- Read/warmup/sync_meta integration is hard to unit-test directly without large fixtures; that coverage will be carried by regression tests and targeted code inspection. -- No direct unit coverage for clone/restore semantics in this patch. - -## Approvals -- [x] Plan approved by user through the request to proceed end-to-end in this turn