From 89d75519a79ce029a78c5d71cc12dd8a57b95060 Mon Sep 17 00:00:00 2001 From: yiguolei Date: Wed, 10 Jun 2026 11:05:52 +0800 Subject: [PATCH 1/2] [improvement](cgroup) inactive_file should be treated as available memory to avoid query be cancelled --- be/src/common/cgroup_memory_ctl.cpp | 23 +++++++++++++++---- be/src/util/cgroup_util.cpp | 7 ++++++ be/src/util/mem_info.cpp | 16 ++++++------- .../meta_service_rate_limit_helper.cpp | 2 +- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp index dddcbd50338d82..1269026325fc1a 100644 --- a/be/src/common/cgroup_memory_ctl.cpp +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -93,6 +93,8 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); } + // This means no limit, for example, all process in linux will belong to a cgroup, and + // the default value of the memory limit in memory.max file is "max", which means no limit. if (line == "max") { *value = std::numeric_limits::max(); return Status::OK(); @@ -107,15 +109,28 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { std::unordered_map metrics_map; CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), metrics_map); - if (*value < metrics_map["inactive_file"]) { + int64_t inactive_file = + metrics_map.contains("inactive_file") ? metrics_map["inactive_file"] : 0; + int64_t active_file = metrics_map.contains("active_file") ? metrics_map["active_file"] : 0; + int64_t slab_reclaimable = + metrics_map.contains("slab_reclaimable") ? metrics_map["slab_reclaimable"] : 0; + if (inactive_file < 1 || active_file < 1 || slab_reclaimable < 1) { + // In this scenario, not return error, ignore it and print log. + LOG(WARNING) << "CgroupsV2Reader read_memory_usage missing expected metrics in " + "memory.stat, inactive_file: " + << inactive_file << ", active_file: " << active_file + << ", slab_reclaimable: " << slab_reclaimable; + } + + const int64_t reclaimable_usage = inactive_file + active_file + slab_reclaimable; + if (*value < reclaimable_usage) { return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); } - // the reason why we subtract inactive_file described here: + // The reclaimable file cache described here should not be counted as used memory: // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 - *value -= metrics_map["inactive_file"]; // Part of "slab" that might be reclaimed, such as dentries and inodes. // https://arthurchiao.art/blog/cgroupv2-zh/ - *value -= metrics_map["slab_reclaimable"]; + *value -= reclaimable_usage; return Status::OK(); } diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index cfda0a4fb77d35..cc52a60ed75245 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -178,6 +178,9 @@ std::string CGroupUtil::cgroupv2_of_process() { } // With cgroups v2, there will be a *single* line with prefix "0::/" // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + // such as 0::/user.slice/user-1005.slice/session-213906.scope this is the cgroup name + // it should be combined with the default cgroup mount point to get the full path to the cgroup, e.g. + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope std::string cgroup; std::getline(cgroup_name_file, cgroup); static const std::string v2_prefix = "0::/"; @@ -198,6 +201,7 @@ std::optional CGroupUtil::get_cgroupsv2_path(const std::string& sub } std::string cgroup = CGroupUtil::cgroupv2_of_process(); + // /sys/fs/cgroup/user.slice/user-1005.slice/session-213906.scope auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); // Return the bottom-most nested current memory file. If there is no such file at the current @@ -259,6 +263,9 @@ void CGroupUtil::read_int_metric_from_cgroup_file( metrics_map[key] = value; } else if (fields[2] == "kB") { metrics_map[key] = value * 1024L; + } else { + LOG(WARNING) << "Unknown unit in cgroup file " << file_path.string() + << ", line: " << line; } } } diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index 4b48d40d2d93d2..ebd6f4a5442d13 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -94,7 +94,7 @@ void MemInfo::refresh_proc_meminfo() { if (meminfo.is_open()) { meminfo.close(); } - + _s_cgroup_mem_refresh_state = false; // refresh cgroup memory if (config::enable_use_cgroup_memory_info) { if (_s_cgroup_mem_refresh_wait_times >= 0) { @@ -119,12 +119,13 @@ void MemInfo::refresh_proc_meminfo() { // cgroup mem limit is refreshed every 10 seconds, // cgroup mem usage is refreshed together with memInfo every time, which is very frequent. + // If _s_cgroup_mem_limit == max, it means get cgroup mem limit failed OR the cgroup has no memory limit for example + // there is just "max" in memory.max file. if (_s_cgroup_mem_limit != std::numeric_limits::max()) { int64_t cgroup_mem_usage; auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); if (!status.ok()) { _s_cgroup_mem_usage = std::numeric_limits::min(); - _s_cgroup_mem_refresh_state = false; LOG_EVERY_N(WARNING, 500) << "Refresh cgroup memory usage failed, cgroup mem limit: " << _s_cgroup_mem_limit << ", " << status; @@ -132,17 +133,14 @@ void MemInfo::refresh_proc_meminfo() { _s_cgroup_mem_usage = cgroup_mem_usage; _s_cgroup_mem_refresh_state = true; } - } else { - _s_cgroup_mem_refresh_state = false; } - } else { - _s_cgroup_mem_refresh_state = false; } // 1. calculate physical_mem int64_t physical_mem = -1; - - physical_mem = _mem_info_bytes["MemTotal"]; + if (_mem_info_bytes.find("MemTotal") != _mem_info_bytes.end()) { + physical_mem = _mem_info_bytes["MemTotal"]; + } if (_s_cgroup_mem_refresh_state) { // In theory, always cgroup_mem_limit < physical_mem if (physical_mem < 0) { @@ -200,7 +198,7 @@ void MemInfo::refresh_proc_meminfo() { // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`, // from `MemAvailable` in `/proc/meminfo`, calculated by OS. // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`, - // `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact, + // `cgroup_mem_usage = memory.current - inactive_file - active_file - slab_reclaimable`, in fact, // there seems to be some memory that can be reused in `cgroup_mem_usage`. if (mem_available < 0) { mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage; diff --git a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp index 0f8c750be6cf3c..9af2aead05cca9 100644 --- a/cloud/src/meta-service/meta_service_rate_limit_helper.cpp +++ b/cloud/src/meta-service/meta_service_rate_limit_helper.cpp @@ -360,7 +360,7 @@ std::optional get_cgroup_memory_info() { } auto metrics = read_metrics_map(*dir / "memory.stat"); int64_t adjusted_usage = *usage; - adjusted_usage -= metrics["inactive_file"]; + adjusted_usage -= metrics["inactive_file"] + metrics["active_file"]; adjusted_usage -= metrics["slab_reclaimable"]; adjusted_usage = std::max(0, adjusted_usage); return CgroupMemoryInfo {limit_bytes, adjusted_usage}; From 0245a280ecd348cee7772ef9e9af09472a2b38f8 Mon Sep 17 00:00:00 2001 From: yiguolei Date: Wed, 10 Jun 2026 11:34:48 +0800 Subject: [PATCH 2/2] f --- be/src/common/cgroup_memory_ctl.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp index 1269026325fc1a..ceb7ae047946f9 100644 --- a/be/src/common/cgroup_memory_ctl.cpp +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -124,7 +124,10 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { const int64_t reclaimable_usage = inactive_file + active_file + slab_reclaimable; if (*value < reclaimable_usage) { - return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); + return Status::CgroupError( + "CgroupsV2Reader read_memory_usage negative memory usage, memory.current: {}, " + "active_file: {}, inactive_file: {}, slab_reclaimable: {}", + *value, active_file, inactive_file, slab_reclaimable); } // The reclaimable file cache described here should not be counted as used memory: // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667