From a62d1f177b7888ec88035a0a1ce600fbc2280ce7 Mon Sep 17 00:00:00 2001 From: Lipeng Zhu Date: Thu, 17 Oct 2024 18:37:10 +0800 Subject: [PATCH] Fix false sharing issue between main thread and io-threads when accessing `used_memory_thread`. (#1179) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When profiling some workloads with `io-threads` enabled, we found that the false sharing issue was severe. This patch tries to split the elements accessed by the main thread and the io-threads into different cache lines by padding the elements at the head of the `used_memory_thread_padded` array. This design helps mitigate the false sharing between the main thread and the io-threads, because the main thread has been the bottleneck with io-threads enabled. The reason we didn't put each element in an individual cache line is that we don't want to incur the additional cache line fetch operations (3 vs 16 cache lines) when calling functions like `zmalloc_used_memory()`. --------- Signed-off-by: Lipeng Zhu Signed-off-by: Lipeng Zhu Signed-off-by: Viktor Söderqvist Co-authored-by: Wangyang Guo Co-authored-by: Viktor Söderqvist --- src/zmalloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zmalloc.c b/src/zmalloc.c index 1cb01ee88c..e18fa8bac2 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -90,6 +90,7 @@ void zlibc_free(void *ptr) { #define thread_local _Thread_local +#define PADDING_ELEMENT_NUM (CACHE_LINE_SIZE / sizeof(size_t) - 1) #define MAX_THREADS_NUM (IO_THREADS_MAX_NUM + 3 + 1) /* A thread-local storage which keep the current thread's index in the used_memory_thread array. */ static thread_local int thread_index = -1; @@ -101,10 +102,11 @@ static thread_local int thread_index = -1; * For the other architecture, lets fall back to the atomic operation to keep safe. 
*/ #if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) || defined(__POWERPC__) || defined(__arm__) || \ defined(__arm64__) -static __attribute__((aligned(sizeof(size_t)))) size_t used_memory_thread[MAX_THREADS_NUM]; +static __attribute__((aligned(CACHE_LINE_SIZE))) size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM]; #else -static _Atomic size_t used_memory_thread[MAX_THREADS_NUM]; +static __attribute__((aligned(CACHE_LINE_SIZE))) _Atomic size_t used_memory_thread_padded[MAX_THREADS_NUM + PADDING_ELEMENT_NUM]; #endif +static size_t *used_memory_thread = &used_memory_thread_padded[PADDING_ELEMENT_NUM]; static atomic_int total_active_threads = 0; /* This is a simple protection. It's used only if some modules create a lot of threads. */ static atomic_size_t used_memory_for_additional_threads = 0;