Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,4 @@ python/setup.py

# Xmake
.xmake/
CMakePresets.json
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,15 @@ option(NCNN_SIMPLEVK "minimal in-house vulkan loader" ON)
option(NCNN_SYSTEM_GLSLANG "use system glslang library" OFF)
option(NCNN_RUNTIME_CPU "runtime dispatch cpu routines" ON)
option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
option(NCNN_BUILD_TESTS "build tests" OFF)
option(NCNN_BUILD_TESTS "build tests" ON)
option(NCNN_COVERAGE "build for coverage" OFF)
option(NCNN_ASAN "build for address sanitizer" OFF)
option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
option(NCNN_PYTHON "build python api" OFF)
option(NCNN_INT8 "int8 inference" ON)
option(NCNN_BF16 "bf16 inference" ON)
option(NCNN_FORCE_INLINE "force inline some function" ON)
option(NCNN_MUTITHREAD "enable multi thread beta" ON)

if(ANDROID OR IOS OR NCNN_SIMPLESTL)
option(NCNN_DISABLE_RTTI "disable rtti" ON)
Expand Down
12 changes: 12 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ if(ANDROID)
list(APPEND ncnn_SRCS mat_pixel_android.cpp)
endif()

if(NCNN_MUTITHREAD)
list(APPEND ncnn_SRCS thread.cpp)
if(WIN32)
list(APPEND ncnn_SRCS TheadInfo.cpp)
endif()
endif()

ncnn_src_group(ncnn_SRCS "sources")

include_directories("${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}")
Expand Down Expand Up @@ -266,6 +273,11 @@ if(NCNN_THREADS)
target_link_libraries(ncnn PUBLIC pthread)
endif()
endif()
if(NCNN_MUTITHREAD)
if(NOT WIN32 AND (NOT NCNN_SIMPLEOMP) AND (NOT NCNN_SIMPLESTL))
target_link_libraries(ncnn PUBLIC -pthread)
endif()
endif()

if(NCNN_VULKAN)
if(NCNN_SIMPLEVK)
Expand Down
69 changes: 69 additions & 0 deletions src/TheadInfo.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#ifdef NCNN_MUTITHREAD
#ifdef _WIN32

#include "TheadInfo.h"
namespace ncnn {

// Singleton instance storage; created lazily by ThreadInfo::get()
ThreadInfo* ThreadInfo::thread_info = nullptr;

ThreadInfo::ThreadInfo(/* args */)
{
    // Enumerate every logical processor in every Windows processor group,
    // so machines with more than 64 logical cpus are fully covered.
    int groupCount = GetActiveProcessorGroupCount();
    for (WORD group = 0; group < groupCount; group++)
    {
        DWORD processorsInGroup = GetActiveProcessorCount(group);
        for (int i = 0; i < static_cast<int>(processorsInGroup); i++)
        {
            CoreInfo info;
            info.group = group;
            // bug fix: the previous `i + core_infos.size()` double-counted
            // within a group (ids went 0,2,4,...). The running total of
            // already-recorded cores is the correct global sequential id.
            info.id = static_cast<int>(core_infos.size());
            // single-bit affinity mask for this cpu within its group
            info.affinity = (static_cast<DWORD_PTR>(1) << i);
            core_infos.push_back(info);
        }
    }
}

ThreadInfo* ThreadInfo::get()
{
    // Serialize first-time construction of the singleton; later calls
    // just take the lock briefly and return the cached instance.
    static Mutex singleton_lock;
    AutoLock guard(singleton_lock);

    if (thread_info == nullptr)
    {
        thread_info = new ThreadInfo();
    }

    return thread_info;
}

CoreInfo ThreadInfo::getCurrentCore()
{
    // Return the cached CoreInfo for the processor the calling thread is
    // executing on right now (processor-group aware).
    // Note: the previous GetProcessAffinityMask call was removed — its
    // results were never used (dead code).
    PROCESSOR_NUMBER proc_num;
    GetCurrentProcessorNumberEx(&proc_num);

    for (const auto& core : core_infos)
    {
        // match both the group number and the bit for the in-group index
        if (core.group == proc_num.Group && (core.affinity & (1ULL << proc_num.Number)))
        {
            return core;
        }
    }

    // not found: return a sentinel (id = -1, group = -1, empty affinity)
    return {-1, -1, 0};
}

void ThreadInfo::getAllCore(std::vector<CoreInfo>& out)
{
out = core_infos;
}
} // namespace ncnn

#endif
#endif
30 changes: 30 additions & 0 deletions src/TheadInfo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#ifndef THREAD_INFO_H
#define THREAD_INFO_H
#ifdef NCNN_MUTITHREAD
#if defined _WIN32
#include "cpu.h"
namespace ncnn {
// Describes one logical processor on Windows.
struct CoreInfo
{
public:
    int id;             // global sequential index across all processor groups
    int group;          // Windows processor group this cpu belongs to
    DWORD_PTR affinity; // single-bit affinity mask within that group
};
// Process-wide singleton caching the logical-processor topology
// (Windows processor groups) for the multithread scheduler.
// NOTE(review): the file name "TheadInfo.h" looks like a typo of
// "ThreadInfo.h" — consider renaming before merge.
class ThreadInfo
{
private:
    static ThreadInfo* thread_info;   // singleton instance, created by get()
    std::vector<CoreInfo> core_infos; // one entry per logical processor
    ThreadInfo(/* args */);           // private: enumerates cores once

public:
    static ThreadInfo* get();                     // lazily create and return the singleton
    CoreInfo getCurrentCore();                    // core the calling thread currently runs on
    void getAllCore(std::vector<CoreInfo>& out);  // copy of the full core table
};
} // namespace ncnn

#endif
#endif
#endif
54 changes: 51 additions & 3 deletions src/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1424,12 +1424,21 @@ static std::vector<int> get_max_freq_mhz()

// Bind the calling thread to the cpus described by thread_affinity_mask.
// Returns 0 on success, -1 on failure.
// NOTE(review): both branches call Win32 APIs, so this overload appears to
// live in a _WIN32-only region of cpu.cpp — confirm the inner #ifdef is
// actually needed.
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
#ifdef _WIN32
    // processor-group-aware path: honors cpu_group for machines with > 64 cpus
    GROUP_AFFINITY groupAffinity;
    ZeroMemory(&groupAffinity, sizeof(groupAffinity));
    groupAffinity.Group = static_cast<WORD>(thread_affinity_mask.cpu_group);
    groupAffinity.Mask = thread_affinity_mask.mask;

    // bug fix: the return value was previously ignored; report failure the
    // same way the legacy path below does
    if (!SetThreadGroupAffinity(GetCurrentThread(), &groupAffinity, NULL))
    {
        NCNN_LOGE("SetThreadGroupAffinity failed %d", GetLastError());
        return -1;
    }
#else
    DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
    if (prev_mask == 0)
    {
        NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
        return -1;
    }
#endif

    return 0;
}
Expand Down Expand Up @@ -2266,22 +2275,27 @@ CpuSet::CpuSet()

void CpuSet::enable(int cpu)
{
    // Map a global cpu index onto Windows processor groups: each group
    // holds up to 64 logical processors, so the group is cpu / 64 and the
    // bit within that group is cpu % 64.
    // (Removed the stale pre-change line `mask |= ((ULONG_PTR)1 << cpu);`
    // left over from the diff — it shifted by the ungrouped index.)
    // NOTE(review): a single cpu_group member means this set can represent
    // only one group at a time; bits set for a previous group stay in
    // `mask` after the group changes — confirm callers never mix groups.
    cpu_group = cpu / 64;
    mask |= ((ULONG_PTR)1 << (cpu - cpu_group * 64));
}

void CpuSet::disable(int cpu)
{
    // Clear the bit for `cpu` inside its processor group (group = cpu / 64,
    // bit = cpu % 64). Also selects that group, mirroring enable().
    // (Removed the stale pre-change line `mask &= ~((ULONG_PTR)1 << cpu);`
    // left over from the diff.)
    cpu_group = cpu / 64;
    mask &= ~((ULONG_PTR)1 << (cpu - cpu_group * 64));
}

// Reset to an empty set: group 0 selected, no bits set.
void CpuSet::disable_all()
{
    cpu_group = 0;
    mask = 0;
}

bool CpuSet::is_enabled(int cpu) const
{
    // A cpu can only be enabled if it belongs to the currently selected
    // processor group — the set tracks one group at a time.
    // (Removed the stale pre-change line `return mask & ((ULONG_PTR)1 << cpu);`
    // left over from the diff; it would have made the group check unreachable.)
    if (cpu_group != cpu / 64)
        return false;
    return mask & ((ULONG_PTR)1 << (cpu - cpu_group * 64));
}

int CpuSet::num_enabled() const
Expand Down Expand Up @@ -3266,4 +3280,38 @@ int set_flush_denormals(int flush_denormals)
#endif
}

// Return the number of physical processor cores to use for multithread
// batch inference; falls back to get_cpu_count() off-Windows or when the
// multithread feature is disabled. Returns 0 on query failure.
int get_multi_thread_batch()
{
// bug fix: was `#if defined(_NCNN_MUTITHREAD)` — that underscore-prefixed
// macro is never defined (platform.h uses `#cmakedefine01 NCNN_MUTITHREAD`),
// so the Win32 branch was dead code. Test the 0/1 macro value instead.
#if NCNN_MUTITHREAD && defined(_WIN32)
    DWORD length = 0;
    GetLogicalProcessorInformation(NULL, &length);
    if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
        return 0;

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
    if (!buffer)
        return 0;

    int count = 0;
    if (GetLogicalProcessorInformation(buffer, &length))
    {
        // bug fix: the previous version advanced `buffer` while iterating and
        // then called free() on the advanced pointer — heap corruption.
        // Index from the original pointer and free that.
        const DWORD n = length / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
        for (DWORD i = 0; i < n; i++)
        {
            if (buffer[i].Relationship == RelationProcessorCore)
                count++;
        }
    }
    free(buffer);
    return count;
#else
    return get_cpu_count();
#endif
}

} // namespace ncnn
5 changes: 5 additions & 0 deletions src/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#if defined _WIN32
#define WIN32_LEAN_AND_MEAN
#define _WIN32_WINNT 0x0601 // Windows 7+
#include <windows.h>
#endif
#if defined __ANDROID__ || defined __linux__
Expand All @@ -30,6 +31,7 @@ class NCNN_EXPORT CpuSet

public:
#if defined _WIN32
int cpu_group;
ULONG_PTR mask;
#endif
#if defined __ANDROID__ || defined __linux__
Expand Down Expand Up @@ -172,6 +174,9 @@ NCNN_EXPORT void set_kmp_blocktime(int time_ms);
NCNN_EXPORT int get_flush_denormals();
NCNN_EXPORT int set_flush_denormals(int flush_denormals);

// multi thread batch inference
NCNN_EXPORT int get_multi_thread_batch();

} // namespace ncnn

#endif // NCNN_CPU_H
5 changes: 5 additions & 0 deletions src/layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ int Layer::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) cons
return -1;
}

// Default multithread work function: not implemented by the base layer,
// so return -1 (same convention as the default forward_inplace above).
// bug fix: dropped the trailing `const` — layer.h declares
// `virtual int forward_thread(void* workspace);` (non-const), and
// overrides such as AbsVal::forward_thread are non-const too; the const
// definition would not match any declaration.
int Layer::forward_thread(void* /*workspace*/)
{
    return -1;
}

#if NCNN_VULKAN
int Layer::upload_model(VkTransfer& /*cmd*/, const Option& /*opt*/)
{
Expand Down
5 changes: 5 additions & 0 deletions src/layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ class NCNN_EXPORT Layer
// return 0 if success
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
/// @brief multithread work function
/// @param workspace thread information
/// @return 0 if success
virtual int forward_thread(void* workspace);

#if NCNN_VULKAN
public:
Expand Down Expand Up @@ -139,6 +143,7 @@ class NCNN_EXPORT Layer
// layer factory function
typedef Layer* (*layer_creator_func)(void*);
typedef void (*layer_destroyer_func)(Layer*, void*);
typedef int (*layer_work_func)(Layer*, void*);

struct layer_registry_entry
{
Expand Down
54 changes: 54 additions & 0 deletions src/layer/absval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// SPDX-License-Identifier: BSD-3-Clause

#include "absval.h"
#include "thread.h"

namespace ncnn {

Expand All @@ -17,6 +18,16 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
if (opt.num_threads > 64)
{
ThreadWorkspace workspace;
workspace.layer = (Layer*)this;
MutilThread thread(workspace, opt);
std::vector<Mat> workspace_blobs;
workspace_blobs.push_back(bottom_top_blob);
thread.join(workspace_blobs);
return 0;
}

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
Expand All @@ -33,4 +44,47 @@ int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Multithread worker: applies |x| in place to the element range
// [start_index, end_index) of the first blob in the thread workspace.
// The element width selects the data interpretation:
//   elemsize 1 -> int8, elemsize 2 -> 16-bit float storage (bf16/fp16),
//   otherwise  -> fp32.
int AbsVal::forward_thread(void* workspace)
{
    ThreadInfoExc* info = (ThreadInfoExc*)workspace;
    Mat& bottom_top_blob = info->mats->at(0);

    if (bottom_top_blob.elemsize == 1)
    {
        // int8 path. NOTE(review): -128 has no positive counterpart in
        // int8; negation leaves it at -128 (same as the original bit-test
        // version) — confirm that is acceptable for quantized data.
        int8_t* ptr = (int8_t*)bottom_top_blob.data;
        for (size_t i = info->start_index; i < info->end_index; i++)
        {
            if (ptr[i] < 0)
            {
                ptr[i] = -ptr[i];
            }
        }
    }
    else if (bottom_top_blob.elemsize == 2)
    {
        // bug fix: for sign/magnitude 16-bit float storage (bf16/fp16),
        // abs() means clearing the sign bit. The previous code negated the
        // raw bits as a two's-complement integer, which produces a wrong
        // value (e.g. bf16 -1.0 = 0xBF80 became 0x4080, not 1.0 = 0x3F80).
        uint16_t* ptr = (uint16_t*)bottom_top_blob.data;
        for (size_t i = info->start_index; i < info->end_index; i++)
        {
            ptr[i] = (uint16_t)(ptr[i] & 0x7fff);
        }
    }
    else
    {
        // fp32 path
        float* ptr = (float*)bottom_top_blob.data;
        for (size_t i = info->start_index; i < info->end_index; i++)
        {
            if (ptr[i] < 0)
            {
                ptr[i] = -ptr[i];
            }
        }
    }

    return 0;
}

} // namespace ncnn
1 change: 1 addition & 0 deletions src/layer/absval.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class AbsVal : public Layer
AbsVal();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
virtual int forward_thread(void* workspace);
};

} // namespace ncnn
Expand Down
2 changes: 2 additions & 0 deletions src/layer/batchnorm.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class BatchNorm : public Layer

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

virtual int forward_thread(void* workspace);

public:
// param
int channels;
Expand Down
1 change: 1 addition & 0 deletions src/platform.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
#cmakedefine01 NCNN_INT8
#cmakedefine01 NCNN_BF16
#cmakedefine01 NCNN_FORCE_INLINE
#cmakedefine01 NCNN_MUTITHREAD

#cmakedefine NCNN_VERSION_STRING "@NCNN_VERSION_STRING@"

Expand Down
Loading