From 9d4d5b3a88511823c5244c0ef4bd884a9c8c3d17 Mon Sep 17 00:00:00 2001
From: Lukas Sismis
Date: Fri, 3 Jan 2025 16:08:36 +0100
Subject: [PATCH] threading: support thread autopinning and interface-specific
 affinity

Using the new configuration format, it is now possible to set CPU
affinity settings per interface.
The threading.autopin option has been added to automatically use CPUs
from the same NUMA node as the interface.
The autopin option requires hwloc-devel / hwloc-dev to be installed
and the --enable-hwloc flag to be passed to the configure script.

Ticket: 7036
---
 configure.ac                                  |  28 +
 doc/userguide/configuration/suricata-yaml.rst |  82 +++
 doc/userguide/upgrade.rst                     |  14 +
 src/runmode-dpdk.c                            |  62 +-
 src/suricata.c                                |   4 +
 src/threadvars.h                              |   3 +
 src/tm-threads.c                              |  22 +-
 src/util-affinity.c                           | 630 ++++++++++++++++--
 src/util-affinity.h                           |  25 +-
 src/util-device.c                             |  15 +
 src/util-device.h                             |   1 +
 src/util-runmodes.c                           |  16 +
 suricata.yaml.in                              |  15 +
 13 files changed, 854 insertions(+), 63 deletions(-)

diff --git a/configure.ac b/configure.ac
index ca964d9039a0..3151bff17a92 100644
--- a/configure.ac
+++ b/configure.ac
@@ -741,6 +741,33 @@
             exit 1
         fi
 
+    LIBHWLOC=""
+    AC_ARG_ENABLE(hwloc,
+        AS_HELP_STRING([--enable-hwloc], [Enable hwloc support [default=no]]),
+        [enable_hwloc=$enableval],[enable_hwloc=no])
+    AS_IF([test "x$enable_hwloc" = "xyes"], [
+        PKG_CHECK_MODULES([HWLOC], [hwloc >= 2.0.0],
+            [AC_DEFINE([HAVE_HWLOC], [1], [Define if hwloc library is present and meets version requirements])],
+            LIBHWLOC="no")
+
+        if test "$LIBHWLOC" = "no"; then
+            echo
+            echo "   ERROR! hwloc library version >= 2.0.0 not found, go get it"
+            echo "   from https://www.open-mpi.org/projects/hwloc/ "
+            echo "   or your distribution:"
+            echo
+            echo "   Ubuntu: apt-get install hwloc libhwloc-dev"
+            echo "   Fedora: dnf install hwloc hwloc-devel"
+            echo "   CentOS/RHEL: yum install hwloc hwloc-devel"
+            echo
+            exit 1
+        else
+            CFLAGS="${CFLAGS} ${HWLOC_CFLAGS}"
+            LDFLAGS="${LDFLAGS} ${HWLOC_LIBS}"
+            enable_hwloc="yes"
+        fi
+    ])
+
 # libpthread
 AC_ARG_WITH(libpthread_includes,
 [  --with-libpthread-includes=DIR  libpthread include directory],
@@ -2561,6 +2588,7 @@ SURICATA_BUILD_CONF="Suricata Configuration:
                   JA4 support:             ${enable_ja4}
                   Non-bundled htp:         ${enable_non_bundled_htp}
                   Hyperscan support:       ${enable_hyperscan}
+                  Hwloc support:           ${enable_hwloc}
                   Libnet support:          ${enable_libnet}
                   liblz4 support:          ${enable_liblz4}
                   Landlock support:        ${enable_landlock}
diff --git a/doc/userguide/configuration/suricata-yaml.rst b/doc/userguide/configuration/suricata-yaml.rst
index 33b3c5528fed..17a68d14067e 100644
--- a/doc/userguide/configuration/suricata-yaml.rst
+++ b/doc/userguide/configuration/suricata-yaml.rst
@@ -917,6 +917,7 @@ per available CPU/CPU core.
 
   threading:
     set-cpu-affinity: yes
+    autopin: no
     cpu-affinity:
       management-cpu-set:
         cpu: [ 0 ]  # include only these cpus in affinity settings
@@ -933,6 +934,13 @@
           medium: [ "1-2" ]
           high: [ 3 ]
           default: "medium"
+      interface-specific-cpu-set:
+        - interface: "enp4s0f0" # 0000:3b:00.0 # net_bonding0 # ens1f0
+          cpu: [ 1,3,5,7,9 ]
+          mode: "exclusive"
+          prio:
+            high: [ "all" ]
+            default: "medium"
       verdict-cpu-set:
         cpu: [ 0 ]
         prio:
@@ -969,6 +977,80 @@
     worker-cpu-set - used for receive,streamtcp,decode,detect,output(logging),respond/reject, verdict
 
+Interface-specific CPU affinity settings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using the new configuration format introduced in Suricata 8.0, it is possible
+to set CPU affinity settings per interface. This can be useful
+when you have multiple interfaces and want to dedicate specific CPU cores to
+specific interfaces, for example when Suricata runs on multiple NUMA nodes
+and reads from an interface on each NUMA node.
+
+Interface-specific affinity settings can be configured for the worker-cpu-set
+and the receive-cpu-set (the latter is only used in autofp mode).
+This feature is available for capture modes that work with interfaces
+(af-packet, dpdk, etc.). The value of the interface key can be the kernel
+interface name (e.g. eth0 for af-packet), the PCI address of the interface
+(e.g. 0000:3b:00.0 for the DPDK capture mode), or the name of the virtual
+device interface (e.g. net_bonding0 for the DPDK capture mode).
+The interface names need to be unique and the interfaces must also be defined
+in the capture mode configuration.
+
+The interface-specific settings override the global settings of the
+worker-cpu-set and receive-cpu-set. The CPUs do not need to be contained in
+the parent node settings. If no interface-specific settings are defined for
+an interface, the global settings are used.
+
+::
+
+  threading:
+    set-cpu-affinity: yes
+    cpu-affinity:
+      worker-cpu-set:
+        interface-specific-cpu-set:
+          - interface: "eth0" # 0000:3b:00.0 # net_bonding0
+            cpu: [ 1,3,5,7,9 ]
+            mode: "exclusive"
+            prio:
+              high: [ "all" ]
+              default: "medium"
+
+Automatic NUMA-aware CPU core pinning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When Suricata runs on a system with multiple NUMA nodes, it is possible to
+automatically use CPUs from the same NUMA node as the network capture
+interface.
+CPU cores on the same NUMA node as the network capture interface have lower
+memory access latency, which increases the performance of Suricata.
+This is enabled by setting the `autopin` option to `yes` in the threading
+section. The option is available for the worker-cpu-set and receive-cpu-set.
+
+::
+
+  threading:
+    set-cpu-affinity: yes
+    autopin: yes
+    cpu-affinity:
+      worker-cpu-set:
+        cpu: [ "all" ]
+        mode: "exclusive"
+        prio:
+          high: [ "all" ]
+
+Consider two interfaces defined in the capture mode configuration, one on
+each NUMA node, with `autopin` enabled and the worker-cpu-set set to use all
+CPUs. Worker threads of the interface on the first NUMA node are pinned to
+CPUs of the first NUMA node, and worker threads of the interface on the
+second NUMA node are pinned to CPUs of the second NUMA node.
+If the CPU cores of a given NUMA node are exhausted, the remaining worker
+threads are pinned to CPUs of the other NUMA node.
+
+The `threading.autopin` option can be combined with the interface-specific
+CPU affinity settings.
+To use the `autopin` option, the `hwloc` dependency must be installed and
+`--enable-hwloc` must be passed to the configure script.
 
 IP Defrag
 ---------
diff --git a/doc/userguide/upgrade.rst b/doc/userguide/upgrade.rst
index fa08c8d14280..a738224d990a 100644
--- a/doc/userguide/upgrade.rst
+++ b/doc/userguide/upgrade.rst
@@ -99,6 +99,20 @@ Major changes
+         worker-cpu-set:
+           cpu: [0, 1]
+
+  - The `threading.cpu-affinity` configuration has been extended to support
+    interface-specific CPU affinity settings. This allows you to specify
+    CPU affinity settings for each interface separately.
+    The new configuration format is described in :ref:`suricata-yaml-threading`.
+    The old configuration format does not support this extension and will be
+    removed in Suricata 9.0.
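+    A minimal sketch of the new format (the interface name and CPU numbers
+    are illustrative)::
+
+      threading:
+        cpu-affinity:
+          worker-cpu-set:
+            cpu: [ "all" ]
+            interface-specific-cpu-set:
+              - interface: "eth0"
+                cpu: [ 2, 4 ]
+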
+  - The `threading.cpu-affinity` configuration now supports autopinning
+    worker or receive threads to the NUMA node where the network capture
+    interface is located.
+    This can be enabled by setting `threading.autopin` to `yes`.
+    See :ref:`suricata-yaml-threading` for more information.
+    This requires the hwloc dependency to be installed and `--enable-hwloc`
+    to be passed to the configure script.
+
 Removals
 ~~~~~~~~
 - The ssh keywords ``ssh.protoversion`` and ``ssh.softwareversion`` have been removed.
diff --git a/src/runmode-dpdk.c b/src/runmode-dpdk.c
index 6bbe3c1f2ed6..32d8a1ff255b 100644
--- a/src/runmode-dpdk.c
+++ b/src/runmode-dpdk.c
@@ -368,12 +368,17 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str)
         SCReturnInt(-EINVAL);
     }
 
-    ThreadsAffinityType *wtaf = GetAffinityTypeFromName("worker-cpu-set");
+    bool wtaf_periface = true;
+    ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", iconf->iface);
     if (wtaf == NULL) {
-        SCLogError("Specify worker-cpu-set list in the threading section");
-        SCReturnInt(-EINVAL);
+        wtaf_periface = false;
+        wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL); // mandatory
+        if (wtaf == NULL) {
+            SCLogError("Specify worker-cpu-set list in the threading section");
+            SCReturnInt(-EINVAL);
+        }
     }
-    ThreadsAffinityType *mtaf = GetAffinityTypeFromName("management-cpu-set");
+    ThreadsAffinityType *mtaf = GetAffinityTypeForNameAndIface("management-cpu-set", NULL);
     if (mtaf == NULL) {
         SCLogError("Specify management-cpu-set list in the threading section");
         SCReturnInt(-EINVAL);
@@ -406,7 +411,12 @@ static int ConfigSetThreads(DPDKIfaceConfig *iconf, const char *entry_str)
     }
 
     if (strcmp(entry_str, "auto") == 0) {
-        iconf->threads = (uint16_t)sched_cpus / LiveGetDeviceCount();
+        if (wtaf_periface) {
+            iconf->threads = (uint16_t)sched_cpus;
+            SCLogConfig("%s: auto-assigned %u threads", iconf->iface, iconf->threads);
+            SCReturnInt(0);
+        }
+        iconf->threads = (uint16_t)sched_cpus / LiveGetDeviceCountWithoutAssignedThreading();
         if (iconf->threads == 0) {
             SCLogError("Not enough worker CPU cores with affinity were configured");
             SCReturnInt(-ERANGE);
@@ -416,7 +426,8 @@
             iconf->threads++;
             remaining_auto_cpus--;
         } else if (remaining_auto_cpus == -1) {
-            remaining_auto_cpus = (int32_t)sched_cpus % LiveGetDeviceCount();
+            remaining_auto_cpus =
+                    (int32_t)sched_cpus % LiveGetDeviceCountWithoutAssignedThreading();
             if (remaining_auto_cpus > 0) {
                 iconf->threads++;
                 remaining_auto_cpus--;
@@ -844,23 +855,46 @@ static int ConfigLoad(DPDKIfaceConfig *iconf, const char *iface)
     SCReturnInt(0);
 }
 
-static int32_t ConfigValidateThreads(uint16_t iface_threads)
+static bool ConfigThreadsGenericIsValid(uint16_t iface_threads, ThreadsAffinityType *wtaf)
 {
     static uint32_t total_cpus = 0;
     total_cpus += iface_threads;
-    ThreadsAffinityType *wtaf = GetAffinityTypeFromName("worker-cpu-set");
     if (wtaf == NULL) {
         SCLogError("Specify worker-cpu-set list in the threading section");
-        return -1;
+        return false;
     }
     if (total_cpus > UtilAffinityGetAffinedCPUNum(wtaf)) {
-        SCLogError("Interfaces requested more cores than configured in the threading section "
-                   "(requested %d configured %d",
+        SCLogError("Interfaces requested more cores than configured in the worker-cpu-set "
+                   "threading section (requested %d configured %d)",
                    total_cpus, UtilAffinityGetAffinedCPUNum(wtaf));
-        return -1;
+        return false;
     }
-    return 0;
+    return true;
+}
+
+static bool ConfigThreadsInterfaceIsValid(uint16_t iface_threads, ThreadsAffinityType *itaf)
+{
+    if (iface_threads > UtilAffinityGetAffinedCPUNum(itaf)) {
+        SCLogError("Interface requested more cores than configured in the interface-specific "
+                   "threading section (requested %d configured %d)",
+                   iface_threads, UtilAffinityGetAffinedCPUNum(itaf));
+        return false;
+    }
+
+    return true;
+}
+
+static bool ConfigIsThreadingValid(uint16_t iface_threads, const char *iface)
+{
+    ThreadsAffinityType *itaf = GetAffinityTypeForNameAndIface("worker-cpu-set", iface);
+    ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL);
+    if (itaf && !ConfigThreadsInterfaceIsValid(iface_threads, itaf)) {
+        return false;
+    } else if (itaf == NULL && !ConfigThreadsGenericIsValid(iface_threads, wtaf)) {
+        return false;
+    }
+    return true;
 }
 
 static DPDKIfaceConfig *ConfigParse(const char *iface)
@@ -873,7 +907,7 @@
 
     ConfigInit(&iconf);
     retval = ConfigLoad(iconf, iface);
-    if (retval < 0 || ConfigValidateThreads(iconf->threads) != 0) {
+    if (retval < 0 || !ConfigIsThreadingValid(iconf->threads, iface)) {
         iconf->DerefFunc(iconf);
         SCReturnPtr(NULL, "void *");
     }
diff --git a/src/suricata.c b/src/suricata.c
index ee9dfc0b5b69..b0b1721ccab0 100644
--- a/src/suricata.c
+++ b/src/suricata.c
@@ -111,6 +111,7 @@
 #include "tmqh-packetpool.h"
 #include "tm-queuehandlers.h"
 
+#include "util-affinity.h"
 #include "util-byte.h"
 #include "util-conf.h"
 #include "util-coredump-config.h"
@@ -2298,6 +2299,9 @@ void PostRunDeinit(const int runmode, struct timeval *start_time)
     StreamTcpFreeConfig(STREAM_VERBOSE);
     DefragDestroy();
     HttpRangeContainersDestroy();
+#ifdef HAVE_HWLOC
+    TopologyDestroy();
+#endif /* HAVE_HWLOC */
 
     TmqResetQueues();
 #ifdef PROFILING
diff --git a/src/threadvars.h b/src/threadvars.h
index 6f339e9839d5..471714a254c4 100644
--- a/src/threadvars.h
+++ b/src/threadvars.h
@@ -136,6 +136,9 @@ typedef struct ThreadVars_ {
     struct FlowQueue_ *flow_queue;
     bool break_loop;
 
+    /** Interface-specific thread affinity */
+    char *iface_name;
+
     Storage storage[];
 } ThreadVars;
diff --git a/src/tm-threads.c b/src/tm-threads.c
index 07f9a9390df0..d5b504c16b7d 100644
--- a/src/tm-threads.c
+++ b/src/tm-threads.c
@@ -865,8 +865,24 @@ TmEcode TmThreadSetupOptions(ThreadVars *tv)
         TmThreadSetPrio(tv);
     if (tv->thread_setup_flags & THREAD_SET_AFFTYPE) {
         ThreadsAffinityType *taf = &thread_affinity[tv->cpu_affinity];
+        bool use_iface_affinity = RunmodeIsAutofp() && tv->cpu_affinity == RECEIVE_CPU_SET &&
+                                  FindAffinityByInterface(taf, tv->iface_name) != NULL;
+        use_iface_affinity |= RunmodeIsWorkers() && tv->cpu_affinity == WORKER_CPU_SET &&
+                              FindAffinityByInterface(taf, tv->iface_name) != NULL;
+
+        if (use_iface_affinity) {
+            taf = FindAffinityByInterface(taf, tv->iface_name);
+        }
+
+        if (UtilAffinityGetAffinedCPUNum(taf) == 0) {
+            if (!taf->nocpu_warned) {
+                SCLogWarning("No CPU affinity set for %s", AffinityGetYamlPath(taf));
+                taf->nocpu_warned = true;
+            }
+        }
+
         if (taf->mode_flag == EXCLUSIVE_AFFINITY) {
-            uint16_t cpu = AffinityGetNextCPU(taf);
+            uint16_t cpu = AffinityGetNextCPU(tv, taf);
             SetCPUAffinity(cpu);
             /* If CPU is in a set overwrite the default thread prio */
             if (CPU_ISSET(cpu, &taf->lowprio_cpu)) {
@@ -1600,6 +1616,10 @@ static void TmThreadFree(ThreadVars *tv)
         SCFree(tv->printable_name);
     }
 
+    if (tv->iface_name) {
+        SCFree(tv->iface_name);
+    }
+
     if (tv->stream_pq_local) {
         BUG_ON(tv->stream_pq_local->len);
         SCMutexDestroy(&tv->stream_pq_local->mutex_q);
diff --git a/src/util-affinity.c b/src/util-affinity.c
index ee365372702a..e78fd7e8af61 100644
--- a/src/util-affinity.c
+++ b/src/util-affinity.c
@@ -31,50 +31,169 @@
 #include "util-cpu.h"
 #include "util-byte.h"
 #include "util-debug.h"
+#include "util-dpdk.h"
 
 ThreadsAffinityType thread_affinity[MAX_CPU_SET] = {
     {
         .name = "receive-cpu-set",
         .mode_flag = EXCLUSIVE_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
     {
         .name = "worker-cpu-set",
         .mode_flag = EXCLUSIVE_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
     {
         .name = "verdict-cpu-set",
         .mode_flag = BALANCED_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
     {
         .name = "management-cpu-set",
         .mode_flag = BALANCED_AFFINITY,
         .prio = PRIO_MEDIUM,
-        .lcpu = 0,
+        .lcpu = { 0 },
     },
 };
 
 int thread_affinity_init_done = 0;
 
+#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+#ifdef HAVE_HWLOC
+static hwloc_topology_t topology = NULL;
+#endif /* HAVE_HWLOC */
+#endif /* OS_WIN32 and __OpenBSD__ */
+
+static ThreadsAffinityType *AllocAndInitAffinityType(
+        const char *name, const char *interface_name, ThreadsAffinityType *parent)
+{
+    ThreadsAffinityType *new_affinity = SCCalloc(1, sizeof(ThreadsAffinityType));
+    if (new_affinity == NULL) {
+        FatalError("Unable to allocate memory for new CPU affinity type");
+    }
+
+    new_affinity->name = SCStrdup(interface_name);
+    if (new_affinity->name == NULL) {
+        FatalError("Unable to allocate memory for new CPU affinity type name");
+    }
+    new_affinity->parent = parent;
+    new_affinity->mode_flag = EXCLUSIVE_AFFINITY;
+    new_affinity->prio = PRIO_MEDIUM;
+    for (int i = 0; i < MAX_NUMA_NODES; i++) {
+        new_affinity->lcpu[i] = 0;
+    }
+
+    if (parent != NULL) {
+        if (parent->nb_children == parent->nb_children_capacity) {
+            if (parent->nb_children_capacity == 0) {
+                parent->nb_children_capacity = 2;
+            } else {
+                parent->nb_children_capacity *= 2;
+            }
+            void *p = SCRealloc(parent->children,
+                    parent->nb_children_capacity * sizeof(ThreadsAffinityType *));
+            if (p == NULL) {
+                FatalError("Unable to reallocate memory for children CPU affinity types");
+            }
+            parent->children = p;
+        }
+        parent->children[parent->nb_children++] = new_affinity;
+    }
+
+    return new_affinity;
+}
+
+ThreadsAffinityType *FindAffinityByInterface(
+        ThreadsAffinityType *parent, const char *interface_name)
+{
+    for (uint32_t i = 0; i < parent->nb_children; i++) {
+        if (interface_name && strcmp(parent->children[i]->name, interface_name) == 0) {
+            return parent->children[i];
+        }
+    }
+    return NULL;
+}
+
+/**
+ * \brief Find affinity by name (*-cpu-set name) and an interface name.
+ * \param name the name of the affinity (e.g. worker-cpu-set, receive-cpu-set).
+ *    The name is required and cannot be NULL.
+ * \param interface_name the name of the interface.
+ *    If NULL, the affinity is looked up by name only.
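+ *
+ * A usage sketch (the interface name is illustrative):
+ *
+ *   // global settings of the worker-cpu-set
+ *   ThreadsAffinityType *wtaf = GetAffinityTypeForNameAndIface("worker-cpu-set", NULL);
+ *   // per-interface override for "eth0"; NULL when none was configured
+ *   ThreadsAffinityType *itaf = GetAffinityTypeForNameAndIface("worker-cpu-set", "eth0");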
+ * \retval a pointer to the affinity or NULL if not found
+ */
+ThreadsAffinityType *GetAffinityTypeForNameAndIface(const char *name, const char *interface_name)
+{
+    int i;
+    ThreadsAffinityType *parent_affinity = NULL;
+
+    for (i = 0; i < MAX_CPU_SET; i++) {
+        if (strcmp(thread_affinity[i].name, name) == 0) {
+            parent_affinity = &thread_affinity[i];
+            break;
+        }
+    }
+
+    if (parent_affinity == NULL) {
+        SCLogError("CPU affinity with name \"%s\" not found", name);
+        return NULL;
+    }
+
+    if (interface_name != NULL) {
+        ThreadsAffinityType *child_affinity =
+                FindAffinityByInterface(parent_affinity, interface_name);
+        // the child affinity is returned, or NULL if the interface is not found
+        return child_affinity;
+    }
+
+    return parent_affinity;
+}
+
 /**
- * \brief find affinity by its name
+ * \brief Finds affinity by its name and interface name.
+ *    Interfaces are children of cpu-set names. If the queried interface is not
+ *    found, it is allocated, initialized and assigned to the queried cpu-set.
+ * \param name the name of the affinity (e.g. worker-cpu-set, receive-cpu-set).
+ *    The name is required and cannot be NULL.
+ * \param interface_name the name of the interface.
+ *    If NULL, the affinity is looked up by name only.
  * \retval a pointer to the affinity or NULL if not found
  */
-ThreadsAffinityType * GetAffinityTypeFromName(const char *name)
+ThreadsAffinityType *GetOrAllocAffinityTypeForIfaceOfName(
+        const char *name, const char *interface_name)
 {
     int i;
+    ThreadsAffinityType *parent_affinity = NULL;
+
     for (i = 0; i < MAX_CPU_SET; i++) {
-        if (!strcmp(thread_affinity[i].name, name)) {
-            return &thread_affinity[i];
+        if (strcmp(thread_affinity[i].name, name) == 0) {
+            parent_affinity = &thread_affinity[i];
+            break;
         }
     }
-    return NULL;
+
+    if (parent_affinity == NULL) {
+        SCLogError("CPU affinity with name \"%s\" not found", name);
+        return NULL;
+    }
+
+    if (interface_name != NULL) {
+        ThreadsAffinityType *child_affinity =
+                FindAffinityByInterface(parent_affinity, interface_name);
+        if (child_affinity != NULL) {
+            return child_affinity;
+        }
+
+        // If not found, allocate and initialize a new child affinity
+        return AllocAndInitAffinityType(name, interface_name, parent_affinity);
+    }
+
+    return parent_affinity;
 }
 
 #if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
@@ -275,38 +394,114 @@ static void SetupAffinityThreads(ThreadsAffinityType *taf, ConfNode *affinity)
     }
 }
 
-static bool AllCPUsUsed(ThreadsAffinityType *taf)
+/**
+ * \brief Get the YAML path for the given affinity type.
+ *    The path is built using the parent name (if available) and the affinity name.
+ *    Do not free the returned string.
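+ *
+ * Examples of generated paths (the interface name is illustrative):
+ *   taf == NULL                  -> "threading.cpu-affinity"
+ *   worker-cpu-set               -> "threading.cpu-affinity.worker-cpu-set"
+ *   "eth0" under worker-cpu-set  ->
+ *     "threading.cpu-affinity.worker-cpu-set.interface-specific-cpu-set.eth0"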
+ * \param taf the affinity type - if NULL, the path is built for the root node
+ * \return a string containing the YAML path, or NULL if the path is too long
+ */
+char *AffinityGetYamlPath(ThreadsAffinityType *taf)
 {
-    if (taf->lcpu < UtilCpuGetNumProcessorsOnline()) {
-        return false;
+    static char rootpath[] = "threading.cpu-affinity";
+    static char path[1024] = { 0 };
+    char subpath[256] = { 0 };
+
+    if (taf == NULL) {
+        return rootpath;
+    }
+
+    if (taf->parent != NULL) {
+        long r = snprintf(
+                subpath, sizeof(subpath), "%s.interface-specific-cpu-set.", taf->parent->name);
+        if (r < 0 || r >= (long)sizeof(subpath)) {
+            FatalError("Unable to build YAML path for CPU affinity %s.%s", taf->parent->name,
+                    taf->name);
+        }
+    } else {
+        subpath[0] = '\0';
     }
-    return true;
+
+    long r = snprintf(path, sizeof(path), "%s.%s%s", rootpath, subpath, taf->name);
+    if (r < 0 || r >= (long)sizeof(path)) {
+        FatalError("Unable to build YAML path for CPU affinity %s", taf->name);
+    }
+
+    return path;
 }
 
 static void ResetCPUs(ThreadsAffinityType *taf)
 {
-    taf->lcpu = 0;
+    for (int i = 0; i < MAX_NUMA_NODES; i++) {
+        taf->lcpu[i] = 0;
+    }
 }
 
-static uint16_t GetNextAvailableCPU(ThreadsAffinityType *taf)
+/**
+ * \brief Check if the set name corresponds to a worker CPU set.
+ */
+static bool IsWorkerCpuSet(const char *setname)
 {
-    uint16_t cpu = taf->lcpu;
-    int attempts = 0;
+    return (strcmp(setname, "worker-cpu-set") == 0);
+}
 
-    while (!CPU_ISSET(cpu, &taf->cpu_set) && attempts < 2) {
-        cpu = (cpu + 1) % UtilCpuGetNumProcessorsOnline();
-        if (cpu == 0)
-            attempts++;
+/**
+ * \brief Check if the set name corresponds to a receive CPU set.
+ */
+static bool IsReceiveCpuSet(const char *setname)
+{
+    return (strcmp(setname, "receive-cpu-set") == 0);
+}
+
+/**
+ * \brief Set up affinity configuration for a single interface.
+ */
+static void SetupSingleIfaceAffinity(ThreadsAffinityType *taf, ConfNode *iface_node)
+{
+    ConfNode *child_node;
+    const char *interface_name = NULL;
+    TAILQ_FOREACH (child_node, &iface_node->head, next) {
+        if (strcmp(child_node->name, "interface") == 0) {
+            interface_name = child_node->val;
+            break;
+        }
+    }
+    if (interface_name == NULL) {
+        return;
     }
-    taf->lcpu = cpu + 1;
 
-    if (attempts == 2) {
-        SCLogError(
-                "cpu_set does not contain available CPUs, CPU affinity configuration is invalid");
+    ThreadsAffinityType *iface_taf =
+            GetOrAllocAffinityTypeForIfaceOfName(taf->name, interface_name);
+    if (iface_taf == NULL) {
+        FatalError("Unknown CPU affinity type for interface: %s", interface_name);
+    }
+
+    SetupCpuSets(iface_taf, iface_node, interface_name);
+    SetupAffinityPriority(iface_taf, iface_node, interface_name);
+    SetupAffinityMode(iface_taf, iface_node);
+    SetupAffinityThreads(iface_taf, iface_node);
+}
+
+/**
+ * \brief Set up per-interface affinity configurations.
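+ *
+ * Expects a YAML list of the following shape under the parent cpu-set
+ * (the values are illustrative):
+ *
+ *   interface-specific-cpu-set:
+ *     - interface: "eth0"
+ *       cpu: [ 2, 4, 6 ]
+ *       mode: "exclusive"
+ *       prio:
+ *         high: [ "all" ]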
+ */
+static void SetupPerIfaceAffinity(ThreadsAffinityType *taf, ConfNode *affinity)
+{
+    char if_af[] = "interface-specific-cpu-set";
+    ConfNode *per_iface_node = ConfNodeLookupChild(affinity, if_af);
+    if (per_iface_node == NULL) {
+        return;
     }
-    return cpu;
+
+    ConfNode *iface_node;
+    TAILQ_FOREACH (iface_node, &per_iface_node->head, next) {
+        if (strcmp(iface_node->val, "interface") == 0) {
+            SetupSingleIfaceAffinity(taf, iface_node);
+        } else {
+            SCLogWarning("Unknown node in %s: %s", if_af, iface_node->name);
+        }
+    }
 }
 
 /**
@@ -323,9 +518,8 @@ static bool AffinityConfigIsDeprecated(void)
         return threading_affinity_deprecated;
     }
 
-    ConfNode *root = ConfGetNode("threading.cpu-affinity");
+    ConfNode *root = ConfGetNode(AffinityGetYamlPath(NULL));
     if (root == NULL) {
-        threading_affinity_deprecated = false;
         initialized = true;
         return threading_affinity_deprecated;
     }
@@ -357,18 +551,17 @@ void AffinitySetupLoadFromConfig(void)
     AffinitySetupInit();
     thread_affinity_init_done = 1;
     if (AffinityConfigIsDeprecated()) {
-        SCLogWarning("CPU affinity configuration uses a deprecated structure and will become "
-                     "obsolete in a future major release (Suricata 9.0). Please update your "
-                     "threading.cpu-affinity to the new format. "
-                     "See notes in %s/upgrade.html#upgrading-7-0-to-8-0",
-                GetDocURL());
+        SCLogWarning("CPU affinity configuration uses a deprecated structure and will not be "
+                     "supported in a future major release (Suricata 9.0). Please update your "
+                     "%s to the new format. See notes in %s/upgrade.html#upgrading-7-0-to-8-0",
+                AffinityGetYamlPath(NULL), GetDocURL());
     }
 
-    SCLogDebug("Loading threading.cpu-affinity from config");
-    ConfNode *root = ConfGetNode("threading.cpu-affinity");
+    SCLogDebug("Loading %s from config", AffinityGetYamlPath(NULL));
+    ConfNode *root = ConfGetNode(AffinityGetYamlPath(NULL));
     if (root == NULL) {
-        SCLogInfo("Cannot find threading.cpu-affinity node in config");
+        SCLogInfo("Cannot find %s node in config", AffinityGetYamlPath(NULL));
         return;
     }
 
@@ -380,7 +573,7 @@
             continue;
         }
 
-        ThreadsAffinityType *taf = GetAffinityTypeFromName(setname);
+        ThreadsAffinityType *taf = GetOrAllocAffinityTypeForIfaceOfName(setname, NULL);
         if (taf == NULL) {
             FatalError("Unknown CPU affinity type: %s", setname);
         }
@@ -393,25 +586,372 @@
         SetupAffinityPriority(taf, aff_query_node, setname);
         SetupAffinityMode(taf, aff_query_node);
         SetupAffinityThreads(taf, aff_query_node);
+
+        if (!AffinityConfigIsDeprecated() &&
+                (IsWorkerCpuSet(setname) || IsReceiveCpuSet(setname))) {
+            SetupPerIfaceAffinity(taf, affinity);
+        }
     }
 #endif /* OS_WIN32 and __OpenBSD__ */
 }
 
+#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+#ifdef HAVE_HWLOC
+static int HwLocDeviceNumaGet(hwloc_topology_t topo, hwloc_obj_t obj)
+{
+#if HWLOC_VERSION_MAJOR >= 2 && HWLOC_VERSION_MINOR >= 5
+    hwloc_obj_t nodes[MAX_NUMA_NODES];
+    unsigned num_nodes = MAX_NUMA_NODES;
+    struct hwloc_location location;
+
+    location.type = HWLOC_LOCATION_TYPE_OBJECT;
+    location.location.object = obj;
+
+    int result = hwloc_get_local_numanode_objs(topo, &location, &num_nodes, nodes, 0);
+    if (result == 0 && num_nodes > 0 && num_nodes <= MAX_NUMA_NODES) {
+        return nodes[0]->logical_index;
+    }
+    return -1;
+#endif /* HWLOC_VERSION_MAJOR >= 2 && HWLOC_VERSION_MINOR >= 5 */
+
+    hwloc_obj_t non_io_ancestor = hwloc_get_non_io_ancestor_obj(topo, obj);
+    if (non_io_ancestor == NULL) {
+        return -1;
+    }
+
+    // Iterate over NUMA nodes and check their nodeset
+    hwloc_obj_t numa_node = NULL;
+    while ((numa_node = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_NUMANODE, numa_node)) != NULL) {
+        if (hwloc_bitmap_isset(non_io_ancestor->nodeset, numa_node->os_index)) {
+            return numa_node->logical_index;
+        }
+    }
+
+    return -1;
+}
+
+static hwloc_obj_t HwLocDeviceGetByKernelName(hwloc_topology_t topo, const char *interface_name)
+{
+    hwloc_obj_t obj = NULL;
+
+    while ((obj = hwloc_get_next_osdev(topo, obj)) != NULL) {
+        if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK &&
+                strcmp(obj->name, interface_name) == 0) {
+            hwloc_obj_t parent = obj->parent;
+            while (parent) {
+                if (parent->type == HWLOC_OBJ_PCI_DEVICE) {
+                    return parent;
+                }
+                parent = parent->parent;
+            }
+        }
+    }
+    return NULL;
+}
+
 /**
- * \brief Return next cpu to use for a given thread family
- * \retval the cpu to used given by its id
+ * \brief Parse a PCIe address string into its individual components
+ * \param[in] pcie_address PCIe address string
+ * \param[out] domain Domain component
+ * \param[out] bus Bus component
+ * \param[out] device Device component
+ * \param[out] function Function component
+ * \retval 0 on success, -1 on failure
  */
-uint16_t AffinityGetNextCPU(ThreadsAffinityType *taf)
+static int PcieAddressToComponents(const char *pcie_address, unsigned int *domain,
+        unsigned int *bus, unsigned int *device, unsigned int *function)
 {
-    uint16_t ncpu = 0;
-#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+    // Handle both full and short PCIe address formats
+    if (sscanf(pcie_address, "%x:%x:%x.%x", domain, bus, device, function) != 4) {
+        if (sscanf(pcie_address, "%x:%x.%x", bus, device, function) != 3) {
+            return -1;
+        }
+        *domain = 0; // Default domain to 0 if not provided
+    }
+    return 0;
+}
+
+// Function to convert a PCIe address to a hwloc object
+static hwloc_obj_t HwLocDeviceGetByPcie(hwloc_topology_t topo, const char *pcie_address)
+{
+    hwloc_obj_t obj = NULL;
+    unsigned int domain, bus, device, function;
+    int r = PcieAddressToComponents(pcie_address, &domain, &bus, &device, &function);
+    if (r == 0) {
+        while ((obj = hwloc_get_next_pcidev(topo, obj)) != NULL) {
+            if (obj->attr->pcidev.domain == domain && obj->attr->pcidev.bus == bus &&
+                    obj->attr->pcidev.dev == device && obj->attr->pcidev.func == function) {
+                return obj;
+            }
+        }
+    }
+    return NULL;
+}
+
+static void HwlocObjectDump(hwloc_obj_t obj, const char *iface_name)
+{
+    if (!obj) {
+        SCLogDebug("No object found for the given PCIe address");
+        return;
+    }
+
+    static char pcie_address[32];
+    snprintf(pcie_address, sizeof(pcie_address), "%04x:%02x:%02x.%x", obj->attr->pcidev.domain,
+            obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func);
+    SCLogDebug("Interface (%s / %s) has NUMA ID %d", iface_name, pcie_address,
+            HwLocDeviceNumaGet(topology, obj));
+
+    SCLogDebug("Object type: %s", hwloc_obj_type_string(obj->type));
+    SCLogDebug("Logical index: %u", obj->logical_index);
+    SCLogDebug("Depth: %u", obj->depth);
+    SCLogDebug("Attributes:");
+    if (obj->type == HWLOC_OBJ_PCI_DEVICE) {
+        SCLogDebug("  Domain: %04x", obj->attr->pcidev.domain);
+        SCLogDebug("  Bus: %02x", obj->attr->pcidev.bus);
+        SCLogDebug("  Device: %02x", obj->attr->pcidev.dev);
+        SCLogDebug("  Function: %01x", obj->attr->pcidev.func);
+        SCLogDebug("  Class ID: %04x", obj->attr->pcidev.class_id);
+        SCLogDebug("  Vendor ID: %04x", obj->attr->pcidev.vendor_id);
+        SCLogDebug("  Device ID: %04x", obj->attr->pcidev.device_id);
+        SCLogDebug("  Subvendor ID: %04x", obj->attr->pcidev.subvendor_id);
+        SCLogDebug("  Subdevice ID: %04x", obj->attr->pcidev.subdevice_id);
+        SCLogDebug("  Revision: %02x", obj->attr->pcidev.revision);
+        SCLogDebug("  Link speed: %f GB/s", obj->attr->pcidev.linkspeed);
+    } else {
+        SCLogDebug("  No PCI device attributes available");
+    }
+}
+
+static bool TopologyShouldAutopin(ThreadVars *tv, ThreadsAffinityType *taf)
+{
+    bool cond;
+    SCMutexLock(&taf->taf_mutex);
+    cond = tv->type == TVT_PPT && tv->iface_name &&
+           (strcmp(tv->iface_name, taf->name) == 0 ||
+                   (strcmp("worker-cpu-set", taf->name) == 0 && RunmodeIsWorkers()) ||
+                   (strcmp("receive-cpu-set", taf->name) == 0 && RunmodeIsAutofp()));
+    SCMutexUnlock(&taf->taf_mutex);
+    return cond;
+}
+
+static void TopologyInitialize(void)
+{
+    if (topology == NULL) {
+        if (hwloc_topology_init(&topology) == -1) {
+            FatalError("Failed to initialize topology");
+        }
+
+        if (hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) == -1 ||
+                hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL) == -1 ||
+                hwloc_topology_load(topology) == -1) {
+            FatalError("Failed to set/load topology");
+        }
+    }
+}
+
+void TopologyDestroy(void)
+{
+    if (topology != NULL) {
+        hwloc_topology_destroy(topology);
+        topology = NULL;
+    }
+}
+
+static int InterfaceGetNumaNode(ThreadVars *tv)
+{
+    hwloc_obj_t if_obj = HwLocDeviceGetByKernelName(topology, tv->iface_name);
+    if (if_obj == NULL) {
+        if_obj = HwLocDeviceGetByPcie(topology, tv->iface_name);
+    }
+
+    if (if_obj != NULL && SCLogGetLogLevel() == SC_LOG_DEBUG) {
+        HwlocObjectDump(if_obj, tv->iface_name);
+    }
+
+    // guard against a NULL object when the interface was not found
+    int32_t numa_id = if_obj != NULL ? HwLocDeviceNumaGet(topology, if_obj) : -1;
+    if (numa_id < 0 && SCRunmodeGet() == RUNMODE_DPDK) {
+        // DPDK fallback for e.g. net_bonding (vdev) PMDs
+        int32_t r = DPDKDeviceNameSetSocketID(tv->iface_name, &numa_id);
+        if (r < 0) {
+            numa_id = -1;
+        }
+    }
+
+    if (numa_id < 0) {
+        SCLogDebug("Unable to find NUMA node for interface %s", tv->iface_name);
+    }
+
+    return numa_id;
+}
+#endif /* HAVE_HWLOC */
+
+static bool CPUIsFromNuma(uint16_t ncpu, uint16_t numa)
+{
+#ifdef HAVE_HWLOC
+    int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+    hwloc_obj_t numa_node = NULL;
+
+    while ((numa_node = hwloc_get_next_obj_by_depth(topology, depth, numa_node)) != NULL) {
+        if (hwloc_bitmap_isset(numa_node->cpuset, ncpu)) {
+            SCLogDebug("Core %d - NUMA %d", ncpu, numa_node->logical_index);
+            // return inside the loop so numa_node is never dereferenced
+            // when the core was not found on any NUMA node
+            return numa == numa_node->logical_index;
+        }
+    }
+#endif /* HAVE_HWLOC */
+
+    return false;
+}
+
+static int16_t FindCPUInNumaNode(int numa_node, ThreadsAffinityType *taf)
+{
+    if (numa_node < 0) {
+        return -1;
+    }
+
+    if (taf->lcpu[numa_node] >= UtilCpuGetNumProcessorsOnline()) {
+        return -1;
+    }
+
+    uint16_t cpu = taf->lcpu[numa_node];
+    while (cpu < UtilCpuGetNumProcessorsOnline() &&
+            (!CPU_ISSET(cpu, &taf->cpu_set) || !CPUIsFromNuma(cpu, (uint16_t)numa_node))) {
+        cpu++;
+    }
+
+    taf->lcpu[numa_node] =
+            (CPU_ISSET(cpu, &taf->cpu_set) && CPUIsFromNuma(cpu, (uint16_t)numa_node))
+                    ? cpu + 1
+                    : UtilCpuGetNumProcessorsOnline();
+    return (CPU_ISSET(cpu, &taf->cpu_set) && CPUIsFromNuma(cpu, (uint16_t)numa_node))
+                   ? (int16_t)cpu
+                   : -1;
+}
+
+static int16_t CPUSelectFromNuma(int iface_numa, ThreadsAffinityType *taf)
+{
+    if (iface_numa != -1) {
+        return FindCPUInNumaNode(iface_numa, taf);
+    }
+    return -1;
+}
+
+static int16_t CPUSelectAlternative(int iface_numa, ThreadsAffinityType *taf)
+{
+    for (int nid = 0; nid < MAX_NUMA_NODES; nid++) {
+        if (iface_numa == nid) {
+            continue;
+        }
+
+        int16_t cpu = FindCPUInNumaNode(nid, taf);
+        if (cpu != -1) {
+            SCLogPerf("CPU %d from NUMA %d assigned to a network interface located on NUMA %d",
+                    cpu, nid, iface_numa);
+            return cpu;
+        }
+    }
+    return -1;
+}
+
+/**
+ * \brief Select the next available CPU for the given affinity type.
+ *    taf->cpu_set is a bit array where each bit represents a CPU core.
+ *    The function iterates over the bit array and returns the first available CPU.
+ *    If the last used CPU core index is higher than the indexes of available cores,
+ *    we reach the end of the array and the CPU selection is reset.
+ *    On the second reset attempt, the function bails out with a default value.
+ *    The second attempt should only happen with an empty CPU set.
+ */
+static uint16_t CPUSelectDefault(ThreadsAffinityType *taf)
+{
+    uint16_t cpu = taf->lcpu[0];
+    int attempts = 0;
+    while (!CPU_ISSET(cpu, &taf->cpu_set) && attempts < 2) {
+        cpu = (cpu + 1) % UtilCpuGetNumProcessorsOnline();
+        if (cpu == 0) {
+            attempts++;
+        }
+    }
 
-    if (AllCPUsUsed(taf)) {
-        ResetCPUs(taf);
+    taf->lcpu[0] = cpu + 1;
+    return cpu;
+}
+
+static uint16_t CPUSelectFromNumaOrDefault(int iface_numa, ThreadsAffinityType *taf)
+{
+    uint16_t attempts = 0;
+    int16_t cpu = -1;
+    while (attempts < 2) {
+        cpu = CPUSelectFromNuma(iface_numa, taf);
+        if (cpu == -1) {
+            cpu = CPUSelectAlternative(iface_numa, taf);
+            if (cpu == -1) {
+                // All CPUs from all NUMA nodes are used at this point
+                ResetCPUs(taf);
+                attempts++;
+            }
+        }
+
+        if (cpu >= 0) {
+            return (uint16_t)cpu;
+        }
+    }
+    return CPUSelectDefault(taf);
+}
+
+static uint16_t GetNextAvailableCPU(int iface_numa, ThreadsAffinityType *taf)
+{
+    if (iface_numa < 0) {
+        return CPUSelectDefault(taf);
     }
+
+    return CPUSelectFromNumaOrDefault(iface_numa, taf);
+}
+
+static bool AutopinEnabled(void)
+{
+    int autopin = 0;
+    if (ConfGetBool("threading.autopin", &autopin) != 1) {
+        return false;
+    }
+    return (bool)autopin;
+}
+
+#endif /* OS_WIN32 and __OpenBSD__ */
+
+uint16_t AffinityGetNextCPU(ThreadVars *tv, ThreadsAffinityType *taf)
+{
+    uint16_t ncpu = 0;
+#if !defined __CYGWIN__ && !defined OS_WIN32 && !defined __OpenBSD__ && !defined sun
+    int iface_numa = -1;
+    if (AutopinEnabled()) {
+#ifdef HAVE_HWLOC
+        if (TopologyShouldAutopin(tv, taf)) {
+            TopologyInitialize();
+            iface_numa = InterfaceGetNumaNode(tv);
+        }
+#else
+        static bool printed = false;
+        if (!printed) {
+            printed = true;
+            SCLogWarning(
+                    "threading.autopin option is enabled but hwloc support is not compiled in. "
" + "Make sure to pass --enable-nfqueue to configure when building Suricata."); + } +#endif /* HAVE_HWLOC */ + } + + SCMutexLock(&taf->taf_mutex); + ncpu = GetNextAvailableCPU(iface_numa, taf); SCLogDebug("Setting affinity on CPU %d", ncpu); SCMutexUnlock(&taf->taf_mutex); #endif /* OS_WIN32 and __OpenBSD__ */ diff --git a/src/util-affinity.h b/src/util-affinity.h index 2fa4509ffa2c..ceca26dfeff9 100644 --- a/src/util-affinity.h +++ b/src/util-affinity.h @@ -26,6 +26,11 @@ #include "suricata-common.h" #include "conf.h" #include "threads.h" +#include "threadvars.h" + +#ifdef HAVE_HWLOC +#include +#endif /* HAVE_HWLOC */ #if defined OS_FREEBSD #include @@ -62,10 +67,12 @@ enum { MAX_AFFINITY }; +#define MAX_NUMA_NODES 16 + typedef struct ThreadsAffinityType_ { const char *name; uint8_t mode_flag; - uint16_t lcpu; /* use by exclusive mode */ + uint16_t lcpu[MAX_NUMA_NODES]; /* use by exclusive mode */ int prio; uint32_t nb_threads; SCMutex taf_mutex; @@ -76,6 +83,12 @@ typedef struct ThreadsAffinityType_ { cpu_set_t medprio_cpu; cpu_set_t hiprio_cpu; #endif + struct ThreadsAffinityType_ **children; + uint32_t nb_children; + uint32_t nb_children_capacity; + struct ThreadsAffinityType_ *parent; + // a flag to avoid multiple warnings when no CPU is set + bool nocpu_warned; } ThreadsAffinityType; /** store thread affinity mode for all type of threads */ @@ -83,10 +96,16 @@ typedef struct ThreadsAffinityType_ { extern ThreadsAffinityType thread_affinity[MAX_CPU_SET]; #endif +char *AffinityGetYamlPath(ThreadsAffinityType *taf); void AffinitySetupLoadFromConfig(void); -ThreadsAffinityType * GetAffinityTypeFromName(const char *name); +ThreadsAffinityType *GetOrAllocAffinityTypeForIfaceOfName( + const char *name, const char *interface_name); +ThreadsAffinityType *GetAffinityTypeForNameAndIface(const char *name, const char *interface_name); +ThreadsAffinityType *FindAffinityByInterface( + ThreadsAffinityType *parent, const char *interface_name); -uint16_t AffinityGetNextCPU(ThreadsAffinityType *taf); +void TopologyDestroy(void); +uint16_t AffinityGetNextCPU(ThreadVars *tv, ThreadsAffinityType *taf); uint16_t UtilAffinityGetAffinedCPUNum(ThreadsAffinityType *taf); #ifdef HAVE_DPDK uint16_t UtilAffinityCpusOverlap(ThreadsAffinityType *taf1, ThreadsAffinityType *taf2); diff --git a/src/util-device.c b/src/util-device.c index fd4cf5685f0b..ec1e91b41374 100644 --- a/src/util-device.c +++ b/src/util-device.c @@ -24,6 +24,7 @@ #include "device-storage.h" #include "util-debug.h" +#include "util-affinity.h" #define MAX_DEVNAME 10 @@ -173,6 +174,20 @@ int LiveGetDeviceCount(void) return i; } +int LiveGetDeviceCountWithoutAssignedThreading(void) +{ + int i = 0; + LiveDevice *pd; + + TAILQ_FOREACH (pd, &live_devices, next) { + if (GetAffinityTypeForNameAndIface("worker-cpu-set", pd->dev) == NULL) { + i++; + } + } + + return i; +} + /** * \brief Get a pointer to the device name at idx * diff --git a/src/util-device.h b/src/util-device.h index 0774825385a3..075c21567c81 100644 --- a/src/util-device.h +++ b/src/util-device.h @@ -85,6 +85,7 @@ void LiveDevAddBypassStats(LiveDevice *dev, uint64_t cnt, int family); void LiveDevSubBypassStats(LiveDevice *dev, uint64_t cnt, int family); void LiveDevAddBypassFail(LiveDevice *dev, uint64_t cnt, int family); void LiveDevAddBypassSuccess(LiveDevice *dev, uint64_t cnt, int family); +int LiveGetDeviceCountWithoutAssignedThreading(void); int LiveGetDeviceCount(void); const char *LiveGetDeviceName(int number); LiveDevice *LiveGetDevice(const char *dev); diff --git 
index f78e857abfc6..be4da6bd49ee 100644
--- a/src/util-runmodes.c
+++ b/src/util-runmodes.c
@@ -175,6 +175,14 @@ int RunModeSetLiveCaptureAutoFp(ConfigIfaceParserFunc ConfigParser,
             FatalError("TmThreadsCreate failed");
         }
         tv_receive->printable_name = printable_threadname;
+        if (dev) {
+            tv_receive->iface_name = SCStrdup(dev);
+            if (tv_receive->iface_name == NULL) {
+                FatalError("Failed to allocate memory for iface name");
+            }
+        } else {
+            tv_receive->iface_name = NULL;
+        }
 
         TmModule *tm_module = TmModuleGetByName(recv_mod_name);
         if (tm_module == NULL) {
             FatalError("TmModuleGetByName failed for %s", recv_mod_name);
@@ -283,6 +291,14 @@ static int RunModeSetLiveCaptureWorkersForDevice(ConfigIfaceThreadsCountFunc Mod
             FatalError("TmThreadsCreate failed");
         }
         tv->printable_name = printable_threadname;
+        if (live_dev) {
+            tv->iface_name = SCStrdup(live_dev);
+            if (tv->iface_name == NULL) {
+                FatalError("Failed to allocate memory for iface name");
+            }
+        } else {
+            tv->iface_name = NULL;
+        }
 
         tm_module = TmModuleGetByName(recv_mod_name);
         if (tm_module == NULL) {
diff --git a/suricata.yaml.in b/suricata.yaml.in
index 954acde2d3ae..c4319f02b45b 100644
--- a/suricata.yaml.in
+++ b/suricata.yaml.in
@@ -1777,6 +1777,7 @@ spm-algo: auto
 
 # Suricata is multi-threaded. Here the threading can be influenced.
 threading:
   set-cpu-affinity: no
+  autopin: no
   # Tune cpu affinity of threads. Each family of threads can be bound
   # to specific CPUs.
   #
@@ -1793,6 +1794,13 @@
       cpu: [ 0 ]  # include only these CPUs in affinity settings
     receive-cpu-set:
       cpu: [ 0 ]  # include only these CPUs in affinity settings
+      # interface-specific-cpu-set:
+      #   - interface: "enp4s0f0"
+      #     cpu: [ 1,3,5,7,9 ]
+      #     mode: "exclusive"
+      #     prio:
+      #       high: [ "all" ]
+      #       default: "medium"
     worker-cpu-set:
       cpu: [ "all" ]
       mode: "exclusive"
@@ -1804,6 +1812,13 @@
         medium: [ "1-2" ]
         high: [ 3 ]
         default: "medium"
+      interface-specific-cpu-set:
+        - interface: "enp4s0f0" # 0000:3b:00.0 # net_bonding0 # ens1f0
+          cpu: [ 1,3,5,7,9 ]
+          mode: "exclusive"
+          prio:
+            high: [ "all" ]
+            default: "medium"
     #verdict-cpu-set:
     #  cpu: [ 0 ]
     #  prio: