diff --git a/RtxOptions.md b/RtxOptions.md index 6e987b7f..931210c3 100644 --- a/RtxOptions.md +++ b/RtxOptions.md @@ -399,6 +399,7 @@ Tables below enumerate all the options and their defaults set by RTX Remix. Note |rtx.numFramesToKeepInstances|int|1|| |rtx.numFramesToKeepLights|int|100|| |rtx.numFramesToKeepMaterialTextures|int|5|| +|rtx.numGeometryProcessingThreads|int|2|The desired number of CPU threads to dedicate to geometry processing\. Will be limited by the number of CPU cores\. There may be some advantage to lowering this number in games which are fairly simple and use a low number of draw calls per frame\. The default was determined by looking at a game with around 2000 draw calls per frame, and with a reasonably high average triangle count per draw\.| |rtx.opacityMicromap.buildRequests.customFiltersForBillboards|bool|True|Applies custom filters for staged Billboard requests\.| |rtx.opacityMicromap.buildRequests.enableAnimatedInstances|bool|False|Enables Opacity Micromaps for animated instances\.| |rtx.opacityMicromap.buildRequests.enableParticles|bool|True|Enables Opacity Micromaps for particles\.| diff --git a/src/d3d9/d3d9_rtx.cpp b/src/d3d9/d3d9_rtx.cpp index 01d045e1..7752be08 100644 --- a/src/d3d9/d3d9_rtx.cpp +++ b/src/d3d9/d3d9_rtx.cpp @@ -32,7 +32,7 @@ namespace dxvk { VK_ACCESS_TRANSFER_READ_BIT) , m_parent(d3d9Device) , m_enableDrawCallConversion(enableDrawCallConversion) - , m_pGeometryWorkers(enableDrawCallConversion ? std::make_unique(popcnt_uint8(D3D9Rtx::kAllThreads), "geometry-processing") : nullptr) { + , m_pGeometryWorkers(enableDrawCallConversion ? std::make_unique(numGeometryProcessingThreads(), "geometry-processing") : nullptr) { // Add space for 256 objects skinned with 256 bones each.
m_stagedBones.resize(256 * 256); diff --git a/src/d3d9/d3d9_rtx.h b/src/d3d9/d3d9_rtx.h index 40890db1..c175878b 100644 --- a/src/d3d9/d3d9_rtx.h +++ b/src/d3d9/d3d9_rtx.h @@ -41,6 +41,7 @@ namespace dxvk { RTX_OPTION("rtx", bool, useVertexCapturedNormals, true, "When enabled, vertex normals are read from the input assembler and used in raytracing. This doesn't always work as normals can be in any coordinate space, but can help sometimes."); RTX_OPTION("rtx", bool, useWorldMatricesForShaders, true, "When enabled, Remix will utilize the world matrices being passed from the game via D3D9 fixed function API, even when running with shaders. Sometimes games pass these matrices and they are useful, however for some games they are very unreliable, and should be filtered out. If you're seeing precision related issues with shader vertex capture, try disabling this setting."); RTX_OPTION("rtx", bool, enableIndexBufferMemoization, true, "CPU performance optimization, should generally be enabled. Will reduce main thread time by caching processIndexBuffer operations and reusing when possible, this will come at the expense of some CPU RAM."); + RTX_OPTION("rtx", uint32_t, numGeometryProcessingThreads, 2, "The desired number of CPU threads to dedicate to geometry processing. Will be limited by the number of CPU cores. There may be some advantage to lowering this number in games which are fairly simple and use a low number of draw calls per frame. The default was determined by looking at a game with around 2000 draw calls per frame, and with a reasonably high average triangle count per draw."); // Copy of the parameters issued to D3D9 on DrawXXX struct DrawContext { @@ -171,19 +172,6 @@ namespace dxvk { } private: - // Give threads specific tasks, to reduce the chance of - // critical work being pre-empted.
- enum WorkerTasks : uint8_t { - kSkinningThread = 1 << 0, - - kHashingThread0 = 1 << 1, - kHashingThread1 = 1 << 2, - kHashingThread2 = 1 << 3, - - kHashingThreads = (kHashingThread0 | kHashingThread1 | kHashingThread2), - kAllThreads = (kHashingThreads | kSkinningThread) - }; - inline static const uint32_t kMaxConcurrentDraws = 6 * 1024; // some games issuing >3000 draw calls per frame... account for some consumer thread lag with x2 using GeometryProcessor = WorkerThreadPool; const std::unique_ptr m_pGeometryWorkers; diff --git a/src/util/util_threadpool.h b/src/util/util_threadpool.h index 35075e43..d5adbfc4 100644 --- a/src/util/util_threadpool.h +++ b/src/util/util_threadpool.h @@ -289,9 +289,9 @@ namespace dxvk { public: WorkerThreadPool(uint8_t numThreads, const char* workerName = "Nameless Worker Thread") - : m_numThread(numThreads) { + : m_numThread(std::clamp(numThreads, (uint8_t)1u, (uint8_t)dxvk::thread::hardware_concurrency())) { // Note: round up to a closest power-of-two so we can use mask as modulo - m_taskCount = 1 << (32 - bit::lzcnt(static_cast(NumTasksPerThread*numThreads) - 1)); + m_taskCount = 1 << (32 - bit::lzcnt(static_cast(NumTasksPerThread * m_numThread) - 1)); m_tasks.resize(m_taskCount); m_workerTasks.resize(m_numThread); m_workerThreads.resize(m_numThread); @@ -343,15 +343,11 @@ namespace dxvk { // Schedule a task to be executed by the thread pool template >> Future Schedule(F&& f) { - // Add the task to the queue and notify a worker thread - // just distribute evenly to all threads for some mask denoted by Affinity. - static size_t s_idx = 0; - // Is the affinity mask valid? 
const uint8_t affinityMask = std::min(popcnt_uint8(Affinity), m_numThread); // Schedule work on the appropriate thread - const uint32_t thread = fast::findNthBit(Affinity, (uint8_t) (s_idx++ % affinityMask)); + const uint32_t thread = fast::findNthBit(Affinity, (uint8_t) (m_schedulerIndex++ % affinityMask)); assert(thread < m_numThread); // Atomic queue is SPSC, so we don't need to take a lock here @@ -451,6 +447,10 @@ namespace dxvk { std::atomic m_taskId = 0; uint32_t m_taskCount; + // Add the task to the queue and notify a worker thread + // just distribute evenly to all threads for some mask denoted by Affinity. + size_t m_schedulerIndex = 0; + uint8_t m_numThread; std::atomic m_stopWork = false;