From 3c3207611bc01824b17871af4bfe1650ef315dcd Mon Sep 17 00:00:00 2001 From: DH Date: Sat, 19 Oct 2024 15:36:22 +0300 Subject: [PATCH] gpu: split command and gfx queues --- rpcsx/gpu/Cache.cpp | 14 +++--- rpcsx/gpu/Device.cpp | 30 +++++++++---- rpcsx/gpu/Device.hpp | 2 + rpcsx/gpu/DeviceCtl.cpp | 59 +++++++++++++------------- rpcsx/gpu/DeviceCtl.hpp | 15 +++---- rpcsx/gpu/FlipPipeline.cpp | 6 ++- rpcsx/gpu/lib/vk/include/Scheduler.hpp | 2 +- rpcsx/iodev/dce.cpp | 14 +++--- rpcsx/iodev/dmem.cpp | 7 ++- rpcsx/iodev/gc.cpp | 5 ++- rpcsx/vm.cpp | 15 +++---- 11 files changed, 96 insertions(+), 73 deletions(-) diff --git a/rpcsx/gpu/Cache.cpp b/rpcsx/gpu/Cache.cpp index bb0aa240..3c000f54 100644 --- a/rpcsx/gpu/Cache.cpp +++ b/rpcsx/gpu/Cache.cpp @@ -469,13 +469,16 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, barrier.image = image; barrier.subresourceRange = subresourceRange; - auto layoutToStageAccess = [](VkImageLayout layout) - -> std::pair { + auto layoutToStageAccess = + [](VkImageLayout layout, + bool isSrc) -> std::pair { switch (layout) { case VK_IMAGE_LAYOUT_UNDEFINED: case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: case VK_IMAGE_LAYOUT_GENERAL: - return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0}; + return {isSrc ? VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT + : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + 0}; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT}; @@ -501,8 +504,9 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, } }; - auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout); - auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout); + auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout, true); + auto [destinationStage, destinationAccess] = + layoutToStageAccess(newLayout, false); barrier.srcAccessMask = sourceAccess; barrier.dstAccessMask = destinationAccess; diff --git a/rpcsx/gpu/Device.cpp b/rpcsx/gpu/Device.cpp index 6e053f92..f2b84b64 100644 --- a/rpcsx/gpu/Device.cpp +++ b/rpcsx/gpu/Device.cpp @@ -236,6 +236,14 @@ Device::Device() : vkContext(createVkContext(this)) { } }); + commandPipe.device = this; + commandPipe.ring = { + .base = cmdRing, + .size = std::size(cmdRing), + .rptr = cmdRing, + .wptr = cmdRing, + }; + for (auto &pipe : computePipes) { pipe.device = this; } @@ -244,7 +252,7 @@ Device::Device() : vkContext(createVkContext(this)) { graphicsPipes[i].setDeQueue( Ring{ .base = mainGfxRings[i], - .size = sizeof(mainGfxRings[i]) / sizeof(mainGfxRings[i][0]), + .size = std::size(mainGfxRings[i]), .rptr = mainGfxRings[i], .wptr = mainGfxRings[i], }, @@ -621,6 +629,8 @@ void Device::onCommandBuffer(std::uint32_t pid, int cmdHeader, bool Device::processPipes() { bool allProcessed = true; + commandPipe.processAllRings(); + for (auto &pipe : computePipes) { if (!pipe.processAllRings()) { allProcessed = false; @@ -649,13 +659,16 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, barrier.image = image; barrier.subresourceRange = subresourceRange; - auto layoutToStageAccess = [](VkImageLayout layout) - -> std::pair { + auto layoutToStageAccess = + [](VkImageLayout layout, + bool isSrc) -> std::pair { switch (layout) { case VK_IMAGE_LAYOUT_UNDEFINED: case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: case VK_IMAGE_LAYOUT_GENERAL: - return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0}; + return {isSrc ? VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT + : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + 0}; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT}; @@ -681,8 +694,9 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, } }; - auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout); - auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout); + auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout, true); + auto [destinationStage, destinationAccess] = + layoutToStageAccess(newLayout, false); barrier.srcAccessMask = sourceAccess; barrier.dstAccessMask = destinationAccess; @@ -783,13 +797,13 @@ bool Device::flip(std::uint32_t pid, int bufferIndex, std::uint64_t arg, .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, .semaphore = vk::context->presentCompleteSemaphore, .value = 1, - .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + .stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, }, { .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, .semaphore = scheduler.getSemaphoreHandle(), .value = submitCompleteTask - 1, - .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + .stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, }, }; diff --git a/rpcsx/gpu/Device.hpp b/rpcsx/gpu/Device.hpp index 7bc4979d..4f3c65da 100644 --- a/rpcsx/gpu/Device.hpp +++ b/rpcsx/gpu/Device.hpp @@ -78,6 +78,7 @@ struct Device : orbis::RcBase, DeviceContext { GpuTiler tiler; GraphicsPipe graphicsPipes[kGfxPipeCount]{0, 1}; ComputePipe computePipes[kComputePipeCount]{0, 1, 2, 3, 4, 5, 6, 7}; + CommandPipe commandPipe; FlipPipeline flipPipeline; orbis::shared_mutex writeCommandMtx; @@ -94,6 +95,7 @@ struct Device : orbis::RcBase, DeviceContext { }; std::uint32_t mainGfxRings[kGfxPipeCount][0x4000 / sizeof(std::uint32_t)]; + std::uint32_t cmdRing[0x4000 / sizeof(std::uint32_t)]; Device(); ~Device(); diff --git a/rpcsx/gpu/DeviceCtl.cpp b/rpcsx/gpu/DeviceCtl.cpp index 9a8c5ffd..ecedbb1d 100644 --- a/rpcsx/gpu/DeviceCtl.cpp +++ b/rpcsx/gpu/DeviceCtl.cpp @@ -54,12 +54,6 @@ void DeviceCtl::submitGfxCommand(int gfxPipe, int vmId, void DeviceCtl::submitSwitchBuffer(int gfxPipe) { mDevice->submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_SWITCH_BUFFER, 0)); } -void DeviceCtl::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex, - std::uint64_t flipArg) { - mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_FLIP, bufferIndex, - flipArg & 0xffff'ffff, - flipArg >> 32, pid)); -} orbis::ErrorCode DeviceCtl::submitWriteEop(int gfxPipe, std::uint32_t waitMode, std::uint64_t eopValue) { @@ -107,40 +101,47 @@ orbis::ErrorCode DeviceCtl::submitFlipOnEop(int gfxPipe, std::uint32_t pid, return {}; } +void DeviceCtl::submitFlip(std::uint32_t pid, int bufferIndex, + std::uint64_t flipArg) { + mDevice->submitCommand(mDevice->commandPipe.ring, + createPm4Packet(IT_FLIP, bufferIndex, + flipArg & 0xffff'ffff, flipArg >> 32, + pid)); +} -void DeviceCtl::submitMapMemory(int gfxPipe, std::uint32_t pid, - std::uint64_t address, std::uint64_t size, - int memoryType, int dmemIndex, int prot, - std::int64_t offset) { - mDevice->submitGfxCommand( - gfxPipe, +void DeviceCtl::submitMapMemory(std::uint32_t pid, std::uint64_t address, + std::uint64_t size, int memoryType, + int dmemIndex, int prot, std::int64_t offset) { + mDevice->submitCommand( + mDevice->commandPipe.ring, createPm4Packet(IT_MAP_MEMORY, pid, address & 0xffff'ffff, address >> 32, size & 0xffff'ffff, size >> 32, memoryType, dmemIndex, prot, offset & 0xffff'ffff, offset >> 32)); } -void DeviceCtl::submitUnmapMemory(int gfxPipe, std::uint32_t pid, - std::uint64_t address, std::uint64_t size) { - mDevice->submitGfxCommand( - gfxPipe, createPm4Packet(IT_UNMAP_MEMORY, pid, address & 0xffff'ffff, - address >> 32, size & 0xffff'ffff, size >> 32)); +void DeviceCtl::submitUnmapMemory(std::uint32_t pid, std::uint64_t address, + std::uint64_t size) { + mDevice->submitCommand(mDevice->commandPipe.ring, + createPm4Packet(IT_UNMAP_MEMORY, pid, + address & 0xffff'ffff, address >> 32, + size & 0xffff'ffff, size >> 32)); } -void DeviceCtl::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) { - mDevice->submitGfxCommand(gfxPipe, - createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId)); +void DeviceCtl::submitMapProcess(std::uint32_t pid, int vmId) { + mDevice->submitCommand(mDevice->commandPipe.ring, + createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId)); } -void DeviceCtl::submitUnmapProcess(int gfxPipe, std::uint32_t pid) { - mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_UNMAP_PROCESS, pid)); +void DeviceCtl::submitUnmapProcess(std::uint32_t pid) { + mDevice->submitCommand(mDevice->commandPipe.ring, + createPm4Packet(IT_UNMAP_PROCESS, pid)); } -void DeviceCtl::submitProtectMemory(int gfxPipe, std::uint32_t pid, - std::uint64_t address, std::uint64_t size, - int prot) { - mDevice->submitGfxCommand( - gfxPipe, - createPm4Packet(IT_PROTECT_MEMORY, pid, address & 0xffff'ffff, - address >> 32, size & 0xffff'ffff, size >> 32, prot)); +void DeviceCtl::submitProtectMemory(std::uint32_t pid, std::uint64_t address, + std::uint64_t size, int prot) { + mDevice->submitCommand(mDevice->commandPipe.ring, + createPm4Packet(IT_PROTECT_MEMORY, pid, + address & 0xffff'ffff, address >> 32, + size & 0xffff'ffff, size >> 32, prot)); } void DeviceCtl::registerBuffer(std::uint32_t pid, Buffer buffer) { diff --git a/rpcsx/gpu/DeviceCtl.hpp b/rpcsx/gpu/DeviceCtl.hpp index fdaeb47f..0bc41fbf 100644 --- a/rpcsx/gpu/DeviceCtl.hpp +++ b/rpcsx/gpu/DeviceCtl.hpp @@ -28,22 +28,21 @@ class DeviceCtl { void submitGfxCommand(int gfxPipe, int vmId, std::span command); void submitSwitchBuffer(int gfxPipe); - void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex, - std::uint64_t flipArg); orbis::ErrorCode submitWriteEop(int gfxPipe, std::uint32_t waitMode, std::uint64_t eopValue); orbis::ErrorCode submitFlipOnEop(int gfxPipe, std::uint32_t pid, int bufferIndex, std::uint64_t flipArg, std::uint64_t eopValue); - void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address, + void submitFlip(std::uint32_t pid, int bufferIndex, std::uint64_t flipArg); + void submitMapMemory(std::uint32_t pid, std::uint64_t address, std::uint64_t size, int memoryType, int dmemIndex, int prot, std::int64_t offset); - void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address, + void submitUnmapMemory(std::uint32_t pid, std::uint64_t address, std::uint64_t size); - void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId); - void submitUnmapProcess(int gfxPipe, std::uint32_t pid); - void submitProtectMemory(int gfxPipe, std::uint32_t pid, - std::uint64_t address, std::uint64_t size, int prot); + void submitMapProcess(std::uint32_t pid, int vmId); + void submitUnmapProcess(std::uint32_t pid); + void submitProtectMemory(std::uint32_t pid, std::uint64_t address, + std::uint64_t size, int prot); void registerBuffer(std::uint32_t pid, Buffer buffer); void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr); diff --git a/rpcsx/gpu/FlipPipeline.cpp b/rpcsx/gpu/FlipPipeline.cpp index dfff6af2..917c9558 100644 --- a/rpcsx/gpu/FlipPipeline.cpp +++ b/rpcsx/gpu/FlipPipeline.cpp @@ -161,8 +161,8 @@ FlipPipeline::FlipPipeline() { .pAttachments = &blendAttachmentState}; VkDynamicState dynamicStates[] = { - VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT, - VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT, + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, }; VkPipelineDynamicStateCreateInfo dynamicState{ @@ -183,6 +183,7 @@ FlipPipeline::FlipPipeline() { { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = &info, + .flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT, .stageCount = std::size(stagesStd), .pStages = stagesStd, .pVertexInputState = &vertexInputState, @@ -198,6 +199,7 @@ FlipPipeline::FlipPipeline() { { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = &info, + .flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT, .stageCount = std::size(stagesAlt), .pStages = stagesAlt, .pVertexInputState = &vertexInputState, diff --git a/rpcsx/gpu/lib/vk/include/Scheduler.hpp b/rpcsx/gpu/lib/vk/include/Scheduler.hpp index 96303bd1..b161b873 100644 --- a/rpcsx/gpu/lib/vk/include/Scheduler.hpp +++ b/rpcsx/gpu/lib/vk/include/Scheduler.hpp @@ -180,7 +180,7 @@ class Scheduler { auto value = mSemaphore.getCounterValue(); auto endIt = mTasks.upper_bound(value); - for (auto it = mTasks.begin(); it != mTasks.end(); + for (auto it = mTasks.begin(); it != endIt; it = mTasks.erase(it)) { taskList.reserve(taskList.size() + it->second.size()); for (auto &&fn : it->second) { diff --git a/rpcsx/iodev/dce.cpp b/rpcsx/iodev/dce.cpp index cbbbbf8e..0ca39c96 100644 --- a/rpcsx/iodev/dce.cpp +++ b/rpcsx/iodev/dce.cpp @@ -100,8 +100,8 @@ struct ResolutionStatus { std::uint32_t heigth; std::uint32_t paneWidth; std::uint32_t paneHeight; - std::uint32_t refreshHz; // float - std::uint32_t screenSizeInInch; // float + float refreshHz; // float + float screenSizeInInch; // float std::byte padding[20]; }; @@ -316,8 +316,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, status->heigth = 1080; status->paneWidth = 1920; status->paneHeight = 1080; - status->refreshHz = 0x426fc28f; //( 59.94) - status->screenSizeInInch = 0x42500000; //( 52.00) + status->refreshHz = 59.94f; + status->screenSizeInInch = 52.0f; } else if (args->id == 9) { ORBIS_LOG_NOTICE("dce: FlipControl allocate", args->id, args->arg2, args->ptr, args->size); @@ -393,8 +393,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, auto args = reinterpret_cast(argp); if (args->eop_nz == 0) { - gpu.submitFlip(thread->tproc->gfxRing, thread->tproc->pid, - args->displayBufferIndex, args->flipArg); + gpu.submitFlip(thread->tproc->pid, args->displayBufferIndex, + args->flipArg); } else if (args->eop_nz == 1) { std::uint64_t eopValue = args->canary; eopValue ^= 0xff00'0000; @@ -473,7 +473,7 @@ void DceDevice::initializeProcess(orbis::Process *process) { std::lock_guard lock(orbis::g_context.gpuDeviceMtx); { auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}; - gpu.submitMapProcess(process->gfxRing, process->pid, vmId); + gpu.submitMapProcess(process->pid, vmId); process->vmId = vmId; } diff --git a/rpcsx/iodev/dmem.cpp b/rpcsx/iodev/dmem.cpp index 58db33d1..88333724 100644 --- a/rpcsx/iodev/dmem.cpp +++ b/rpcsx/iodev/dmem.cpp @@ -69,10 +69,9 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len, } if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { - gpu.submitMapMemory(orbis::g_currentThread->tproc->gfxRing, - orbis::g_currentThread->tproc->pid, - reinterpret_cast(result), len, - memoryType, index, prot, directMemoryStart); + gpu.submitMapMemory(orbis::g_currentThread->tproc->pid, + reinterpret_cast(result), len, + memoryType, index, prot, directMemoryStart); } *address = result; diff --git a/rpcsx/iodev/gc.cpp b/rpcsx/iodev/gc.cpp index 0d4c6903..b36edf96 100644 --- a/rpcsx/iodev/gc.cpp +++ b/rpcsx/iodev/gc.cpp @@ -143,13 +143,16 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, } case 0xc0048116: { // submit done? + break; + } + + case 0xc0048117: if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { gpu.waitForIdle(); } else { return orbis::ErrorCode::BUSY; } break; - } case 0xc00c8110: { // set gs ring sizes diff --git a/rpcsx/vm.cpp b/rpcsx/vm.cpp index 8fd5f6f6..e0363c68 100644 --- a/rpcsx/vm.cpp +++ b/rpcsx/vm.cpp @@ -933,8 +933,8 @@ void *vm::map(void *addr, std::uint64_t len, std::int32_t prot, if (auto thr = orbis::g_currentThread) { std::lock_guard lock(orbis::g_context.gpuDeviceMtx); if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { - gpu.submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len, - -1, -1, prot, address - kMinAddress); + gpu.submitMapMemory(thr->tproc->pid, address, len, -1, -1, prot, + address - kMinAddress); } } @@ -990,11 +990,11 @@ bool vm::unmap(void *addr, std::uint64_t size) { if (auto thr = orbis::g_currentThread) { std::lock_guard lock(orbis::g_context.gpuDeviceMtx); if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { - gpu.submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, - size); + gpu.submitUnmapMemory(thr->tproc->pid, address, size); } } else { - std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size); + std::println(stderr, "ignoring unmapping {:x}-{:x}", address, + address + size); } return rx::mem::unmap(addr, size); } @@ -1032,10 +1032,9 @@ bool vm::protect(void *addr, std::uint64_t size, std::int32_t prot) { std::println("memory prot: {:x}", prot); std::lock_guard lock(orbis::g_context.gpuDeviceMtx); if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { - gpu.submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address, - size, prot); + gpu.submitProtectMemory(thr->tproc->pid, address, size, prot); } - } else { + } else if (prot >> 4) { std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size); } return ::mprotect(addr, size, prot & kMapProtCpuAll) == 0;