Skip to content

Commit

Permalink
gpu: split command and gfx queues
Browse files Browse the repository at this point in the history
  • Loading branch information
DHrpcs3 committed Oct 19, 2024
1 parent 5ce8d51 commit 3c32076
Show file tree
Hide file tree
Showing 11 changed files with 96 additions and 73 deletions.
14 changes: 9 additions & 5 deletions rpcsx/gpu/Cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,13 +469,16 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
barrier.image = image;
barrier.subresourceRange = subresourceRange;

auto layoutToStageAccess = [](VkImageLayout layout)
-> std::pair<VkPipelineStageFlags, VkAccessFlags> {
auto layoutToStageAccess =
[](VkImageLayout layout,
bool isSrc) -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
switch (layout) {
case VK_IMAGE_LAYOUT_UNDEFINED:
case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
case VK_IMAGE_LAYOUT_GENERAL:
return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};
return {isSrc ? VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
: VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
0};

case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};
Expand All @@ -501,8 +504,9 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
}
};

auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);
auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout, true);
auto [destinationStage, destinationAccess] =
layoutToStageAccess(newLayout, false);

barrier.srcAccessMask = sourceAccess;
barrier.dstAccessMask = destinationAccess;
Expand Down
30 changes: 22 additions & 8 deletions rpcsx/gpu/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,14 @@ Device::Device() : vkContext(createVkContext(this)) {
}
});

commandPipe.device = this;
commandPipe.ring = {
.base = cmdRing,
.size = std::size(cmdRing),
.rptr = cmdRing,
.wptr = cmdRing,
};

for (auto &pipe : computePipes) {
pipe.device = this;
}
Expand All @@ -244,7 +252,7 @@ Device::Device() : vkContext(createVkContext(this)) {
graphicsPipes[i].setDeQueue(
Ring{
.base = mainGfxRings[i],
.size = sizeof(mainGfxRings[i]) / sizeof(mainGfxRings[i][0]),
.size = std::size(mainGfxRings[i]),
.rptr = mainGfxRings[i],
.wptr = mainGfxRings[i],
},
Expand Down Expand Up @@ -621,6 +629,8 @@ void Device::onCommandBuffer(std::uint32_t pid, int cmdHeader,
bool Device::processPipes() {
bool allProcessed = true;

commandPipe.processAllRings();

for (auto &pipe : computePipes) {
if (!pipe.processAllRings()) {
allProcessed = false;
Expand Down Expand Up @@ -649,13 +659,16 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
barrier.image = image;
barrier.subresourceRange = subresourceRange;

auto layoutToStageAccess = [](VkImageLayout layout)
-> std::pair<VkPipelineStageFlags, VkAccessFlags> {
auto layoutToStageAccess =
[](VkImageLayout layout,
bool isSrc) -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
switch (layout) {
case VK_IMAGE_LAYOUT_UNDEFINED:
case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
case VK_IMAGE_LAYOUT_GENERAL:
return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};
return {isSrc ? VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
: VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
0};

case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};
Expand All @@ -681,8 +694,9 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
}
};

auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);
auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout, true);
auto [destinationStage, destinationAccess] =
layoutToStageAccess(newLayout, false);

barrier.srcAccessMask = sourceAccess;
barrier.dstAccessMask = destinationAccess;
Expand Down Expand Up @@ -783,13 +797,13 @@ bool Device::flip(std::uint32_t pid, int bufferIndex, std::uint64_t arg,
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = vk::context->presentCompleteSemaphore,
.value = 1,
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
.stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
},
{
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = scheduler.getSemaphoreHandle(),
.value = submitCompleteTask - 1,
.stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
.stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
},
};

Expand Down
2 changes: 2 additions & 0 deletions rpcsx/gpu/Device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ struct Device : orbis::RcBase, DeviceContext {
GpuTiler tiler;
GraphicsPipe graphicsPipes[kGfxPipeCount]{0, 1};
ComputePipe computePipes[kComputePipeCount]{0, 1, 2, 3, 4, 5, 6, 7};
CommandPipe commandPipe;
FlipPipeline flipPipeline;

orbis::shared_mutex writeCommandMtx;
Expand All @@ -94,6 +95,7 @@ struct Device : orbis::RcBase, DeviceContext {
};

std::uint32_t mainGfxRings[kGfxPipeCount][0x4000 / sizeof(std::uint32_t)];
std::uint32_t cmdRing[0x4000 / sizeof(std::uint32_t)];

Device();
~Device();
Expand Down
59 changes: 30 additions & 29 deletions rpcsx/gpu/DeviceCtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,6 @@ void DeviceCtl::submitGfxCommand(int gfxPipe, int vmId,
void DeviceCtl::submitSwitchBuffer(int gfxPipe) {
mDevice->submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_SWITCH_BUFFER, 0));
}
void DeviceCtl::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
std::uint64_t flipArg) {
mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_FLIP, bufferIndex,
flipArg & 0xffff'ffff,
flipArg >> 32, pid));
}

orbis::ErrorCode DeviceCtl::submitWriteEop(int gfxPipe, std::uint32_t waitMode,
std::uint64_t eopValue) {
Expand Down Expand Up @@ -107,40 +101,47 @@ orbis::ErrorCode DeviceCtl::submitFlipOnEop(int gfxPipe, std::uint32_t pid,

return {};
}
void DeviceCtl::submitFlip(std::uint32_t pid, int bufferIndex,
std::uint64_t flipArg) {
mDevice->submitCommand(mDevice->commandPipe.ring,
createPm4Packet(IT_FLIP, bufferIndex,
flipArg & 0xffff'ffff, flipArg >> 32,
pid));
}

void DeviceCtl::submitMapMemory(int gfxPipe, std::uint32_t pid,
std::uint64_t address, std::uint64_t size,
int memoryType, int dmemIndex, int prot,
std::int64_t offset) {
mDevice->submitGfxCommand(
gfxPipe,
void DeviceCtl::submitMapMemory(std::uint32_t pid, std::uint64_t address,
std::uint64_t size, int memoryType,
int dmemIndex, int prot, std::int64_t offset) {
mDevice->submitCommand(
mDevice->commandPipe.ring,
createPm4Packet(IT_MAP_MEMORY, pid, address & 0xffff'ffff, address >> 32,
size & 0xffff'ffff, size >> 32, memoryType, dmemIndex,
prot, offset & 0xffff'ffff, offset >> 32));
}
void DeviceCtl::submitUnmapMemory(int gfxPipe, std::uint32_t pid,
std::uint64_t address, std::uint64_t size) {
mDevice->submitGfxCommand(
gfxPipe, createPm4Packet(IT_UNMAP_MEMORY, pid, address & 0xffff'ffff,
address >> 32, size & 0xffff'ffff, size >> 32));
void DeviceCtl::submitUnmapMemory(std::uint32_t pid, std::uint64_t address,
std::uint64_t size) {
mDevice->submitCommand(mDevice->commandPipe.ring,
createPm4Packet(IT_UNMAP_MEMORY, pid,
address & 0xffff'ffff, address >> 32,
size & 0xffff'ffff, size >> 32));
}

void DeviceCtl::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) {
mDevice->submitGfxCommand(gfxPipe,
createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
void DeviceCtl::submitMapProcess(std::uint32_t pid, int vmId) {
mDevice->submitCommand(mDevice->commandPipe.ring,
createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
}

void DeviceCtl::submitUnmapProcess(int gfxPipe, std::uint32_t pid) {
mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_UNMAP_PROCESS, pid));
void DeviceCtl::submitUnmapProcess(std::uint32_t pid) {
mDevice->submitCommand(mDevice->commandPipe.ring,
createPm4Packet(IT_UNMAP_PROCESS, pid));
}

void DeviceCtl::submitProtectMemory(int gfxPipe, std::uint32_t pid,
std::uint64_t address, std::uint64_t size,
int prot) {
mDevice->submitGfxCommand(
gfxPipe,
createPm4Packet(IT_PROTECT_MEMORY, pid, address & 0xffff'ffff,
address >> 32, size & 0xffff'ffff, size >> 32, prot));
void DeviceCtl::submitProtectMemory(std::uint32_t pid, std::uint64_t address,
std::uint64_t size, int prot) {
mDevice->submitCommand(mDevice->commandPipe.ring,
createPm4Packet(IT_PROTECT_MEMORY, pid,
address & 0xffff'ffff, address >> 32,
size & 0xffff'ffff, size >> 32, prot));
}

void DeviceCtl::registerBuffer(std::uint32_t pid, Buffer buffer) {
Expand Down
15 changes: 7 additions & 8 deletions rpcsx/gpu/DeviceCtl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,21 @@ class DeviceCtl {
void submitGfxCommand(int gfxPipe, int vmId,
std::span<const std::uint32_t> command);
void submitSwitchBuffer(int gfxPipe);
void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
std::uint64_t flipArg);
orbis::ErrorCode submitWriteEop(int gfxPipe, std::uint32_t waitMode,
std::uint64_t eopValue);
orbis::ErrorCode submitFlipOnEop(int gfxPipe, std::uint32_t pid,
int bufferIndex, std::uint64_t flipArg,
std::uint64_t eopValue);
void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
void submitFlip(std::uint32_t pid, int bufferIndex, std::uint64_t flipArg);
void submitMapMemory(std::uint32_t pid, std::uint64_t address,
std::uint64_t size, int memoryType, int dmemIndex,
int prot, std::int64_t offset);
void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
void submitUnmapMemory(std::uint32_t pid, std::uint64_t address,
std::uint64_t size);
void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId);
void submitUnmapProcess(int gfxPipe, std::uint32_t pid);
void submitProtectMemory(int gfxPipe, std::uint32_t pid,
std::uint64_t address, std::uint64_t size, int prot);
void submitMapProcess(std::uint32_t pid, int vmId);
void submitUnmapProcess(std::uint32_t pid);
void submitProtectMemory(std::uint32_t pid, std::uint64_t address,
std::uint64_t size, int prot);
void registerBuffer(std::uint32_t pid, Buffer buffer);
void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);

Expand Down
6 changes: 4 additions & 2 deletions rpcsx/gpu/FlipPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ FlipPipeline::FlipPipeline() {
.pAttachments = &blendAttachmentState};

VkDynamicState dynamicStates[] = {
VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT,
VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT,
VK_DYNAMIC_STATE_VIEWPORT,
VK_DYNAMIC_STATE_SCISSOR,
};

VkPipelineDynamicStateCreateInfo dynamicState{
Expand All @@ -183,6 +183,7 @@ FlipPipeline::FlipPipeline() {
{
.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
.pNext = &info,
.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT,
.stageCount = std::size(stagesStd),
.pStages = stagesStd,
.pVertexInputState = &vertexInputState,
Expand All @@ -198,6 +199,7 @@ FlipPipeline::FlipPipeline() {
{
.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
.pNext = &info,
.flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT,
.stageCount = std::size(stagesAlt),
.pStages = stagesAlt,
.pVertexInputState = &vertexInputState,
Expand Down
2 changes: 1 addition & 1 deletion rpcsx/gpu/lib/vk/include/Scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ class Scheduler {
auto value = mSemaphore.getCounterValue();
auto endIt = mTasks.upper_bound(value);

for (auto it = mTasks.begin(); it != mTasks.end();
for (auto it = mTasks.begin(); it != endIt;
it = mTasks.erase(it)) {
taskList.reserve(taskList.size() + it->second.size());
for (auto &&fn : it->second) {
Expand Down
14 changes: 7 additions & 7 deletions rpcsx/iodev/dce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ struct ResolutionStatus {
std::uint32_t heigth;
std::uint32_t paneWidth;
std::uint32_t paneHeight;
std::uint32_t refreshHz; // float
std::uint32_t screenSizeInInch; // float
float refreshHz; // float
float screenSizeInInch; // float
std::byte padding[20];
};

Expand Down Expand Up @@ -316,8 +316,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
status->heigth = 1080;
status->paneWidth = 1920;
status->paneHeight = 1080;
status->refreshHz = 0x426fc28f; //( 59.94)
status->screenSizeInInch = 0x42500000; //( 52.00)
status->refreshHz = 59.94f;
status->screenSizeInInch = 52.0f;
} else if (args->id == 9) {
ORBIS_LOG_NOTICE("dce: FlipControl allocate", args->id, args->arg2,
args->ptr, args->size);
Expand Down Expand Up @@ -393,8 +393,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
auto args = reinterpret_cast<FlipRequestArgs *>(argp);

if (args->eop_nz == 0) {
gpu.submitFlip(thread->tproc->gfxRing, thread->tproc->pid,
args->displayBufferIndex, args->flipArg);
gpu.submitFlip(thread->tproc->pid, args->displayBufferIndex,
args->flipArg);
} else if (args->eop_nz == 1) {
std::uint64_t eopValue = args->canary;
eopValue ^= 0xff00'0000;
Expand Down Expand Up @@ -473,7 +473,7 @@ void DceDevice::initializeProcess(orbis::Process *process) {
std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
{
auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
gpu.submitMapProcess(process->gfxRing, process->pid, vmId);
gpu.submitMapProcess(process->pid, vmId);
process->vmId = vmId;
}

Expand Down
7 changes: 3 additions & 4 deletions rpcsx/iodev/dmem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,9 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
}

if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
gpu.submitMapMemory(orbis::g_currentThread->tproc->gfxRing,
orbis::g_currentThread->tproc->pid,
reinterpret_cast<std::uint64_t>(result), len,
memoryType, index, prot, directMemoryStart);
gpu.submitMapMemory(orbis::g_currentThread->tproc->pid,
reinterpret_cast<std::uint64_t>(result), len,
memoryType, index, prot, directMemoryStart);
}

*address = result;
Expand Down
5 changes: 4 additions & 1 deletion rpcsx/iodev/gc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,16 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
}

case 0xc0048116: { // submit done?
break;
}

case 0xc0048117:
if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
gpu.waitForIdle();
} else {
return orbis::ErrorCode::BUSY;
}
break;
}

case 0xc00c8110: {
// set gs ring sizes
Expand Down
15 changes: 7 additions & 8 deletions rpcsx/vm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -933,8 +933,8 @@ void *vm::map(void *addr, std::uint64_t len, std::int32_t prot,
if (auto thr = orbis::g_currentThread) {
std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
gpu.submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len,
-1, -1, prot, address - kMinAddress);
gpu.submitMapMemory(thr->tproc->pid, address, len, -1, -1, prot,
address - kMinAddress);
}
}

Expand Down Expand Up @@ -990,11 +990,11 @@ bool vm::unmap(void *addr, std::uint64_t size) {
if (auto thr = orbis::g_currentThread) {
std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
gpu.submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
size);
gpu.submitUnmapMemory(thr->tproc->pid, address, size);
}
} else {
std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size);
std::println(stderr, "ignoring unmapping {:x}-{:x}", address,
address + size);
}
return rx::mem::unmap(addr, size);
}
Expand Down Expand Up @@ -1032,10 +1032,9 @@ bool vm::protect(void *addr, std::uint64_t size, std::int32_t prot) {
std::println("memory prot: {:x}", prot);
std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
gpu.submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
size, prot);
gpu.submitProtectMemory(thr->tproc->pid, address, size, prot);
}
} else {
} else if (prot >> 4) {
std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size);
}
return ::mprotect(addr, size, prot & kMapProtCpuAll) == 0;
Expand Down

0 comments on commit 3c32076

Please sign in to comment.