diff --git a/CMakeLists.txt b/CMakeLists.txt
index f340bae6..9f5f2356 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,8 @@ project(rpcsx)
 set(CMAKE_CXX_EXTENSIONS off)
 set(CMAKE_CXX_STANDARD 23)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD_REQUIRED on)
+set(CMAKE_BUILD_RPATH_USE_ORIGIN on)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
@@ -41,7 +42,7 @@ function(add_precompiled_vulkan_spirv target)
     add_custom_command(
       OUTPUT ${outputpath}
-      COMMAND $<TARGET_FILE:glslang::glslang-standalone> -V --target-env vulkan1.3 --vn "${varname}" -o "${outputpath}" "${CMAKE_CURRENT_SOURCE_DIR}/${input}"
+      COMMAND $<TARGET_FILE:glslang::glslang-standalone> -V --target-env vulkan1.2 --vn "${varname}" -o "${outputpath}" "${CMAKE_CURRENT_SOURCE_DIR}/${input}"
       DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${input}" glslang::glslang-standalone
       COMMENT "Generating ${outputname}..."
     )
@@ -65,6 +66,7 @@ add_subdirectory(tools)
 add_subdirectory(orbis-kernel)
 add_subdirectory(rpcsx-os)
 add_subdirectory(rpcsx-gpu)
+add_subdirectory(rpcsx-gpu2)
 add_subdirectory(hw/amdgpu)
 add_subdirectory(rx)
diff --git a/hw/amdgpu/device/src/rect_list.geom.glsl b/hw/amdgpu/device/src/rect_list.geom.glsl
index 84123923..287c864a 100644
--- a/hw/amdgpu/device/src/rect_list.geom.glsl
+++ b/hw/amdgpu/device/src/rect_list.geom.glsl
@@ -1,6 +1,6 @@
 #version 450
 
-layout (triangles) in;
+layout (triangles, invocations = 1) in;
 layout (triangle_strip, max_vertices = 4) out;
 
 void main(void)
diff --git a/rpcsx-gpu2/CMakeLists.txt b/rpcsx-gpu2/CMakeLists.txt
new file mode 100644
index 00000000..f15d0907
--- /dev/null
+++ b/rpcsx-gpu2/CMakeLists.txt
@@ -0,0 +1,36 @@
+find_package(glfw3 3.3 REQUIRED)
+
+add_precompiled_vulkan_spirv(rpcsx-gpu-shaders
+  shaders/fill_red.frag.glsl
+  shaders/flip.frag.glsl
+  shaders/flip.vert.glsl
+  shaders/rect_list.geom.glsl
+)
+
+add_executable(rpcsx-gpu2
+  Cache.cpp
+  main.cpp
+  Device.cpp
+  Pipe.cpp
+  Registers.cpp
+  Renderer.cpp
+)
+
+target_link_libraries(rpcsx-gpu2
+PUBLIC
+  rpcsx-gpu-shaders
+  amdgpu::bridge
+  rx
+  gcn-shader
+  glfw
+  amdgpu::tiler::cpu
+  amdgpu::tiler::vulkan
+  rdna-semantic-spirv
+  gnm::vulkan
+  gnm
+)
+
+install(TARGETS rpcsx-gpu2 RUNTIME DESTINATION bin)
+set_target_properties(rpcsx-gpu2 PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+add_subdirectory(lib)
diff --git a/rpcsx-gpu2/Cache.cpp b/rpcsx-gpu2/Cache.cpp
new file mode 100644
index 00000000..a10f331c
--- /dev/null
+++ b/rpcsx-gpu2/Cache.cpp
@@ -0,0 +1,1109 @@
+#include "Cache.hpp"
+#include "Device.hpp"
+#include "amdgpu/tiler.hpp"
+#include "gnm/vulkan.hpp"
+#include "rx/MemoryTable.hpp"
+#include "rx/die.hpp"
+#include "shader/GcnConverter.hpp"
+#include "shader/dialect.hpp"
+#include "shader/glsl.hpp"
+#include "shader/spv.hpp"
+#include "vk.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <optional>
+
+using namespace amdgpu;
+
+static bool isPrimRequiresConversion(gnm::PrimitiveType primType) {
+  switch (primType) {
+  case gnm::PrimitiveType::PointList:
+  case gnm::PrimitiveType::LineList:
+  case gnm::PrimitiveType::LineStrip:
+  case gnm::PrimitiveType::TriList:
+  case gnm::PrimitiveType::TriFan:
+  case gnm::PrimitiveType::TriStrip:
+  case gnm::PrimitiveType::Patch:
+  case gnm::PrimitiveType::LineListAdjacency:
+  case gnm::PrimitiveType::LineStripAdjacency:
+  case gnm::PrimitiveType::TriListAdjacency:
+  case gnm::PrimitiveType::TriStripAdjacency:
+    return false;
+
+  case gnm::PrimitiveType::LineLoop: // FIXME
+    rx::die("unimplemented line loop primitive");
+    return false;
+
+  case gnm::PrimitiveType::RectList:
+    return false;
+
+  case gnm::PrimitiveType::QuadList:
+  case gnm::PrimitiveType::QuadStrip:
+  case gnm::PrimitiveType::Polygon:
+    return true;
+
+  default:
+    rx::die("unknown primitive type: %u", (unsigned)primType);
+  }
+}
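+// Note: RectList needs no index conversion here because it is expanded to a
+// triangle strip by the rect_list geometry shader patched above; QuadList and
+// QuadStrip are instead remapped to TriList on the CPU by the index
+// converters below (Polygon also reports true but has no converter yet).
+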
+static std::pair<std::uint64_t, std::uint64_t>
+quadListPrimConverter(std::uint64_t index) {
+  static constexpr int indices[] = {0, 1, 2, 2, 3, 0};
+  return {index, index / 6 * 4 + indices[index % 6]};
+}
+
+static std::pair<std::uint64_t, std::uint64_t>
+quadStripPrimConverter(std::uint64_t index) {
+  static constexpr int indices[] = {0, 1, 3, 0, 3, 2};
+  return {index, (index / 6) * 4 + indices[index % 6]};
+}
+
+using ConverterFn =
+    std::pair<std::uint64_t, std::uint64_t>(std::uint64_t index);
+
+static ConverterFn *getPrimConverterFn(gnm::PrimitiveType primType,
+                                       std::uint32_t *count) {
+  switch (primType) {
+  case gnm::PrimitiveType::QuadList:
+    *count = *count / 4 * 6;
+    return quadListPrimConverter;
+
+  case gnm::PrimitiveType::QuadStrip:
+    *count = *count / 4 * 6;
+    return quadStripPrimConverter;
+
+  default:
+    rx::die("getPrimConverterFn: unexpected primType %u",
+            static_cast<unsigned>(primType));
+  }
+}
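+// Worked example of the quad list remap above: quad q covers source vertices
+// 4q..4q+3 and expands to six output indices, so output index 7 resolves to
+// 7 / 6 * 4 + indices[7 % 6] = 4 + indices[1] = source vertex 5.
+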
+static VkShaderStageFlagBits shaderStageToVk(shader::gcn::Stage stage) {
+  switch (stage) {
+  case shader::gcn::Stage::Ps:
+    return VK_SHADER_STAGE_FRAGMENT_BIT;
+  case shader::gcn::Stage::VsVs:
+    return VK_SHADER_STAGE_VERTEX_BIT;
+  // case shader::gcn::Stage::VsEs:
+  // case shader::gcn::Stage::VsLs:
+  case shader::gcn::Stage::Cs:
+    return VK_SHADER_STAGE_COMPUTE_BIT;
+  // case shader::gcn::Stage::Gs:
+  // case shader::gcn::Stage::GsVs:
+  // case shader::gcn::Stage::Hs:
+  // case shader::gcn::Stage::DsVs:
+  // case shader::gcn::Stage::DsEs:
+
+  default:
+    rx::die("unsupported shader stage %u", int(stage));
+  }
+}
+
+static void fillStageBindings(VkDescriptorSetLayoutBinding *bindings,
+                              VkShaderStageFlagBits stage, int setIndex) {
+
+  auto createDescriptorBinding = [&](VkDescriptorType type, uint32_t count,
+                                     int dim = 0) {
+    auto binding = Cache::getDescriptorBinding(type, dim);
+    rx::dieIf(binding < 0, "unexpected descriptor type %#x\n", int(type));
+    bindings[binding] = VkDescriptorSetLayoutBinding{
+        .binding = static_cast<std::uint32_t>(binding),
+        .descriptorType = type,
+        .descriptorCount = count,
+        .stageFlags = VkShaderStageFlags(
+            stage | (binding > 0 && stage != VK_SHADER_STAGE_COMPUTE_BIT
+                         ? VK_SHADER_STAGE_ALL_GRAPHICS
+                         : 0)),
+        .pImmutableSamplers = nullptr,
+    };
+  };
+
+  createDescriptorBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1);
+  if (setIndex == 0) {
+    createDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLER, 16);
+    createDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 16, 1);
+    createDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 16, 2);
+    createDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 16, 3);
+    createDescriptorBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 16);
+  }
+}
+
+static void
+transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
+                      VkImageLayout oldLayout, VkImageLayout newLayout,
+                      const VkImageSubresourceRange &subresourceRange) {
+  VkImageMemoryBarrier barrier{};
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.oldLayout = oldLayout;
+  barrier.newLayout = newLayout;
+  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  barrier.image = image;
+  barrier.subresourceRange = subresourceRange;
+
+  auto layoutToStageAccess = [](VkImageLayout layout)
+      -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
+    switch (layout) {
+    case VK_IMAGE_LAYOUT_UNDEFINED:
+    case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
+    case VK_IMAGE_LAYOUT_GENERAL:
+      return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};
+
+    case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
+      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};
+
+    case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL:
+      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT};
+
+    case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
+      return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT};
+
+    case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
+      return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,
+              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
+                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT};
+
+    case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL:
+      return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+              VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
+                  VK_ACCESS_COLOR_ATTACHMENT_READ_BIT};
+
+    default:
+      std::abort();
+    }
+  };
+
+  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
+  auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);
+
+  barrier.srcAccessMask = sourceAccess;
+  barrier.dstAccessMask = destinationAccess;
+
+  vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0,
+                       nullptr, 0, nullptr, 1, &barrier);
+}
+
+struct Cache::Entry {
+  virtual ~Entry() = default;
+
+  Cache::TagId tagId;
+  std::uint64_t baseAddress;
+  Access acquiredAccess = Access::None;
+
+  virtual void flush(Cache::Tag &tag, Scheduler &scheduler,
+                     std::uint64_t beginAddress, std::uint64_t endAddress) {}
+};
+
+struct CachedShader : Cache::Entry {
+  std::uint64_t magic;
+  VkShaderEXT handle;
+  shader::gcn::ShaderInfo info;
+  std::vector<std::pair<std::uint64_t, std::vector<std::byte>>> usedMemory;
+
+  ~CachedShader() {
+    vk::DestroyShaderEXT(vk::context->device, handle, vk::context->allocator);
+  }
+};
+
+struct CachedBuffer : Cache::Entry {
+  vk::Buffer buffer;
+  std::size_t size;
+
+  void flush(Cache::Tag &tag, Scheduler &scheduler, std::uint64_t beginAddress,
+             std::uint64_t endAddress) override {
+    if ((acquiredAccess & Access::Write) == Access::None) {
+      return;
+    }
+
+    // std::printf("writing buffer to memory %lx\n", baseAddress);
+    std::memcpy(RemoteMemory{tag.getVmId()}.getPointer<void>(baseAddress),
+                buffer.getData(), size);
+  }
+};
+
+struct CachedIndexBuffer : Cache::Entry {
+  vk::Buffer buffer;
+  std::size_t size;
+  gnm::IndexType indexType;
+  gnm::PrimitiveType
primType; +}; + +struct CachedImage : Cache::Entry { + vk::Image image; + SurfaceInfo info; + TileMode acquiredTileMode; + + void flush(Cache::Tag &tag, Scheduler &scheduler, std::uint64_t beginAddress, + std::uint64_t endAddress) override { + if ((acquiredAccess & Access::Write) == Access::None) { + return; + } + + // std::printf("writing image to buffer to %lx\n", baseAddress); + + VkImageSubresourceRange subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = image.getMipLevels(), + .baseArrayLayer = 0, + .layerCount = image.getArrayLayers(), + }; + + auto transferBuffer = vk::Buffer::Allocate( + vk::getDeviceLocalMemory(), info.totalSize, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); + auto tiledBuffer = + tag.getBuffer(baseAddress, info.totalSize, Access::Write); + auto &tiler = tag.getDevice()->tiler; + + transitionImageLayout( + scheduler.getCommandBuffer(), image, VK_IMAGE_LAYOUT_GENERAL, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresourceRange); + + for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) { + VkBufferImageCopy region = { + .bufferRowLength = + mipLevel > 0 ? 0 : std::max(info.pitch >> mipLevel, 1u), + .imageSubresource = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = mipLevel, + .baseArrayLayer = 0, + .layerCount = image.getArrayLayers(), + }, + .imageExtent = + { + .width = std::max(image.getWidth() >> mipLevel, 1u), + .height = std::max(image.getHeight() >> mipLevel, 1u), + .depth = std::max(image.getDepth() >> mipLevel, 1u), + }, + }; + + vkCmdCopyImageToBuffer(scheduler.getCommandBuffer(), image.getHandle(), + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + transferBuffer.getHandle(), 1, ®ion); + + tiler.tile(scheduler, info, acquiredTileMode, transferBuffer.getAddress(), + tiledBuffer.deviceAddress, mipLevel, 0, + image.getArrayLayers()); + } + + transitionImageLayout(scheduler.getCommandBuffer(), image, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + VK_IMAGE_LAYOUT_GENERAL, subresourceRange); + // scheduler.afterSubmit([transferBuffer = std::move(transferBuffer)] {}); + scheduler.submit(); + scheduler.wait(); + } +}; + +struct CachedImageView : Cache::Entry { + vk::ImageView view; +}; + +ImageKey ImageKey::createFrom(const gnm::TBuffer &buffer) { + return { + .address = buffer.address(), + .type = buffer.type, + .dfmt = buffer.dfmt, + .nfmt = buffer.nfmt, + .tileMode = getDefaultTileModes()[buffer.tiling_idx], + .offset = {}, + .extent = + { + .width = buffer.width + 1u, + .height = buffer.height + 1u, + .depth = buffer.depth + 1u, + }, + .pitch = buffer.pitch + 1u, + .baseMipLevel = static_cast(buffer.base_level), + .mipCount = buffer.last_level - buffer.base_level + 1u, + .baseArrayLayer = static_cast(buffer.base_array), + .arrayLayerCount = buffer.last_array - buffer.base_array + 1u, + .pow2pad = buffer.pow2pad != 0, + }; +} + +ImageViewKey ImageViewKey::createFrom(const gnm::TBuffer &buffer) { + ImageViewKey result{}; + static_cast(result) = ImageKey::createFrom(buffer); + result.R = buffer.dst_sel_x; + result.G = buffer.dst_sel_y; + result.B = buffer.dst_sel_z; + result.A = buffer.dst_sel_w; + return result; +} + +SamplerKey SamplerKey::createFrom(const gnm::SSampler &sampler) { + float lodBias = ((std::int16_t(sampler.lod_bias) << 2) >> 2) / float(256.f); + // FIXME: lodBias can be scaled by gnm::TBuffer + + return { + .magFilter = toVkFilter(sampler.xy_mag_filter), + .minFilter = toVkFilter(sampler.xy_min_filter), + .mipmapMode = 
toVkSamplerMipmapMode(sampler.mip_filter), + .addressModeU = toVkSamplerAddressMode(sampler.clamp_x), + .addressModeV = toVkSamplerAddressMode(sampler.clamp_y), + .addressModeW = toVkSamplerAddressMode(sampler.clamp_z), + .mipLodBias = lodBias, + .maxAnisotropy = 0, // max_aniso_ratio + .compareOp = toVkCompareOp(sampler.depth_compare_func), + .minLod = static_cast(sampler.min_lod), + .maxLod = static_cast(sampler.max_lod), + .borderColor = toVkBorderColor(sampler.border_color_type), + .anisotropyEnable = false, + .compareEnable = sampler.depth_compare_func != gnm::CompareFunc::Never, + .unnormalizedCoordinates = sampler.force_unorm_coords != 0, + }; +} + +Cache::Shader Cache::Tag::getShader(const ShaderKey &key, + const ShaderKey *dependedKey) { + auto stage = shaderStageToVk(key.stage); + if (auto result = findShader(key, dependedKey)) { + auto cachedShader = static_cast(result.get()); + mAcquiredResources.push_back(result); + return {cachedShader->handle, &cachedShader->info, stage}; + } + + auto vmId = mParent->mVmIm; + + std::optional converted; + + { + shader::gcn::Context context; + auto deserialized = shader::gcn::deserialize( + context, key.env, mParent->mDevice->gcnSemantic, key.address, + [vmId](std::uint64_t address) -> std::uint32_t { + return *RemoteMemory{vmId}.getPointer(address); + }); + + // deserialized.print(std::cerr, context.ns); + + converted = shader::gcn::convertToSpv( + context, deserialized, mParent->mDevice->gcnSemanticModuleInfo, + key.stage, key.env); + if (!converted) { + return {}; + } + + converted->info.resources.dump(); + if (!shader::spv::validate(converted->spv)) { + shader::spv::dump(converted->spv, true); + return {}; + } + + std::fprintf(stderr, "%s", shader::glsl::decompile(converted->spv).c_str()); + // if (auto opt = shader::spv::optimize(converted->spv)) { + // converted->spv = std::move(*opt); + // std::fprintf(stderr, "opt: %s", + // shader::glsl::decompile(converted->spv).c_str()); + // } else { + // std::printf("optimization failed\n"); + // } + } + + VkShaderCreateInfoEXT createInfo{ + .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT, + .flags = 0, + .stage = stage, + .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT, + .codeSize = converted->spv.size() * sizeof(converted->spv[0]), + .pCode = converted->spv.data(), + .pName = "main", + .setLayoutCount = static_cast( + stage == VK_SHADER_STAGE_COMPUTE_BIT ? 1 + : Cache::kGraphicsStages.size()), + .pSetLayouts = (stage == VK_SHADER_STAGE_COMPUTE_BIT + ? 
&mParent->mComputeDescriptorSetLayout + : mParent->mGraphicsDescriptorSetLayouts.data())}; + + VkShaderEXT handle; + VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &createInfo, + vk::context->allocator, &handle)); + + auto result = std::make_shared(); + result->baseAddress = key.address; + result->tagId = getReadId(); + result->handle = handle; + result->info = std::move(converted->info); + readMemory(&result->magic, key.address, sizeof(result->magic)); + + for (auto entry : converted->info.memoryMap) { + auto address = entry.beginAddress; + auto size = entry.endAddress - entry.beginAddress; + auto &inserted = result->usedMemory.emplace_back(); + inserted.first = address; + inserted.second.resize(size); + readMemory(inserted.second.data(), address, size); + } + + mParent->mShaders.map(key.address, key.address + 8, result); + mAcquiredResources.push_back(result); + return {handle, &result->info, stage}; +} + +std::shared_ptr +Cache::Tag::findShader(const ShaderKey &key, const ShaderKey *dependedKey) { + auto data = RemoteMemory{mParent->mVmIm}.getPointer(key.address); + + auto cacheIt = mParent->mShaders.queryArea(key.address); + + if (cacheIt == mParent->mShaders.end() || + cacheIt->get()->baseAddress != key.address) { + return {}; + } + + std::uint64_t magic; + readMemory(&magic, key.address, sizeof(magic)); + + auto cachedShader = static_cast(cacheIt->get()); + if (cachedShader->magic != magic) { + return {}; + } + + for (auto [index, sgpr] : cachedShader->info.requiredSgprs) { + if (index >= key.env.userSgprs.size() || key.env.userSgprs[index] != sgpr) { + return {}; + } + } + + for (auto &usedMemory : cachedShader->usedMemory) { + if (compareMemory(usedMemory.second.data(), usedMemory.first, + usedMemory.second.size())) { + return {}; + } + } + + return cacheIt.get(); +} + +Cache::Sampler Cache::Tag::getSampler(const SamplerKey &key) { + auto [it, inserted] = getCache()->mSamplers.emplace(key, VK_NULL_HANDLE); + + if (inserted) { + VkSamplerCreateInfo info{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = key.magFilter, + .minFilter = key.minFilter, + .mipmapMode = key.mipmapMode, + .addressModeU = key.addressModeU, + .addressModeV = key.addressModeV, + .addressModeW = key.addressModeW, + .mipLodBias = key.mipLodBias, + .anisotropyEnable = key.anisotropyEnable, + .maxAnisotropy = key.maxAnisotropy, + .compareEnable = key.compareEnable, + .compareOp = key.compareOp, + .minLod = key.minLod, + .maxLod = key.maxLod, + .borderColor = key.borderColor, + .unnormalizedCoordinates = key.unnormalizedCoordinates, + }; + + VK_VERIFY(vkCreateSampler(vk::context->device, &info, + vk::context->allocator, &it->second)); + } + + return {it->second}; +} + +Cache::Buffer Cache::Tag::getBuffer(std::uint64_t address, std::uint64_t size, + Access access) { + auto buffer = vk::Buffer::Allocate( + vk::getHostVisibleMemory(), size, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT); + + if ((access & Access::Read) != Access::None) { + readMemory(buffer.getData(), address, size); + } + + auto cached = std::make_shared(); + cached->baseAddress = address; + cached->acquiredAccess = access; + cached->buffer = std::move(buffer); + cached->size = size; + cached->tagId = + (access & Access::Write) != Access::Write ? 
getWriteId() : getReadId(); + + mAcquiredResources.push_back(cached); + + return { + .handle = cached->buffer.getHandle(), + .offset = 0, + .deviceAddress = cached->buffer.getAddress(), + .tagId = getReadId(), + .data = cached->buffer.getData(), + }; +} + +Cache::Buffer Cache::Tag::getInternalBuffer(std::uint64_t size) { + auto buffer = vk::Buffer::Allocate( + vk::getHostVisibleMemory(), size, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_INDEX_BUFFER_BIT); + + auto cached = std::make_shared(); + cached->baseAddress = 0; + cached->acquiredAccess = Access::None; + cached->buffer = std::move(buffer); + cached->size = size; + cached->tagId = getReadId(); + + mAcquiredResources.push_back(cached); + + return { + .handle = cached->buffer.getHandle(), + .offset = 0, + .deviceAddress = cached->buffer.getAddress(), + .tagId = getReadId(), + .data = cached->buffer.getData(), + }; +} + +Cache::IndexBuffer Cache::Tag::getIndexBuffer(std::uint64_t address, + std::uint32_t indexCount, + gnm::PrimitiveType primType, + gnm::IndexType indexType) { + unsigned origIndexSize = indexType == gnm::IndexType::Int16 ? 2 : 4; + std::uint32_t size = indexCount * origIndexSize; + + if (address == 0) { + if (isPrimRequiresConversion(primType)) { + getPrimConverterFn(primType, &indexCount); + primType = gnm::PrimitiveType::TriList; + } + + return { + .handle = VK_NULL_HANDLE, + .offset = 0, + .indexCount = indexCount, + .primType = primType, + .indexType = indexType, + }; + } + + auto indexBuffer = getBuffer(address, size, Access::Read); + + if (!isPrimRequiresConversion(primType)) { + return { + .handle = indexBuffer.handle, + .offset = indexBuffer.offset, + .indexCount = indexCount, + .primType = primType, + .indexType = indexType, + }; + } + + auto it = mParent->mIndexBuffers.queryArea(address); + if (it != mParent->mIndexBuffers.end() && it.beginAddress() == address && + it.endAddress() == address + size) { + + auto &resource = it.get(); + auto indexBuffer = static_cast(resource.get()); + if (indexBuffer->size == size && resource->tagId == indexBuffer->tagId) { + mAcquiredResources.push_back(resource); + return { + .handle = indexBuffer->buffer.getHandle(), + .offset = 0, + .indexCount = indexCount, + .primType = indexBuffer->primType, + .indexType = indexBuffer->indexType, + }; + } + } + + auto converterFn = getPrimConverterFn(primType, &indexCount); + primType = gnm::PrimitiveType::TriList; + + if (indexCount >= 0x10000) { + indexType = gnm::IndexType::Int32; + } + + unsigned indexSize = indexType == gnm::IndexType::Int16 ? 2 : 4; + auto indexBufferSize = indexSize * indexCount; + + auto convertedIndexBuffer = vk::Buffer::Allocate( + vk::getHostVisibleMemory(), indexBufferSize, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT); + + void *data = convertedIndexBuffer.getData(); + + auto indicies = indexBuffer.data + indexBuffer.offset; + + if (indexSize == 2) { + for (std::uint32_t i = 0; i < indexCount; ++i) { + auto [dstIndex, srcIndex] = converterFn(i); + std::uint32_t origIndex = origIndexSize == 2 + ? 
((std::uint16_t *)indicies)[srcIndex] + : ((std::uint32_t *)indicies)[srcIndex]; + ((std::uint16_t *)data)[dstIndex] = origIndex; + } + + } else { + for (std::uint32_t i = 0; i < indexCount; ++i) { + auto [dstIndex, srcIndex] = converterFn(i); + std::uint32_t origIndex = origIndexSize == 2 + ? ((std::uint16_t *)indicies)[srcIndex] + : ((std::uint32_t *)indicies)[srcIndex]; + ((std::uint32_t *)data)[dstIndex] = origIndex; + } + } + + auto cached = std::make_shared(); + cached->baseAddress = address; + cached->acquiredAccess = Access::Read; + cached->buffer = std::move(convertedIndexBuffer); + cached->size = size; + cached->tagId = indexBuffer.tagId; + cached->primType = primType; + cached->indexType = indexType; + + mParent->mIndexBuffers.map(address, address + size, cached); + mAcquiredResources.push_back(cached); + + return { + .handle = cached->buffer.getHandle(), + .offset = 0, + .indexCount = indexCount, + .primType = cached->primType, + .indexType = cached->indexType, + }; +} + +Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { + auto surfaceInfo = computeSurfaceInfo( + key.tileMode, key.type, key.dfmt, key.offset.x + key.extent.width, + key.offset.y + key.extent.height, key.offset.z + key.extent.depth, + key.pitch, key.baseArrayLayer, key.arrayLayerCount, key.baseMipLevel, + key.mipCount, key.pow2pad); + + VkImageUsageFlags usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT // | VK_IMAGE_USAGE_STORAGE_BIT + ; + + bool isCompressed = + key.dfmt == gnm::kDataFormatBc1 || key.dfmt == gnm::kDataFormatBc2 || + key.dfmt == gnm::kDataFormatBc3 || key.dfmt == gnm::kDataFormatBc4 || + key.dfmt == gnm::kDataFormatBc5 || key.dfmt == gnm::kDataFormatBc6 || + key.dfmt == gnm::kDataFormatBc7 || key.dfmt == gnm::kDataFormatGB_GR || + key.dfmt == gnm::kDataFormatBG_RG; + + if (!isCompressed) { + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + } + + auto image = vk::Image::Allocate( + vk::getDeviceLocalMemory(), gnm::toVkImageType(key.type), key.extent, + key.mipCount, key.arrayLayerCount, gnm::toVkFormat(key.dfmt, key.nfmt), + VK_SAMPLE_COUNT_1_BIT, usage); + + if ((access & Access::Read) != Access::None) { + auto tiledBuffer = + getBuffer(key.address, surfaceInfo.totalSize, Access::Read); + + auto &tiler = mParent->mDevice->tiler; + auto detiledBuffer = + vk::Buffer::Allocate(vk::getDeviceLocalMemory(), surfaceInfo.totalSize, + VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR | + VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR); + VkImageSubresourceRange subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = key.baseMipLevel, + .levelCount = key.mipCount, + .baseArrayLayer = key.baseArrayLayer, + .layerCount = key.arrayLayerCount, + }; + + std::vector regions; + regions.reserve(key.mipCount); + + for (unsigned mipLevel = key.baseMipLevel; + mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) { + tiler.detile(*mScheduler, surfaceInfo, key.tileMode, + tiledBuffer.deviceAddress, detiledBuffer.getAddress(), + mipLevel, key.baseArrayLayer, key.arrayLayerCount); + regions.push_back({ + .bufferRowLength = + mipLevel > 0 ? 
0 : std::max(key.pitch >> mipLevel, 1u), + .imageSubresource = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = mipLevel, + .baseArrayLayer = key.baseArrayLayer, + .layerCount = key.arrayLayerCount, + }, + .imageExtent = + { + .width = std::max(key.extent.width >> mipLevel, 1u), + .height = std::max(key.extent.height >> mipLevel, 1u), + .depth = std::max(key.extent.depth >> mipLevel, 1u), + }, + }); + } + + transitionImageLayout( + mScheduler->getCommandBuffer(), image, VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, subresourceRange); + + vkCmdCopyBufferToImage(mScheduler->getCommandBuffer(), + detiledBuffer.getHandle(), image.getHandle(), + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, regions.size(), + regions.data()); + + transitionImageLayout(mScheduler->getCommandBuffer(), image, + VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, + subresourceRange); + + mScheduler->afterSubmit([detiledBuffer = std::move(detiledBuffer)] {}); + } + + auto cached = std::make_shared(); + cached->image = std::move(image); + cached->info = std::move(surfaceInfo); + cached->baseAddress = key.address; + cached->acquiredAccess = access; + mAcquiredResources.push_back(cached); + + return {.handle = cached->image.getHandle()}; +} + +Cache::ImageView Cache::Tag::getImageView(const ImageViewKey &key, + Access access) { + auto image = getImage(key, access); + auto result = vk::ImageView(gnm::toVkImageViewType(key.type), image.handle, + gnm::toVkFormat(key.dfmt, key.nfmt), + { + .r = gnm::toVkComponentSwizzle(key.R), + .g = gnm::toVkComponentSwizzle(key.G), + .b = gnm::toVkComponentSwizzle(key.B), + .a = gnm::toVkComponentSwizzle(key.A), + }, + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = key.baseMipLevel, + .levelCount = key.mipCount, + .baseArrayLayer = key.baseArrayLayer, + .layerCount = key.arrayLayerCount, + }); + auto cached = std::make_shared(); + cached->baseAddress = key.address; + cached->acquiredAccess = access; + cached->view = std::move(result); + + mAcquiredResources.push_back(cached); + + return { + .handle = cached->view.getHandle(), + .imageHandle = image.handle, + }; +} + +void Cache::Tag::readMemory(void *target, std::uint64_t address, + std::uint64_t size) { + mParent->flush(*mScheduler, address, size); + auto memoryPtr = RemoteMemory{mParent->mVmIm}.getPointer(address); + std::memcpy(target, memoryPtr, size); +} + +void Cache::Tag::writeMemory(const void *source, std::uint64_t address, + std::uint64_t size) { + mParent->flush(*mScheduler, address, size); + auto memoryPtr = RemoteMemory{mParent->mVmIm}.getPointer(address); + std::memcpy(memoryPtr, source, size); +} + +int Cache::Tag::compareMemory(const void *source, std::uint64_t address, + std::uint64_t size) { + mParent->flush(*mScheduler, address, size); + auto memoryPtr = RemoteMemory{mParent->mVmIm}.getPointer(address); + return std::memcmp(memoryPtr, source, size); +} + +void Cache::Tag::release() { + for (auto ds : mGraphicsDescriptorSets) { + getCache()->destroyGraphicsDescriptorSets(ds); + } + + for (auto ds : mComputeDescriptorSets) { + getCache()->destroyComputeDescriptorSet(ds); + } + + mGraphicsDescriptorSets.clear(); + mComputeDescriptorSets.clear(); + + if (mAcquiredResources.empty()) { + return; + } + + while (!mAcquiredResources.empty()) { + auto resource = std::move(mAcquiredResources.back()); + mAcquiredResources.pop_back(); + resource->flush(*this, *mScheduler, 0, ~static_cast(0)); + } + + mScheduler->submit(); + mScheduler->then([mAcquiredResources = 
std::move(mAcquiredResources)] {});
+}
+
+Cache::Tag Cache::createTag(Scheduler &scheduler) {
+  auto tag = Tag{this, scheduler, mNextTagId};
+  mNextTagId = static_cast<TagId>(static_cast<std::uint64_t>(mNextTagId) + 2);
+  return tag;
+}
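+
+// Tag ids come in read/write pairs: mNextTagId starts at 2 and advances by 2
+// per tag, so a tag created with write id N reports N - 1 as its read id
+// (see Tag::getReadId/getWriteId in Cache.hpp).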
+
+Cache::Cache(Device *device, int vmId) : mDevice(device), mVmIm(vmId) {
+  mMemoryTableBuffer =
+      vk::Buffer::Allocate(vk::getHostVisibleMemory(), 0x10000);
+  mGdsBuffer = vk::Buffer::Allocate(vk::getHostVisibleMemory(), 0x40000);
+
+  {
+    VkDescriptorSetLayoutBinding bindings[kGraphicsStages.size()]
+                                         [kDescriptorBindings.size()];
+
+    for (std::size_t index = 0; auto stage : kGraphicsStages) {
+      fillStageBindings(bindings[index], stage, index);
+      ++index;
+    }
+
+    for (std::size_t index = 0; auto &layout : mGraphicsDescriptorSetLayouts) {
+      VkDescriptorSetLayoutCreateInfo descLayoutInfo{
+          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+          .bindingCount = static_cast<std::uint32_t>(
+              index == 0 ? kDescriptorBindings.size() : 1),
+          .pBindings = bindings[index],
+      };
+
+      ++index;
+
+      VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device,
+                                            &descLayoutInfo,
+                                            vk::context->allocator, &layout));
+    }
+  }
+
+  {
+    VkDescriptorSetLayoutBinding bindings[kDescriptorBindings.size()];
+
+    fillStageBindings(bindings, VK_SHADER_STAGE_COMPUTE_BIT, 0);
+
+    VkDescriptorSetLayoutCreateInfo layoutInfo{
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+        .bindingCount = kDescriptorBindings.size(),
+        .pBindings = bindings,
+    };
+
+    VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
+                                          vk::context->allocator,
+                                          &mComputeDescriptorSetLayout));
+  }
+
+  {
+    VkPipelineLayoutCreateInfo pipelineLayoutInfo{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .setLayoutCount =
+            static_cast<std::uint32_t>(mGraphicsDescriptorSetLayouts.size()),
+        .pSetLayouts = mGraphicsDescriptorSetLayouts.data(),
+    };
+
+    VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &pipelineLayoutInfo,
+                                     vk::context->allocator,
+                                     &mGraphicsPipelineLayout));
+  }
+
+  {
+    VkPipelineLayoutCreateInfo pipelineLayoutInfo{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .setLayoutCount = 1,
+        .pSetLayouts = &mComputeDescriptorSetLayout,
+    };
+
+    VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &pipelineLayoutInfo,
+                                     vk::context->allocator,
+                                     &mComputePipelineLayout));
+  }
+}
+Cache::~Cache() {}
+
+void Cache::addFrameBuffer(Scheduler &scheduler, int index,
+                           std::uint64_t address, std::uint32_t width,
+                           std::uint32_t height, int format,
+                           TileMode tileMode) {}
+
+void Cache::removeFrameBuffer(Scheduler &scheduler, int index) {}
+
+VkImage Cache::getFrameBuffer(Scheduler &scheduler, int index) { return {}; }
+
+static void
+flushCacheImpl(Scheduler &scheduler, Cache::Tag &tag,
+               rx::MemoryTableWithPayload<std::shared_ptr<Cache::Entry>> &table,
+               std::uint64_t beginAddress, std::uint64_t endAddress) {
+  auto beginIt = table.lowerBound(beginAddress);
+  auto endIt = table.lowerBound(endAddress);
+
+  while (beginIt != endIt) {
+    auto cached = beginIt->get();
+    cached->flush(tag, scheduler, beginAddress, endAddress);
+    ++beginIt;
+  }
+}
+
+static void invalidateCacheImpl(
+    Scheduler &scheduler,
+    rx::MemoryTableWithPayload<std::shared_ptr<Cache::Entry>> &table,
+    std::uint64_t beginAddress, std::uint64_t endAddress) {
+  table.unmap(beginAddress, endAddress);
+}
+
+void Cache::invalidate(Scheduler &scheduler, std::uint64_t address,
+                       std::uint64_t size) {
+  auto beginAddress = address;
+  auto endAddress = address + size;
+
+  rx::dieIf(beginAddress >= endAddress,
+            "wrong flush range: address %lx, size %lx", address, size);
+
+  invalidateCacheImpl(scheduler, mBuffers, beginAddress, endAddress);
+  invalidateCacheImpl(scheduler, mImages, beginAddress, endAddress);
+
+  invalidateCacheImpl(scheduler, mSyncTable, beginAddress, endAddress);
+}
+
+void Cache::flush(Scheduler &scheduler, std::uint64_t address,
+                  std::uint64_t size) {
+  auto beginAddress = address;
+  auto endAddress = address + size;
+
+  rx::dieIf(beginAddress >= endAddress,
+            "wrong flush range: address %lx, size %lx", address, size);
+
+  auto tag = createTag(scheduler);
+  flushCacheImpl(scheduler, tag, mBuffers, beginAddress, endAddress);
+  flushCacheImpl(scheduler, tag, mIndexBuffers, beginAddress, endAddress);
+  flushCacheImpl(scheduler, tag, mImages, beginAddress, endAddress);
+  // flushCacheImpl(scheduler, tag, mShaders, beginAddress, endAddress);
+
+  flushCacheImpl(scheduler, tag, mSyncTable, beginAddress, endAddress);
+}
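+
+// Descriptor sets are recycled rather than freed: the create* functions below
+// first pop from free lists guarded by mDescriptorMtx and only fall back to
+// pool allocation when the lists are empty; Tag::release() returns acquired
+// sets via destroyGraphicsDescriptorSets/destroyComputeDescriptorSet.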
+
+std::array<VkDescriptorSet, Cache::kGraphicsStages.size()>
+Cache::createGraphicsDescriptorSets() {
+  std::lock_guard lock(mDescriptorMtx);
+
+  if (!mGraphicsDescriptorSets.empty()) {
+    auto result = mGraphicsDescriptorSets.back();
+    mGraphicsDescriptorSets.pop_back();
+    return result;
+  }
+
+  if (mGraphicsDescriptorPool == nullptr) {
+    VkDescriptorPoolSize poolSizes[]{
+        {
+            .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .descriptorCount = 1,
+        },
+        {
+            .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .descriptorCount = 16 * 3,
+        },
+        {
+            .type = VK_DESCRIPTOR_TYPE_SAMPLER,
+            .descriptorCount = 16,
+        },
+        {
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .descriptorCount = 16,
+        },
+    };
+
+    VkDescriptorPoolCreateInfo info{
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+        .maxSets = Cache::kGraphicsStages.size() * 100,
+        .poolSizeCount = static_cast<std::uint32_t>(std::size(poolSizes)),
+        .pPoolSizes = poolSizes,
+    };
+
+    VK_VERIFY(vkCreateDescriptorPool(vk::context->device, &info,
+                                     vk::context->allocator,
+                                     &mGraphicsDescriptorPool));
+  }
+
+  VkDescriptorSetAllocateInfo info{
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+      .descriptorPool = mGraphicsDescriptorPool,
+      .descriptorSetCount =
+          static_cast<std::uint32_t>(mGraphicsDescriptorSetLayouts.size()),
+      .pSetLayouts = mGraphicsDescriptorSetLayouts.data(),
+  };
+
+  std::array<VkDescriptorSet, Cache::kGraphicsStages.size()> result;
+  VK_VERIFY(
+      vkAllocateDescriptorSets(vk::context->device, &info, result.data()));
+  return result;
+}
+
+VkDescriptorSet Cache::createComputeDescriptorSet() {
+  std::lock_guard lock(mDescriptorMtx);
+
+  if (!mComputeDescriptorSets.empty()) {
+    auto result = mComputeDescriptorSets.back();
+    mComputeDescriptorSets.pop_back();
+    return result;
+  }
+
+  if (mComputeDescriptorPool == nullptr) {
+    VkDescriptorPoolSize poolSizes[]{
+        {
+            .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+            .descriptorCount = 1,
+        },
+        {
+            .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+            .descriptorCount = 16 * 3,
+        },
+        {
+            .type = VK_DESCRIPTOR_TYPE_SAMPLER,
+            .descriptorCount = 16,
+        },
+        {
+            .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .descriptorCount = 16,
+        },
+    };
+
+    VkDescriptorPoolCreateInfo info{
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+        .maxSets = 8,
+        .poolSizeCount = static_cast<std::uint32_t>(std::size(poolSizes)),
+        .pPoolSizes = poolSizes,
+    };
+
+    VK_VERIFY(vkCreateDescriptorPool(vk::context->device, &info,
+                                     vk::context->allocator,
+                                     &mComputeDescriptorPool));
+  }
+
+  VkDescriptorSetAllocateInfo info{
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+      .descriptorPool = mComputeDescriptorPool,
+      .descriptorSetCount = 1,
+      .pSetLayouts = &mComputeDescriptorSetLayout,
+  };
+
+  VkDescriptorSet result;
+  VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info, &result));
+  return result;
+}
diff --git a/rpcsx-gpu2/Cache.hpp b/rpcsx-gpu2/Cache.hpp
new file mode 100644
index 00000000..4ae2c46e
--- /dev/null
+++ b/rpcsx-gpu2/Cache.hpp
@@ -0,0 +1,333 @@
+#pragma once
+
+#include "Pipe.hpp"
+#include "amdgpu/tiler.hpp"
+#include "gnm/constants.hpp"
+#include "rx/die.hpp"
+#include "shader/Access.hpp"
+#include "shader/GcnConverter.hpp"
+#include <algorithm>
+#include <array>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <vulkan/vulkan.h>
+
+namespace amdgpu {
+using Access = shader::Access;
+
+struct ShaderKey {
+  std::uint64_t address;
+  shader::gcn::Stage stage;
+  shader::gcn::Environment env;
+};
+
+struct ImageKey {
+  std::uint64_t address;
+  gnm::TextureType type;
+  gnm::DataFormat dfmt;
+  gnm::NumericFormat nfmt;
+  TileMode tileMode = {};
+  VkOffset3D offset = {};
+  VkExtent3D extent = {1, 1, 1};
+  std::uint32_t pitch = 1;
+  unsigned baseMipLevel = 0;
+  unsigned mipCount = 1;
+  unsigned baseArrayLayer = 0;
+  unsigned arrayLayerCount = 1;
+  bool pow2pad = false;
+
+  static ImageKey createFrom(const gnm::TBuffer &tbuffer);
+};
+
+struct ImageViewKey : ImageKey {
+  gnm::Swizzle R = gnm::Swizzle::R;
+  gnm::Swizzle G = gnm::Swizzle::G;
+  gnm::Swizzle B = gnm::Swizzle::B;
+  gnm::Swizzle A = gnm::Swizzle::A;
+
+  static ImageViewKey createFrom(const gnm::TBuffer &tbuffer);
+};
+
+struct SamplerKey {
+  VkFilter magFilter;
+  VkFilter minFilter;
+  VkSamplerMipmapMode mipmapMode;
+  VkSamplerAddressMode addressModeU;
+  VkSamplerAddressMode addressModeV;
+  VkSamplerAddressMode addressModeW;
+  float mipLodBias;
+  float maxAnisotropy;
+  VkCompareOp compareOp;
+  float minLod;
+  float maxLod;
+  VkBorderColor borderColor;
+  bool anisotropyEnable;
+  bool compareEnable;
+  bool unnormalizedCoordinates;
+
+  static SamplerKey createFrom(const gnm::SSampler &sampler);
+
+  auto operator<=>(const SamplerKey &other) const = default;
+};
+
+struct Cache {
+  static constexpr std::array kGraphicsStages = {
+      VK_SHADER_STAGE_VERTEX_BIT,
+      VK_SHADER_STAGE_GEOMETRY_BIT,
+      VK_SHADER_STAGE_FRAGMENT_BIT,
+      VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT,
+      VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
+  };
+
+  static constexpr std::array kDescriptorBindings = {
+      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+      VK_DESCRIPTOR_TYPE_SAMPLER,
+      VkDescriptorType(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + 1 * 1000),
+      VkDescriptorType(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + 2 * 1000),
+      VkDescriptorType(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + 3 * 1000),
+      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+  };
+
+  static constexpr int getStageIndex(VkShaderStageFlagBits stage) {
+    auto it = std::find(kGraphicsStages.begin(), kGraphicsStages.end(), stage);
+
+    if (it == kGraphicsStages.end()) {
+      return -1;
+    }
+
+    return it - kGraphicsStages.begin();
+  }
+
+  static constexpr int getDescriptorBinding(VkDescriptorType type,
+                                            int dim = 0) {
+    auto it = std::find(kDescriptorBindings.begin(), kDescriptorBindings.end(),
+                        type + dim * 1000);
+
+    if (it == kDescriptorBindings.end()) {
+      return -1;
+    }
+
+    return it - kDescriptorBindings.begin();
+  }
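+
+  // The + dim * 1000 trick folds the image dimensionality into the lookup
+  // key, so e.g. getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 2)
+  // matches the dim-2 entry above and yields binding 3, while
+  // getDescriptorBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) yields 0.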
+
+  enum class TagId : std::uint64_t {};
+  struct Entry;
+
+  int vmId = -1;
+
+  struct Shader {
+    VkShaderEXT handle;
+    shader::gcn::ShaderInfo *info;
+    VkShaderStageFlagBits stage;
+  };
+
+  struct Sampler {
+    VkSampler handle;
+  };
+
+  struct Buffer {
+    VkBuffer handle;
+    std::uint64_t offset;
+    std::uint64_t deviceAddress;
+    TagId tagId;
+    std::byte *data;
+  };
+
+  struct IndexBuffer {
+    VkBuffer handle;
+    std::uint64_t offset;
+    std::uint32_t indexCount;
+    gnm::PrimitiveType primType;
+    gnm::IndexType indexType;
+  };
+
+  struct Image {
+    VkImage handle;
+  };
+
+  struct ImageView {
+    VkImageView handle;
+    VkImage imageHandle;
+  };
+
+  class Tag {
+    Cache *mParent = nullptr;
+    Scheduler *mScheduler = nullptr;
+    TagId mTagId{};
+
+    std::vector<std::shared_ptr<Entry>> mAcquiredResources;
+    std::vector<std::array<VkDescriptorSet, kGraphicsStages.size()>>
+        mGraphicsDescriptorSets;
+
+    std::vector<VkDescriptorSet> mComputeDescriptorSets;
+
+  public:
+    Tag() = default;
+    Tag(Cache *parent, Scheduler &scheduler, TagId id)
+        : mParent(parent), mScheduler(&scheduler), mTagId(id) {}
+    Tag(const Tag &) = delete;
+    Tag(Tag &&other) { other.swap(*this); }
+    Tag &operator=(Tag &&other) {
+      other.swap(*this);
+      return *this;
+    }
+
+    void submitAndWait() {
+      mScheduler->submit();
+      mScheduler->wait();
+    }
+
+    ~Tag() { release(); }
+
+    TagId getReadId() const { return TagId{std::uint64_t(mTagId) - 1}; }
+    TagId getWriteId() const { return mTagId; }
+
+    void swap(Tag &other) {
+      std::swap(mParent, other.mParent);
+      std::swap(mScheduler, other.mScheduler);
+      std::swap(mTagId, other.mTagId);
+      std::swap(mAcquiredResources, other.mAcquiredResources);
+      std::swap(mGraphicsDescriptorSets, other.mGraphicsDescriptorSets);
+      std::swap(mComputeDescriptorSets, other.mComputeDescriptorSets);
+    }
+
+    Cache *getCache() const { return mParent; }
+    Device *getDevice() const { return mParent->mDevice; }
+    int getVmId() const { return mParent->mVmIm; }
+
+    Shader getShader(const ShaderKey &key,
+                     const ShaderKey *dependedKey = nullptr);
+    Sampler getSampler(const SamplerKey &key);
+    Buffer getBuffer(std::uint64_t address, std::uint64_t size, Access access);
+    Buffer getInternalBuffer(std::uint64_t size);
+    IndexBuffer getIndexBuffer(std::uint64_t address, std::uint32_t indexCount,
+                               gnm::PrimitiveType primType,
+                               gnm::IndexType indexType);
+    Image getImage(const ImageKey &key, Access access);
+    ImageView getImageView(const ImageViewKey &key, Access access);
+    void readMemory(void *target, std::uint64_t address, std::uint64_t size);
+    void writeMemory(const void *source, std::uint64_t address,
+                     std::uint64_t size);
+    int compareMemory(const void *source, std::uint64_t address,
+                      std::uint64_t size);
+    void release();
+
+    VkPipelineLayout getGraphicsPipelineLayout() const {
+      return getCache()->getGraphicsPipelineLayout();
+    }
+
+    VkPipelineLayout getComputePipelineLayout() const {
+      return getCache()->getComputePipelineLayout();
+    }
+
+    std::array<VkDescriptorSet, kGraphicsStages.size()>
+    createGraphicsDescriptorSets() {
+      auto result = getCache()->createGraphicsDescriptorSets();
+      mGraphicsDescriptorSets.push_back(result);
+      return result;
+    }
+
+    VkDescriptorSet createComputeDescriptorSet() {
+      auto result = getCache()->createComputeDescriptorSet();
+      mComputeDescriptorSets.push_back(result);
+      return result;
+    }
+
+    std::shared_ptr<Entry> findShader(const ShaderKey &key,
+                                      const ShaderKey *dependedKey = nullptr);
+  };
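+
+  // Rough usage sketch (caller names illustrative): a renderer creates a Tag
+  // per batch via Cache::createTag, pulls resources through tag.getBuffer /
+  // tag.getImage / tag.getShader, and on destruction the Tag flushes
+  // write-accessed resources back to guest memory and returns its descriptor
+  // sets to the cache.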
+
+  Cache(Device *device, int vmId);
+  ~Cache();
+  Tag createTag(Scheduler &scheduler);
+
+  vk::Buffer &getMemoryTableBuffer() { return mMemoryTableBuffer; }
+  vk::Buffer &getGdsBuffer() { return mGdsBuffer; }
+
+  void addFrameBuffer(Scheduler &scheduler, int index, std::uint64_t address,
+                      std::uint32_t width, std::uint32_t height, int format,
+                      TileMode tileMode);
+  void removeFrameBuffer(Scheduler &scheduler, int index);
+  VkImage getFrameBuffer(Scheduler &scheduler, int index);
+  void invalidate(Scheduler &scheduler, std::uint64_t address,
+                  std::uint64_t size);
+
+  void invalidate(Scheduler &scheduler) {
+    invalidate(scheduler, 0, ~static_cast<std::uint64_t>(0));
+  }
+
+  void flush(Scheduler &scheduler, std::uint64_t address, std::uint64_t size);
+  void flush(Scheduler &scheduler) {
+    flush(scheduler, 0, ~static_cast<std::uint64_t>(0));
+  }
+
+  const std::array<VkDescriptorSetLayout, kGraphicsStages.size()> &
+  getGraphicsDescriptorSetLayouts() const {
+    return mGraphicsDescriptorSetLayouts;
+  }
+
+  VkDescriptorSetLayout
+  getGraphicsDescriptorSetLayout(VkShaderStageFlagBits stage) const {
+    int index = getStageIndex(stage);
+    rx::dieIf(index < 0, "getGraphicsDescriptorSetLayout: unexpected stage");
+    return mGraphicsDescriptorSetLayouts[index];
+  }
+
+  VkDescriptorSetLayout getComputeDescriptorSetLayout() const {
+    return mComputeDescriptorSetLayout;
+  }
+  VkPipelineLayout getGraphicsPipelineLayout() const {
+    return mGraphicsPipelineLayout;
+  }
+
+  VkPipelineLayout getComputePipelineLayout() const {
+    return mComputePipelineLayout;
+  }
+
+  std::array<VkDescriptorSet, kGraphicsStages.size()>
+  createGraphicsDescriptorSets();
+  VkDescriptorSet createComputeDescriptorSet();
+
+  void destroyGraphicsDescriptorSets(
+      const std::array<VkDescriptorSet, kGraphicsStages.size()> &set) {
+    std::lock_guard lock(mDescriptorMtx);
+    mGraphicsDescriptorSets.push_back(set);
+  }
+
+  void destroyComputeDescriptorSet(VkDescriptorSet set) {
+    std::lock_guard lock(mDescriptorMtx);
+    mComputeDescriptorSets.push_back(set);
+  }
+
+private:
+  TagId getSyncTag(std::uint64_t address, std::uint64_t size,
+                   TagId currentTag);
+
+  Device *mDevice;
+  int mVmIm;
+  TagId mNextTagId{2};
+  vk::Buffer mMemoryTableBuffer;
+  vk::Buffer mGdsBuffer;
+
+  std::mutex mDescriptorMtx;
+  std::array<VkDescriptorSetLayout, kGraphicsStages.size()>
+      mGraphicsDescriptorSetLayouts{};
+  VkDescriptorSetLayout mComputeDescriptorSetLayout{};
+  VkPipelineLayout mGraphicsPipelineLayout{};
+  VkPipelineLayout mComputePipelineLayout{};
+  VkDescriptorPool mGraphicsDescriptorPool{};
+  VkDescriptorPool mComputeDescriptorPool{};
+  std::vector<std::array<VkDescriptorSet, kGraphicsStages.size()>>
+      mGraphicsDescriptorSets;
+  std::vector<VkDescriptorSet> mComputeDescriptorSets;
+  std::map<SamplerKey, VkSampler> mSamplers;
+
+  std::shared_ptr<Entry> mFrameBuffers[10];
+
+  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mBuffers;
+  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mIndexBuffers;
+  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mImages;
+  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mShaders;
+
+  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mSyncTable;
+};
+} // namespace amdgpu
diff --git a/rpcsx-gpu2/Device.cpp b/rpcsx-gpu2/Device.cpp
new file mode 100644
index 00000000..d82030a2
--- /dev/null
+++ b/rpcsx-gpu2/Device.cpp
@@ -0,0 +1,508 @@
+#include "Device.hpp"
+#include "Renderer.hpp"
+#include "amdgpu/tiler.hpp"
+#include "gnm/constants.hpp"
+#include "gnm/pm4.hpp"
+#include "rx/bits.hpp"
+#include "rx/die.hpp"
+#include "rx/mem.hpp"
+#include "shader/spv.hpp"
+#include "shaders/rdna-semantic-spirv.hpp"
+#include "vk.hpp"
+#include <fcntl.h>
+#include <sys/mman.h>
+
+using namespace amdgpu;
+
+Device::Device() {
+  if (!shader::spv::validate(g_rdna_semantic_spirv)) {
+    shader::spv::dump(g_rdna_semantic_spirv, true);
+    rx::die("builtin semantic validation failed");
+  }
+
+  if (auto sem = shader::spv::deserialize(
+          shaderSemanticContext, g_rdna_semantic_spirv,
+          shaderSemanticContext.getUnknownLocation())) {
+    auto shaderSemantic = *sem;
+    shader::gcn::canonicalizeSemantic(shaderSemanticContext, shaderSemantic);
+    shader::gcn::collectSemanticModuleInfo(gcnSemanticModuleInfo,
+                                           shaderSemantic);
+    gcnSemantic = shader::gcn::collectSemanticInfo(gcnSemanticModuleInfo);
+  } else {
+    rx::die("failed to deserialize builtin semantics\n");
+  }
+
+  for (int index = 0; auto &cache : caches) {
+    cache.vmId = index++;
+  }
+
+  for (auto &pipe : graphicsPipes) {
+    pipe.device = this;
+  }
+
+  // for (auto &pipe : computePipes) {
+  //   pipe.device = this;
+  // }
+}
+
+Device::~Device() {
+  for (auto fd : dmemFd) {
+    if (fd >= 0) {
+      ::close(fd);
+    }
+  }
+
+  for (auto &[pid, info] : processInfo) {
+    if (info.vmFd >= 0) {
+      ::close(info.vmFd);
+    }
+  }
+}
+
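+// Address translation sketch: every mapped process owns a 1 TiB (1 << 40)
+// slice of the host address space selected by its vmId, and
+// RemoteMemory::getPointer resolves a guest address A for vmId V as
+// (V << 40) | A. mapProcess below mmaps the guest's shared memory (or a dmem
+// fd) into that slice with MAP_FIXED, so translation is a pure bit-or.
+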
+void Device::mapProcess(std::int64_t pid, int vmId, const char *shmName) {
+  auto &process = processInfo[pid];
+  process.vmId = vmId;
+
+  auto memory = amdgpu::RemoteMemory{vmId};
+
+  std::string pidVmName = shmName;
+  pidVmName += '-';
+  pidVmName += std::to_string(pid);
+  int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
+  process.vmFd = memoryFd;
+
+  if (memoryFd < 0) {
+    std::printf("failed to open shared memory of process %x\n", (int)pid);
+    std::abort();
+  }
+
+  for (auto [startAddress, endAddress, slot] : process.vmTable) {
+    auto gpuProt = slot.prot >> 4;
+    if (gpuProt == 0) {
+      continue;
+    }
+
+    auto devOffset = slot.offset + startAddress - slot.baseAddress;
+    int mapFd = memoryFd;
+
+    if (slot.memoryType >= 0) {
+      mapFd = dmemFd[slot.memoryType];
+    }
+
+    auto mmapResult =
+        ::mmap(memory.getPointer<void>(startAddress),
+               endAddress - startAddress, gpuProt, MAP_FIXED | MAP_SHARED,
+               mapFd, devOffset);
+
+    if (mmapResult == MAP_FAILED) {
+      std::printf(
+          "failed to map process %x memory, address %lx-%lx, type %x\n",
+          (int)pid, startAddress, endAddress, slot.memoryType);
+      std::abort();
+    }
+
+    handleProtectChange(vmId, startAddress, endAddress - startAddress,
+                        slot.prot);
+  }
+}
+
+void Device::unmapProcess(std::int64_t pid) {
+  auto &process = processInfo[pid];
+  auto startAddress = static_cast<std::uint64_t>(process.vmId) << 40;
+  auto size = static_cast<std::uint64_t>(1) << 40;
+  rx::mem::reserve(reinterpret_cast<void *>(startAddress), size);
+
+  ::close(process.vmFd);
+  process.vmFd = -1;
+  process.vmId = -1;
+}
+
+void Device::protectMemory(int pid, std::uint64_t address, std::uint64_t size,
+                           int prot) {
+  auto &process = processInfo[pid];
+
+  auto vmSlotIt = process.vmTable.queryArea(address);
+  if (vmSlotIt == process.vmTable.end()) {
+    std::abort();
+  }
+
+  auto vmSlot = (*vmSlotIt).payload;
+
+  process.vmTable.map(address, address + size,
+                      VmMapSlot{
+                          .memoryType = vmSlot.memoryType,
+                          .prot = static_cast<int>(prot),
+                          .offset = vmSlot.offset,
+                          .baseAddress = vmSlot.baseAddress,
+                      });
+
+  if (process.vmId >= 0) {
+    auto memory = amdgpu::RemoteMemory{process.vmId};
+    rx::mem::protect(memory.getPointer<void>(address), size, prot >> 4);
+    handleProtectChange(process.vmId, address, size, prot);
+  }
+}
+
+void Device::onCommandBuffer(std::int64_t pid, int cmdHeader,
+                             std::uint64_t address, std::uint64_t size) {
+  auto &process = processInfo[pid];
+  if (process.vmId < 0) {
+    return;
+  }
+
+  auto memory = RemoteMemory{process.vmId};
+
+  auto op = rx::getBits(cmdHeader, 15, 8);
+
+  if (op == gnm::IT_INDIRECT_BUFFER_CNST) {
+    graphicsPipes[0].setCeQueue(Queue::createFromRange(
+        process.vmId, memory.getPointer<std::uint32_t>(address),
+        size / sizeof(std::uint32_t)));
+  } else if (op == gnm::IT_INDIRECT_BUFFER) {
+    graphicsPipes[0].setDeQueue(
+        Queue::createFromRange(process.vmId,
+                               memory.getPointer<std::uint32_t>(address),
+                               size / sizeof(std::uint32_t)),
+        1);
+  } else {
+    rx::die("unimplemented command buffer %x", cmdHeader);
+  }
+}
+
+bool Device::processPipes() {
+  bool allProcessed = true;
+
+  // for (auto &pipe : computePipes) {
+  //   if (!pipe.processAllRings()) {
+  //     allProcessed = false;
+  //   }
+  // }
+
+  for (auto &pipe : graphicsPipes) {
+    if (!pipe.processAllRings()) {
+      allProcessed = false;
+    }
+  }
+
+  return allProcessed;
+}
+
+static void
+transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
+                      VkImageLayout oldLayout, VkImageLayout newLayout,
+                      const VkImageSubresourceRange &subresourceRange) {
+  VkImageMemoryBarrier barrier{};
+  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  barrier.oldLayout =
oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + barrier.subresourceRange = subresourceRange; + + auto layoutToStageAccess = [](VkImageLayout layout) + -> std::pair { + switch (layout) { + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: + case VK_IMAGE_LAYOUT_GENERAL: + return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0}; + + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT}; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT}; + + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT}; + + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT}; + + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT}; + + default: + std::abort(); + } + }; + + auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout); + auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout); + + barrier.srcAccessMask = sourceAccess; + barrier.dstAccessMask = destinationAccess; + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, + nullptr, 0, nullptr, 1, &barrier); +} + +bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg, + VkCommandBuffer commandBuffer, VkImage swapchainImage, + VkImageView swapchainImageView, VkFence fence) { + auto &pipe = graphicsPipes[0]; + auto &scheduler = pipe.scheduler; + auto &process = processInfo[pid]; + if (process.vmId < 0) { + return false; + } + + auto &buffer = process.buffers[bufferIndex]; + auto &bufferAttr = process.bufferAttributes[buffer.attrId]; + + gnm::DataFormat dfmt; + gnm::NumericFormat nfmt; + CbCompSwap compSwap; + switch (bufferAttr.pixelFormat) { + case 0x80000000: + // bgra + dfmt = gnm::kDataFormat8_8_8_8; + nfmt = gnm::kNumericFormatSNormNoZero; + compSwap = CbCompSwap::Alt; + break; + + case 0x80002200: + // rgba + dfmt = gnm::kDataFormat8_8_8_8; + nfmt = gnm::kNumericFormatSNormNoZero; + compSwap = CbCompSwap::Std; + break; + + case 0x88060000: + // bgra + dfmt = gnm::kDataFormat2_10_10_10; + nfmt = gnm::kNumericFormatSNormNoZero; + compSwap = CbCompSwap::Alt; + break; + + default: + rx::die("unimplemented color buffer format %x", bufferAttr.pixelFormat); + } + + // std::printf("displaying buffer %lx\n", buffer.address); + VkCommandBufferBeginInfo beginInfo{}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(commandBuffer, &beginInfo); + + auto cacheTag = getCacheTag(process.vmId, scheduler); + + if (true) { + transitionImageLayout(commandBuffer, swapchainImage, + VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }); + + amdgpu::flip(cacheTag, commandBuffer, vk::context->swapchainExtent, + buffer.address, swapchainImageView, + {bufferAttr.width, bufferAttr.height}, compSwap, + getDefaultTileModes()[13], dfmt, nfmt); + + transitionImageLayout(commandBuffer, 
swapchainImage, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }); + } else { + ImageKey frameKey{ + .address = buffer.address, + .type = gnm::TextureType::Dim2D, + .dfmt = dfmt, + .nfmt = nfmt, + .tileMode = getDefaultTileModes()[13], + .extent = + { + .width = bufferAttr.width, + .height = bufferAttr.height, + .depth = 1, + }, + .pitch = bufferAttr.width, + .mipCount = 1, + .arrayLayerCount = 1, + }; + + auto image = cacheTag.getImage(frameKey, Access::Read); + + scheduler.submit(); + scheduler.wait(); + + transitionImageLayout(commandBuffer, swapchainImage, + VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }); + + VkImageBlit region{ + .srcSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1}, + .srcOffsets = {{}, + {static_cast(bufferAttr.width), + static_cast(bufferAttr.height), 1}}, + .dstSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1}, + .dstOffsets = + {{}, + {static_cast(vk::context->swapchainExtent.width), + static_cast(vk::context->swapchainExtent.height), 1}}, + }; + + vkCmdBlitImage(commandBuffer, image.handle, VK_IMAGE_LAYOUT_GENERAL, + swapchainImage, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, + ®ion, VK_FILTER_LINEAR); + + transitionImageLayout(commandBuffer, swapchainImage, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, + }); + } + + auto submitCompleteTask = scheduler.createExternalSubmit(); + + { + vkEndCommandBuffer(commandBuffer); + + VkSemaphoreSubmitInfo signalSemSubmitInfos[] = { + { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = vk::context->renderCompleteSemaphore, + .value = 1, + .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, + }, + { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = scheduler.getSemaphoreHandle(), + .value = submitCompleteTask, + .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, + }, + }; + + VkSemaphoreSubmitInfo waitSemSubmitInfos[] = { + { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = vk::context->presentCompleteSemaphore, + .value = 1, + .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + }, + { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, + .semaphore = scheduler.getSemaphoreHandle(), + .value = submitCompleteTask - 1, + .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + }, + }; + + VkCommandBufferSubmitInfo cmdBufferSubmitInfo{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = commandBuffer, + }; + + VkSubmitInfo2 submitInfo{ + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, + .waitSemaphoreInfoCount = 1, + .pWaitSemaphoreInfos = waitSemSubmitInfos, + .commandBufferInfoCount = 1, + .pCommandBufferInfos = &cmdBufferSubmitInfo, + .signalSemaphoreInfoCount = 2, + .pSignalSemaphoreInfos = signalSemSubmitInfos, + }; + + vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence); + // vkQueueWaitIdle(queue); + } + + scheduler.then([=, this, cacheTag = std::move(cacheTag)] { + bridge->flipBuffer[process.vmId] = bufferIndex; + bridge->flipArg[process.vmId] = arg; + bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1; + + auto mem = RemoteMemory{process.vmId}; + auto 
+
+  scheduler.then([=, this, cacheTag = std::move(cacheTag)] {
+    bridge->flipBuffer[process.vmId] = bufferIndex;
+    bridge->flipArg[process.vmId] = arg;
+    bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1;
+
+    auto mem = RemoteMemory{process.vmId};
+    auto bufferInUse = mem.getPointer<std::uint64_t>(
+        bridge->bufferInUseAddress[process.vmId]);
+    if (bufferInUse != nullptr) {
+      bufferInUse[bufferIndex] = 0;
+    }
+  });
+
+  return true;
+}
+
+void Device::mapMemory(std::int64_t pid, std::uint64_t address,
+                       std::uint64_t size, int memoryType, int dmemIndex,
+                       int prot, std::int64_t offset) {
+  auto &process = processInfo[pid];
+
+  process.vmTable.map(address, address + size,
+                      VmMapSlot{
+                          .memoryType = memoryType >= 0 ? dmemIndex : -1,
+                          .prot = prot,
+                          .offset = offset,
+                          .baseAddress = address,
+                      });
+
+  if (process.vmId < 0) {
+    return;
+  }
+
+  auto memory = amdgpu::RemoteMemory{process.vmId};
+
+  int mapFd = process.vmFd;
+
+  if (memoryType >= 0) {
+    mapFd = dmemFd[dmemIndex];
+  }
+
+  auto mmapResult = ::mmap(memory.getPointer(address), size, prot >> 4,
+                           MAP_FIXED | MAP_SHARED, mapFd, offset);
+
+  if (mmapResult == MAP_FAILED) {
+    rx::die("failed to map process %x memory, address %lx-%lx, type %x",
+            (int)pid, address, address + size, memoryType);
+  }
+
+  handleProtectChange(process.vmId, address, size, prot);
+}
+
+void Device::registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer) {
+  auto &process = processInfo[pid];
+
+  if (buffer.attrId >= 10 || buffer.index >= 10) {
+    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
+  }
+
+  process.buffers[buffer.index] = buffer;
+}
+
+void Device::registerBufferAttribute(std::int64_t pid,
+                                     bridge::CmdBufferAttribute attr) {
+  auto &process = processInfo[pid];
+  if (attr.attrId >= 10) {
+    rx::die("out of buffer attributes %u", attr.attrId);
+  }
+
+  process.bufferAttributes[attr.attrId] = attr;
+}
+
+void Device::handleProtectChange(int vmId, std::uint64_t address,
+                                 std::uint64_t size, int prot) {}
diff --git a/rpcsx-gpu2/Device.hpp b/rpcsx-gpu2/Device.hpp
new file mode 100644
index 00000000..b4f9db78
--- /dev/null
+++ b/rpcsx-gpu2/Device.hpp
@@ -0,0 +1,91 @@
+#pragma once
+#include "Cache.hpp"
+#include "Pipe.hpp"
+#include "amdgpu/bridge/bridge.hpp"
+#include "amdgpu/tiler_vulkan.hpp"
+#include "gnm/descriptors.hpp"
+#include "rx/MemoryTable.hpp"
+#include "shader/SemanticInfo.hpp"
+#include "shader/SpvConverter.hpp"
+#include "shader/gcn.hpp"
+#include <cstdint>
+#include <unordered_map>
+
+namespace amdgpu {
+
+struct VmMapSlot {
+  int memoryType;
+  int prot;
+  std::int64_t offset;
+  std::uint64_t baseAddress;
+
+  auto operator<=>(const VmMapSlot &) const = default;
+};
+
+struct ProcessInfo {
+  int vmId = -1;
+  int vmFd = -1;
+  amdgpu::bridge::CmdBufferAttribute bufferAttributes[10];
+  amdgpu::bridge::CmdBuffer buffers[10];
+  rx::MemoryTableWithPayload<VmMapSlot> vmTable;
+};
+
+struct RemoteMemory {
+  int vmId;
+
+  template <typename T = void> T *getPointer(std::uint64_t address) const {
+    return address ? reinterpret_cast<T *>(
+                         static_cast<std::uint64_t>(vmId) << 40 | address)
+                   : nullptr;
+  }
+};
+
+struct Device {
+  static constexpr auto kComputePipeCount = 8;
+  static constexpr auto kGfxPipeCount = 2;
+
+  shader::SemanticInfo gcnSemantic;
+  shader::spv::Context shaderSemanticContext;
+  shader::gcn::SemanticModuleInfo gcnSemanticModuleInfo;
+  amdgpu::bridge::BridgeHeader *bridge;
+
+  Registers::Config config;
+
+  GpuTiler tiler;
+
+  GraphicsPipe graphicsPipes[kGfxPipeCount]{0, 1};
+  // ComputePipe computePipes[kComputePipeCount]{0, 1, 2, 3, 4, 5, 6, 7};
+
+  int dmemFd[3] = {-1, -1, -1};
+  std::unordered_map<std::int64_t, ProcessInfo> processInfo;
+
+  Cache caches[6]{
+      {this, 0}, {this, 1}, {this, 2}, {this, 3}, {this, 4}, {this, 5},
+  };
+
+  Device();
+  ~Device();
+
+  Cache::Tag getCacheTag(int vmId, Scheduler &scheduler) {
+    return caches[vmId].createTag(scheduler);
+  }
+
+  void mapProcess(std::int64_t pid, int vmId, const char *shmName);
+  void unmapProcess(std::int64_t pid);
+  void protectMemory(int pid, std::uint64_t address, std::uint64_t size,
+                     int prot);
+  void onCommandBuffer(std::int64_t pid, int cmdHeader, std::uint64_t address,
+                       std::uint64_t size);
+  bool processPipes();
+  bool flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
+            VkCommandBuffer commandBuffer, VkImage swapchainImage,
+            VkImageView swapchainImageView, VkFence fence);
+  void mapMemory(std::int64_t pid, std::uint64_t address, std::uint64_t size,
+                 int memoryType, int dmemIndex, int prot, std::int64_t offset);
+  void registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer);
+  void registerBufferAttribute(std::int64_t pid,
+                               bridge::CmdBufferAttribute attr);
+  void handleProtectChange(int vmId, std::uint64_t address, std::uint64_t size,
+                           int prot);
+};
+} // namespace amdgpu
diff --git a/rpcsx-gpu2/Pipe.cpp b/rpcsx-gpu2/Pipe.cpp
new file mode 100644
index 00000000..3fe0e6c2
--- /dev/null
+++ b/rpcsx-gpu2/Pipe.cpp
@@ -0,0 +1,987 @@
+#include "Pipe.hpp"
+#include "Device.hpp"
+#include "Registers.hpp"
+#include "Renderer.hpp"
+#include "gnm/mmio.hpp"
+#include "gnm/pm4.hpp"
+#include "vk.hpp"
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+
+using namespace amdgpu;
+
+static Scheduler createGfxScheduler(int index) {
+  auto queue = vk::context->presentQueue;
+  auto family = vk::context->presentQueueFamily;
+
+  if (index != 0) {
+    for (auto [otherQueue, otherFamily] : vk::context->graphicsQueues) {
+      if (family != otherFamily) {
+        queue = otherQueue;
+        family = otherFamily;
+      }
+    }
+  }
+
+  return Scheduler{queue, family};
+}
+
+static Scheduler createComputeScheduler(int index) {
+  auto &compQueues = vk::context->computeQueues;
+  auto [queue, family] = compQueues[index % compQueues.size()];
+
+  return Scheduler{queue, family};
+}
+
+static bool compare(int cmpFn, std::uint32_t poll, std::uint32_t mask,
+                    std::uint32_t ref) {
+  poll &= mask;
+  ref &= mask;
+
+  switch (cmpFn) {
+  case 0:
+    return true;
+  case 1:
+    return poll < ref;
+  case 2:
+    return poll <= ref;
+  case 3:
+    return poll == ref;
+  case 4:
+    return poll != ref;
+  case 5:
+    return poll >= ref;
+  case 6:
+    return poll > ref;
+  }
+
+  return false;
+}
+
+ComputePipe::ComputePipe(int index) : scheduler(createComputeScheduler(index)) {
+  for (auto &handler : commandHandlers) {
+    handler = &ComputePipe::unknownPacket;
+  }
+
+  commandHandlers[gnm::IT_NOP] = &ComputePipe::handleNop;
+}
+
+bool ComputePipe::processAllRings() {
+  bool allProcessed = true;
+
+  for (auto &ring : queues) {
+    processRing(ring);
+
+    if (ring.rptr != ring.wptr) {
+      allProcessed = false;
+      break;
+    }
+  }
+
+  return
allProcessed; +} + +void ComputePipe::processRing(Queue &queue) { + while (queue.rptr != queue.wptr) { + if (queue.rptr >= queue.base + queue.size) { + queue.rptr = queue.base; + } + + auto header = *queue.rptr; + auto type = rx::getBits(header, 31, 30); + + if (type == 3) { + auto op = rx::getBits(header, 15, 8); + auto len = rx::getBits(header, 29, 16) + 2; + + // std::fprintf(stderr, "queue %d: %s\n", queue.indirectLevel, + // gnm::pm4OpcodeToString(op)); + + if (op == gnm::IT_COND_EXEC) { + rx::die("unimplemented COND_EXEC"); + } + + auto handler = commandHandlers[op]; + if (!(this->*handler)(queue)) { + return; + } + + queue.rptr += len; + continue; + } + + if (type == 2) { + ++queue.rptr; + continue; + } + + rx::die("unexpected pm4 packet type %u", type); + } +} + +bool ComputePipe::unknownPacket(Queue &queue) { + auto op = rx::getBits(queue.rptr[0], 15, 8); + + rx::die("unimplemented compute pm4 packet: %s, queue %u\n", + gnm::pm4OpcodeToString(op), queue.indirectLevel); + + return true; +} + +bool ComputePipe::handleNop(Queue &queue) { return true; } + +GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) { + for (auto &processorHandlers : commandHandlers) { + for (auto &handler : processorHandlers) { + handler = &GraphicsPipe::unknownPacket; + } + + processorHandlers[gnm::IT_NOP] = &GraphicsPipe::handleNop; + } + + auto &dataHandlers = commandHandlers[2]; + auto &deHandlers = commandHandlers[1]; + auto &ceHandlers = commandHandlers[0]; + + deHandlers[gnm::IT_SET_BASE] = &GraphicsPipe::setBase; + deHandlers[gnm::IT_CLEAR_STATE] = &GraphicsPipe::clearState; + + deHandlers[gnm::IT_INDEX_BUFFER_SIZE] = &GraphicsPipe::indexBufferSize; + deHandlers[gnm::IT_DISPATCH_DIRECT] = &GraphicsPipe::dispatchDirect; + deHandlers[gnm::IT_DISPATCH_INDIRECT] = &GraphicsPipe::dispatchIndirect; + + // IT_ATOMIC_GDS + // IT_OCCLUSION_QUERY + deHandlers[gnm::IT_SET_PREDICATION] = &GraphicsPipe::setPredication; + + // IT_REG_RMW + + // IT_COND_EXEC + // IT_PRED_EXEC + + deHandlers[gnm::IT_DRAW_INDIRECT] = &GraphicsPipe::drawIndirect; + deHandlers[gnm::IT_DRAW_INDEX_INDIRECT] = &GraphicsPipe::drawIndexIndirect; + deHandlers[gnm::IT_INDEX_BASE] = &GraphicsPipe::indexBase; + deHandlers[gnm::IT_DRAW_INDEX_2] = &GraphicsPipe::drawIndex2; + + deHandlers[gnm::IT_CONTEXT_CONTROL] = &GraphicsPipe::contextControl; + + deHandlers[gnm::IT_INDEX_TYPE] = &GraphicsPipe::indexType; + // IT_DRAW_INDIRECT_MULTI + deHandlers[gnm::IT_DRAW_INDEX_AUTO] = &GraphicsPipe::drawIndexAuto; + deHandlers[gnm::IT_NUM_INSTANCES] = &GraphicsPipe::numInstances; + deHandlers[gnm::IT_DRAW_INDEX_MULTI_AUTO] = &GraphicsPipe::drawIndexMultiAuto; + + // IT_INDIRECT_BUFFER_CNST + // IT_STRMOUT_BUFFER_UPDATE + + deHandlers[gnm::IT_DRAW_INDEX_OFFSET_2] = &GraphicsPipe::drawIndexOffset2; + deHandlers[gnm::IT_DRAW_PREAMBLE] = &GraphicsPipe::drawPreamble; + + deHandlers[gnm::IT_WRITE_DATA] = &GraphicsPipe::writeData; + deHandlers[gnm::IT_MEM_SEMAPHORE] = &GraphicsPipe::memSemaphore; + // IT_COPY_DW + deHandlers[gnm::IT_WAIT_REG_MEM] = &GraphicsPipe::waitRegMem; + deHandlers[gnm::IT_INDIRECT_BUFFER] = &GraphicsPipe::indirectBuffer; + // IT_COPY_DATA + deHandlers[gnm::IT_PFP_SYNC_ME] = &GraphicsPipe::pfpSyncMe; + // IT_SURFACE_SYNC + deHandlers[gnm::IT_COND_WRITE] = &GraphicsPipe::condWrite; + deHandlers[gnm::IT_EVENT_WRITE] = &GraphicsPipe::eventWrite; + deHandlers[gnm::IT_EVENT_WRITE_EOP] = &GraphicsPipe::eventWriteEop; + deHandlers[gnm::IT_EVENT_WRITE_EOS] = &GraphicsPipe::eventWriteEos; + deHandlers[gnm::IT_RELEASE_MEM] = 
&GraphicsPipe::releaseMem; + // IT_PREAMBLE_CNTL + deHandlers[gnm::IT_DMA_DATA] = &GraphicsPipe::dmaData; + deHandlers[gnm::IT_ACQUIRE_MEM] = &GraphicsPipe::acquireMem; + // IT_REWIND + + // IT_LOAD_UCONFIG_REG + // IT_LOAD_SH_REG + // IT_LOAD_CONFIG_REG + // IT_LOAD_CONTEXT_REG + deHandlers[gnm::IT_SET_CONFIG_REG] = &GraphicsPipe::setConfigReg; + deHandlers[gnm::IT_SET_CONTEXT_REG] = &GraphicsPipe::setContextReg; + // IT_SET_CONTEXT_REG_INDIRECT + deHandlers[gnm::IT_SET_SH_REG] = &GraphicsPipe::setShReg; + // IT_SET_SH_REG_OFFSET + // IT_SET_QUEUE_REG + deHandlers[gnm::IT_SET_UCONFIG_REG] = &GraphicsPipe::setUConfigReg; + // IT_SCRATCH_RAM_WRITE + // IT_SCRATCH_RAM_READ + deHandlers[gnm::IT_INCREMENT_DE_COUNTER] = &GraphicsPipe::incrementDeCounter; + deHandlers[gnm::IT_WAIT_ON_CE_COUNTER] = &GraphicsPipe::waitOnCeCounter; + deHandlers[gnm::IT_SET_CE_DE_COUNTERS] = &GraphicsPipe::setCeDeCounters; + // IT_WAIT_ON_AVAIL_BUFFER + // IT_SWITCH_BUFFER + // IT_SET_RESOURCES + // IT_MAP_PROCESS + // IT_MAP_QUEUES + // IT_UNMAP_QUEUES + // IT_QUERY_STATUS + // IT_RUN_LIST + // IT_DISPATCH_DRAW_PREAMBLE + // IT_DISPATCH_DRAW + + ceHandlers[gnm::IT_WAIT_ON_DE_COUNTER_DIFF] = + &GraphicsPipe::waitOnDeCounterDiff; + ceHandlers[gnm::IT_INCREMENT_CE_COUNTER] = &GraphicsPipe::incrementCeCounter; + ceHandlers[gnm::IT_LOAD_CONST_RAM] = &GraphicsPipe::loadConstRam; + ceHandlers[gnm::IT_WRITE_CONST_RAM] = &GraphicsPipe::writeConstRam; + ceHandlers[gnm::IT_DUMP_CONST_RAM] = &GraphicsPipe::dumpConstRam; +} + +void GraphicsPipe::setCeQueue(Queue queue) { + queue.indirectLevel = -1; + ceQueue = queue; +} + +void GraphicsPipe::setDeQueue(Queue queue, int ring) { + rx::dieIf(ring > 2, "out of indirect gfx rings, %u", ring); + queue.indirectLevel = 2 - ring; + deQueues[ring] = queue; +} + +std::uint32_t *GraphicsPipe::getMmRegister(std::uint32_t dwAddress) { + // if (dwAddress >= Registers::Config::kMmioOffset && + // dwAddress < Registers::Config::kMmioOffset + + // sizeof(Registers::Config) / sizeof(std::uint32_t)) { + // return reinterpret_cast(&config) + (dwAddress - + // Registers::Config::kMmioOffset); + // } + + if (dwAddress >= Registers::ShaderConfig::kMmioOffset && + dwAddress < Registers::ShaderConfig::kMmioOffset + + sizeof(Registers::ShaderConfig) / sizeof(std::uint32_t)) { + return reinterpret_cast(&sh) + + (dwAddress - Registers::ShaderConfig::kMmioOffset); + } + + if (dwAddress >= Registers::UConfig::kMmioOffset && + dwAddress < Registers::UConfig::kMmioOffset + + sizeof(Registers::UConfig) / sizeof(std::uint32_t)) { + return reinterpret_cast(&uConfig) + + (dwAddress - Registers::UConfig::kMmioOffset); + } + + if (dwAddress >= Registers::Context::kMmioOffset && + dwAddress < Registers::Context::kMmioOffset + + sizeof(Registers::Context) / sizeof(std::uint32_t)) { + return reinterpret_cast(&context) + + (dwAddress - Registers::Context::kMmioOffset); + } + + rx::die("unexpected memory mapped register address %x, %s", dwAddress, + gnm::mmio::registerName(dwAddress)); +} + +bool GraphicsPipe::processAllRings() { + bool allProcessed = true; + + if (ceQueue.rptr != ceQueue.wptr) { + processRing(ceQueue); + + if (ceQueue.rptr != ceQueue.wptr) { + allProcessed = false; + } + } + + for (int i = 0; i < 3; ++i) { + auto &queue = deQueues[i]; + processRing(queue); + + if (queue.rptr != queue.wptr) { + allProcessed = false; + break; + } + } + + return allProcessed; +} + +void GraphicsPipe::processRing(Queue &queue) { + auto cp = 1; + if (queue.indirectLevel < 0) { + cp = 0; + } else if (queue.indirectLevel == 2) 
{ + cp = 2; + } + + while (queue.rptr != queue.wptr) { + if (queue.rptr >= queue.base + queue.size) { + queue.rptr = queue.base; + } + + auto header = *queue.rptr; + auto type = rx::getBits(header, 31, 30); + + if (type == 3) { + auto op = rx::getBits(header, 15, 8); + auto len = rx::getBits(header, 29, 16) + 2; + + // std::fprintf(stderr, "queue %d: %s\n", queue.indirectLevel, + // gnm::pm4OpcodeToString(op)); + + if (op == gnm::IT_COND_EXEC) { + rx::die("unimplemented COND_EXEC"); + } + + auto handler = commandHandlers[cp][op]; + if (!(this->*handler)(queue)) { + return; + } + + queue.rptr += len; + + if (op == gnm::IT_INDIRECT_BUFFER || op == gnm::IT_INDIRECT_BUFFER_CNST) { + break; + } + + continue; + } + + if (type == 2) { + ++queue.rptr; + continue; + } + + rx::die("unexpected pm4 packet type %u", type); + } +} + +bool GraphicsPipe::handleNop(Queue &queue) { return true; } + +bool GraphicsPipe::setBase(Queue &queue) { + auto baseIndex = queue.rptr[1] & 0xf; + + switch (baseIndex) { + case 0: { + auto address0 = queue.rptr[2] & ~3; + auto address1 = queue.rptr[3] & ((1 << 16) - 1); + + displayListPatchBase = + address0 | (static_cast(address1) << 32); + break; + } + case 1: { + auto address0 = queue.rptr[2] & ~3; + auto address1 = queue.rptr[3] & ((1 << 16) - 1); + + drawIndexIndirPatchBase = + address0 | (static_cast(address1) << 32); + break; + } + + case 2: { + auto cs1Index = queue.rptr[2] & ((1 << 16) - 1); + auto cs2Index = queue.rptr[3] & ((1 << 16) - 1); + gdsPartitionBases[0] = cs1Index; + gdsPartitionBases[1] = cs2Index; + break; + } + + case 3: { + auto cs1Index = queue.rptr[2] & ((1 << 16) - 1); + auto cs2Index = queue.rptr[3] & ((1 << 16) - 1); + cePartitionBases[0] = cs1Index; + cePartitionBases[1] = cs2Index; + break; + } + + default: + rx::die("pm4: unknown SET_BASE index %u", baseIndex); + } + + return true; +} + +bool GraphicsPipe::clearState(Queue &queue) { + context = Registers::Context::Default; + return true; +} + +bool GraphicsPipe::contextControl(Queue &queue) { return true; } +bool GraphicsPipe::acquireMem(Queue &queue) { return true; } +bool GraphicsPipe::releaseMem(Queue &queue) { + auto eventCntl = queue.rptr[1]; + auto dataCntl = queue.rptr[2]; + auto addressLo = queue.rptr[3] & ~3; + auto addressHi = queue.rptr[3] & ~3; + auto dataLo = queue.rptr[4]; + auto dataHi = queue.rptr[5]; + + auto eventIndex = rx::getBits(eventCntl, 11, 8); + auto eventType = rx::getBits(eventCntl, 5, 0); + auto dataSel = rx::getBits(dataCntl, 31, 29); + auto intSel = rx::getBits(dataCntl, 25, 24); + + auto address = addressLo | (static_cast(addressHi) << 32); + auto pointer = RemoteMemory{queue.vmId}.getPointer(address); + + context.vgtEventInitiator = eventType; + + switch (dataSel) { + case 0: // none + break; + case 1: // 32 bit, low + *reinterpret_cast(pointer) = dataLo; + break; + case 2: // 64 bit + *pointer = dataLo | (static_cast(dataHi) << 32); + break; + case 3: // 64 bit, global GPU clock + *pointer = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + break; + case 4: // 64 bit, perf counter + *pointer = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + break; + + default: + rx::die("unimplemented event release mem data %#x", dataSel); + } + + return true; +} + +bool GraphicsPipe::drawPreamble(Queue &queue) { return true; } + +bool GraphicsPipe::indexBufferSize(Queue &queue) { + vgtIndexBufferSize = queue.rptr[1]; + return true; +} +bool GraphicsPipe::dispatchDirect(Queue 
&queue) { + auto dimX = queue.rptr[1]; + auto dimY = queue.rptr[2]; + auto dimZ = queue.rptr[3]; + auto dispatchInitiator = queue.rptr[4]; + sh.compute.computeDispatchInitiator = dispatchInitiator; + + // FIXME + return true; +} +bool GraphicsPipe::dispatchIndirect(Queue &queue) { + auto offset = queue.rptr[1]; + auto dispatchInitiator = queue.rptr[2]; + + sh.compute.computeDispatchInitiator = dispatchInitiator; + auto buffer = RemoteMemory{queue.vmId}.getPointer( + drawIndexIndirPatchBase + offset); + + auto dimX = buffer[0]; + auto dimY = buffer[1]; + auto dimZ = buffer[2]; + + // FIXME + return true; +} + +bool GraphicsPipe::setPredication(Queue &queue) { + auto startAddressLo = queue.rptr[1] & ~0xf; + auto predProperties = queue.rptr[2]; + + auto startAddressHi = rx::getBits(predProperties, 15, 0); + auto predBool = rx::getBit(predProperties, 8); + auto hint = rx::getBit(predProperties, 12); + auto predOp = rx::getBits(predProperties, 18, 16); + auto cont = rx::getBit(predProperties, 31); + + switch (predOp) { + case 0: // clear predicate + case 1: // set ZPass predicate + case 2: // set PrimCount predicate + break; + } + + // TODO + + return true; +} +bool GraphicsPipe::drawIndirect(Queue &queue) { + auto dataOffset = queue.rptr[1]; + auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1); + auto startInstLoc = queue.rptr[3] & ((1 << 16) - 1); + auto drawInitiator = queue.rptr[4]; + + context.vgtDrawInitiator = drawInitiator; + + auto buffer = RemoteMemory{queue.vmId}.getPointer( + drawIndexIndirPatchBase + dataOffset); + + std::uint32_t vertexCountPerInstance = buffer[0]; + std::uint32_t instanceCount = buffer[1]; + std::uint32_t startVertexLocation = buffer[2]; + std::uint32_t startInstanceLocation = buffer[3]; + + // FIXME + rx::die("drawIndirect"); + return true; +} +bool GraphicsPipe::drawIndexIndirect(Queue &queue) { + auto dataOffset = queue.rptr[1]; + auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1); + auto drawInitiator = queue.rptr[3]; + + auto buffer = RemoteMemory{queue.vmId}.getPointer( + drawIndexIndirPatchBase + dataOffset); + + context.vgtDrawInitiator = drawInitiator; + + std::uint32_t indexCountPerInstance = buffer[0]; + std::uint32_t instanceCount = buffer[1]; + std::uint32_t startIndexLocation = buffer[2]; + std::uint32_t baseVertexLocation = buffer[3]; + std::uint32_t startInstanceLocation = buffer[4]; + + // FIXME + rx::die("drawIndexIndirect"); + return true; +} +bool GraphicsPipe::indexBase(Queue &queue) { + auto addressLo = queue.rptr[1] << 1; + auto addressHi = queue.rptr[2] & ((1 << 16) - 1); + auto address = addressLo | (static_cast(addressHi) << 32); + vgtIndexBase = address; + return true; +} +bool GraphicsPipe::drawIndex2(Queue &queue) { + auto maxSize = queue.rptr[1]; + auto indexOffset = queue.rptr[2]; + auto indexCount = queue.rptr[3]; + auto drawInitiator = queue.rptr[4]; + + context.vgtDrawInitiator = drawInitiator; + uConfig.vgtNumIndices = indexCount; + + draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, + vgtIndexBase + indexOffset, maxSize); + return true; +} +bool GraphicsPipe::indexType(Queue &queue) { + uConfig.vgtIndexType = static_cast(queue.rptr[1] & 1); + return true; +} +bool GraphicsPipe::drawIndexAuto(Queue &queue) { + auto indexCount = queue.rptr[1]; + auto drawInitiator = queue.rptr[2]; + + uConfig.vgtNumIndices = indexCount; + context.vgtDrawInitiator = drawInitiator; + + draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, 0, 0); + return true; +} +bool GraphicsPipe::numInstances(Queue &queue) { + 
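+  // A NUM_INSTANCES packet with a count of zero still means one instance,
+  // hence the clamp below.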
uConfig.vgtNumInstances = std::max(queue.rptr[1], 1u); + return true; +} +bool GraphicsPipe::drawIndexMultiAuto(Queue &queue) { + auto primCount = queue.rptr[1]; + auto drawInitiator = queue.rptr[2]; + auto control = queue.rptr[3]; + + auto indexOffset = rx::getBits(control, 15, 0); + auto primType = rx::getBits(control, 20, 16); + auto indexCount = rx::getBits(control, 31, 21); + + context.vgtDrawInitiator = drawInitiator; + uConfig.vgtPrimitiveType = static_cast(primType); + uConfig.vgtNumIndices = indexCount; + + // FIXME + return true; +} +bool GraphicsPipe::drawIndexOffset2(Queue &queue) { + auto maxSize = queue.rptr[1]; + auto indexOffset = queue.rptr[2]; + auto indexCount = queue.rptr[3]; + auto drawInitiator = queue.rptr[4]; + + context.vgtDrawInitiator = drawInitiator; + // FIXME + return true; +} +bool GraphicsPipe::writeData(Queue &queue) { + auto len = rx::getBits(queue.rptr[0], 29, 16) - 1; + auto control = queue.rptr[1]; + auto dstAddressLo = queue.rptr[2]; + auto dstAddressHi = queue.rptr[3]; + auto data = queue.rptr + 4; + + auto engineSel = rx::getBits(control, 31, 30); + auto wrConfirm = rx::getBit(control, 20); + auto wrOneAddress = rx::getBit(control, 16); + auto dstSel = rx::getBits(control, 11, 8); + + std::uint32_t *dstPointer = nullptr; + + switch (dstSel) { + case 0: // memory mapped register + dstPointer = getMmRegister(dstAddressLo & ((1 << 16) - 1)); + break; + + case 1: // memory sync + case 5: { // memory async + auto address = + (dstAddressLo & ~3) | (static_cast(dstAddressHi) << 32); + dstPointer = RemoteMemory{queue.vmId}.getPointer(address); + break; + } + + default: + rx::die("unimplemented write data, dst sel = %#x", dstSel); + } + + if (wrOneAddress) { + for (std::uint32_t i = 0; i < len; ++i) { + *dstPointer = data[i]; + } + } else { + std::memcpy(dstPointer, data, len * sizeof(std::uint32_t)); + } + + return true; +} +bool GraphicsPipe::memSemaphore(Queue &queue) { + // FIXME + return true; +} +bool GraphicsPipe::waitRegMem(Queue &queue) { + auto engine = rx::getBit(queue.rptr[1], 8); + auto memSpace = rx::getBit(queue.rptr[1], 4); + auto function = rx::getBits(queue.rptr[1], 2, 0); + auto pollAddressLo = queue.rptr[2]; + auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1); + auto reference = queue.rptr[4]; + auto mask = queue.rptr[5]; + auto pollInterval = queue.rptr[6]; + + std::uint32_t pollData; + + if (memSpace == 0) { + pollData = *getMmRegister(pollAddressLo & ((1 << 16) - 1)); + } else { + auto pollAddress = (pollAddressLo & ~3) | + (static_cast(pollAddressHi) << 32); + pollData = *RemoteMemory{queue.vmId}.getPointer(pollAddress); + } + + return compare(function, pollData, mask, reference); +} +bool GraphicsPipe::indirectBuffer(Queue &queue) { + rx::dieIf(queue.indirectLevel < 0, "unexpected indirect buffer from CP"); + + auto addressLo = queue.rptr[1] & ~3; + auto addressHi = queue.rptr[2] & ((1 << 16) - 1); + auto vmId = queue.rptr[3] >> 24; + auto ibSize = queue.rptr[4] & ((1 << 20) - 1); + auto address = addressLo | (static_cast(addressHi) << 32); + + auto rptr = RemoteMemory{queue.vmId}.getPointer(address); + setDeQueue(Queue::createFromRange(queue.vmId, rptr, ibSize), + queue.indirectLevel + 1); + return true; +} +bool GraphicsPipe::pfpSyncMe(Queue &queue) { + // TODO + return true; +} +bool GraphicsPipe::condWrite(Queue &queue) { + auto writeSpace = rx::getBit(queue.rptr[1], 8); + auto pollSpace = rx::getBit(queue.rptr[1], 4); + auto function = rx::getBits(queue.rptr[1], 2, 0); + auto pollAddressLo = queue.rptr[2]; + auto pollAddressHi 
= queue.rptr[3] & ((1 << 16) - 1); + auto reference = queue.rptr[4]; + auto mask = queue.rptr[5]; + auto writeAddressLo = queue.rptr[6]; + auto writeAddressHi = queue.rptr[7] & ((1 << 16) - 1); + auto writeData = queue.rptr[8]; + + std::uint32_t pollData; + + if (pollSpace == 0) { + pollData = *getMmRegister(pollAddressLo & ((1 << 16) - 1)); + } else { + auto pollAddress = (pollAddressLo & ~3) | + (static_cast(pollAddressHi) << 32); + pollData = *RemoteMemory{queue.vmId}.getPointer(pollAddress); + } + + if (compare(function, pollData, mask, reference)) { + if (writeSpace == 0) { + *getMmRegister(writeAddressLo & ((1 << 16) - 1)) = writeData; + } else { + auto writeAddress = (writeAddressLo & ~3) | + (static_cast(writeAddressHi) << 32); + + *RemoteMemory{queue.vmId}.getPointer(writeAddress) = + writeData; + } + } + + return true; +} + +bool GraphicsPipe::eventWrite(Queue &queue) { + enum { + kEventZPassDone = 1, + kEventSamplePipelineStat = 2, + kEventSampleStreamOutStat = 3, + kEventPartialFlush = 4, + }; + + auto eventCntl = queue.rptr[1]; + auto invL2 = rx::getBit(eventCntl, 20); + auto eventIndex = rx::getBits(eventCntl, 11, 8); + auto eventType = rx::getBits(eventCntl, 5, 0); + + context.vgtEventInitiator = eventType; + + if (eventIndex == kEventZPassDone || eventIndex == kEventSamplePipelineStat || + eventIndex == kEventSampleStreamOutStat) { + auto addressLo = queue.rptr[2] & ~7; + auto addressHi = queue.rptr[3] & ((1 << 16) - 1); + auto address = addressLo | (static_cast(addressHi) << 32); + rx::die("unimplemented event write, event index %#x, address %lx", + eventIndex, address); + return true; + } + + // FIXME + return true; +} + +bool GraphicsPipe::eventWriteEop(Queue &queue) { + auto eventCntl = queue.rptr[1]; + auto addressLo = queue.rptr[2] & ~3; + auto dataCntl = queue.rptr[3]; + auto dataLo = queue.rptr[4]; + auto dataHi = queue.rptr[5]; + + auto invL2 = rx::getBit(eventCntl, 20); + auto eventIndex = rx::getBits(eventCntl, 11, 8); + auto eventType = rx::getBits(eventCntl, 5, 0); + auto dataSel = rx::getBits(dataCntl, 31, 29); + auto intSel = rx::getBits(dataCntl, 25, 24); + auto addressHi = rx::getBits(dataCntl, 15, 0); + + auto address = addressLo | (static_cast(addressHi) << 32); + auto pointer = RemoteMemory{queue.vmId}.getPointer(address); + + context.vgtEventInitiator = eventType; + + switch (dataSel) { + case 0: // none + break; + case 1: // 32 bit, low + *reinterpret_cast(pointer) = dataLo; + break; + case 2: // 64 bit + *pointer = dataLo | (static_cast(dataHi) << 32); + break; + case 3: // 64 bit, global GPU clock + *pointer = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + break; + case 4: // 64 bit, perf counter + *pointer = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + break; + + default: + rx::die("unimplemented event write eop data %#x", dataSel); + } + + return true; +} + +bool GraphicsPipe::eventWriteEos(Queue &queue) { + auto eventCntl = queue.rptr[1]; + auto addressLo = queue.rptr[2] & ~3; + auto cmdInfo = queue.rptr[3]; + auto dataInfo = queue.rptr[4]; + + auto eventIndex = rx::getBits(eventCntl, 11, 8); + auto eventType = rx::getBits(eventCntl, 5, 0); + auto cmd = rx::getBits(cmdInfo, 31, 29); + auto addressHi = rx::getBits(cmdInfo, 15, 0); + + auto address = addressLo | (static_cast(addressHi) << 32); + auto pointer = RemoteMemory{queue.vmId}.getPointer(address); + + context.vgtEventInitiator = eventType; + + switch (cmd) { + case 1: { // store GDS data to 
memory + auto sizeDw = rx::getBits(dataInfo, 31, 16); + auto gdsIndexDw = rx::getBits(dataInfo, 15, 0); + rx::die("unimplemented event write eos gds data"); + break; + } + + case 2: // after GDS writes confirm, store 32 bit DATA to memory as fence + *pointer = dataInfo; + break; + + default: + rx::die("unexpected event write eos command: %#x", cmd); + } + return true; +} + +bool GraphicsPipe::dmaData(Queue &queue) { + // FIXME + return true; +} + +bool GraphicsPipe::setConfigReg(Queue &queue) { + rx::dieIf(queue.indirectLevel != 0, "setConfigReg from queue %d", + queue.indirectLevel); + + auto len = rx::getBits(queue.rptr[0], 29, 16); + auto offset = queue.rptr[1]; + auto data = queue.rptr + 2; + + rx::dieIf( + (offset + len) * sizeof(std::uint32_t) > sizeof(device->config), + "out of Config regs, offset: %u, count %u, %s\n", offset, len, + gnm::mmio::registerName(decltype(device->config)::kMmioOffset + offset)); + + std::memcpy(reinterpret_cast(&device->config) + offset, data, + sizeof(std::uint32_t) * len); + + return true; +} + +bool GraphicsPipe::setShReg(Queue &queue) { + auto len = rx::getBits(queue.rptr[0], 29, 16); + auto offset = queue.rptr[1]; + auto data = queue.rptr + 2; + + rx::dieIf((offset + len) * sizeof(std::uint32_t) > sizeof(sh), + "out of SH regs, offset: %u, count %u, %s\n", offset, len, + gnm::mmio::registerName(decltype(sh)::kMmioOffset + offset)); + + std::memcpy(reinterpret_cast(&sh) + offset, data, + sizeof(std::uint32_t) * len); + + return true; +} + +bool GraphicsPipe::setUConfigReg(Queue &queue) { + auto len = rx::getBits(queue.rptr[0], 29, 16); + auto offset = queue.rptr[1]; + auto data = queue.rptr + 2; + + rx::dieIf((offset + len) * sizeof(std::uint32_t) > sizeof(uConfig), + "out of UConfig regs, offset: %u, count %u, %s\n", offset, len, + gnm::mmio::registerName(decltype(uConfig)::kMmioOffset + offset)); + + std::memcpy(reinterpret_cast(&uConfig) + offset, data, + sizeof(std::uint32_t) * len); + + return true; +} + +bool GraphicsPipe::setContextReg(Queue &queue) { + auto len = rx::getBits(queue.rptr[0], 29, 16); + auto offset = queue.rptr[1]; + auto data = queue.rptr + 2; + + rx::dieIf((offset + len) * sizeof(std::uint32_t) > sizeof(context), + "out of Context regs, offset: %u, count %u, %s\n", offset, len, + gnm::mmio::registerName(decltype(context)::kMmioOffset + offset)); + + std::memcpy(reinterpret_cast(&context) + offset, data, + sizeof(std::uint32_t) * len); + + // for (std::size_t i = 0; i < len; ++i) { + // std::fprintf(stderr, + // "writing to %s value %x\n", + // gnm::mmio::registerName(decltype(context)::kMmioOffset + offset + i), + // data[i]); + // } + return true; +} + +bool GraphicsPipe::setCeDeCounters(Queue &queue) { + auto counterLo = queue.rptr[1]; + auto counterHi = queue.rptr[2]; + auto counter = counterLo | (static_cast(counterHi) << 32); + deCounter = counter; + ceCounter = counter; + return true; +} + +bool GraphicsPipe::waitOnCeCounter(Queue &queue) { + auto counterLo = queue.rptr[1]; + auto counterHi = queue.rptr[2]; + auto counter = counterLo | (static_cast(counterHi) << 32); + return deCounter >= counter; +} + +bool GraphicsPipe::waitOnDeCounterDiff(Queue &queue) { + auto waitDiff = queue.rptr[1]; + auto diff = ceCounter - deCounter; + return diff < waitDiff; +} + +bool GraphicsPipe::incrementCeCounter(Queue &queue) { + ceCounter++; + return true; +} + +bool GraphicsPipe::incrementDeCounter(Queue &queue) { + deCounter++; + return true; +} + +bool GraphicsPipe::loadConstRam(Queue &queue) { + std::uint32_t addressLo = 
queue.rptr[1]; + std::uint32_t addressHi = queue.rptr[2]; + std::uint32_t numDw = queue.rptr[3] & ((1 << 15) - 1); + std::uint32_t offset = + (queue.rptr[4] & ((1 << 16) - 1)) / sizeof(std::uint32_t); + auto address = addressLo | (static_cast(addressHi) << 32); + std::memcpy(constantMemory + offset, + RemoteMemory{queue.vmId}.getPointer(address), + numDw * sizeof(std::uint32_t)); + + return true; +} + +bool GraphicsPipe::writeConstRam(Queue &queue) { + std::uint32_t offset = + (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t); + std::uint32_t data = queue.rptr[2]; + std::memcpy(constantMemory + offset, &data, sizeof(std::uint32_t)); + return true; +} + +bool GraphicsPipe::dumpConstRam(Queue &queue) { + std::uint32_t offset = + (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t); + std::uint32_t numDw = queue.rptr[2] & ((1 << 15) - 1); + std::uint32_t addressLo = queue.rptr[3]; + std::uint32_t addressHi = queue.rptr[4]; + auto address = addressLo | (static_cast(addressHi) << 32); + std::memcpy(RemoteMemory{queue.vmId}.getPointer(address), + constantMemory + offset, numDw * sizeof(std::uint32_t)); + + return true; +} + +bool GraphicsPipe::unknownPacket(Queue &queue) { + auto op = rx::getBits(queue.rptr[0], 15, 8); + + rx::die("unimplemented gfx pm4 packet: %s, queue %u\n", + gnm::pm4OpcodeToString(op), queue.indirectLevel); +} diff --git a/rpcsx-gpu2/Pipe.hpp b/rpcsx-gpu2/Pipe.hpp new file mode 100644 index 00000000..4ac6a696 --- /dev/null +++ b/rpcsx-gpu2/Pipe.hpp @@ -0,0 +1,135 @@ +#pragma once +#include "Registers.hpp" +#include "Scheduler.hpp" + +#include +#include + +namespace amdgpu { +class Device; + +struct Queue { + int vmId = -1; + int indirectLevel = -1; + std::uint32_t *doorbell{}; + std::uint32_t *base{}; + std::uint64_t size{}; + std::uint32_t *rptr{}; + std::uint32_t *wptr{}; + + static Queue createFromRange(int vmId, std::uint32_t *base, + std::uint64_t size, int indirectLevel = 0, + std::uint32_t *doorbell = nullptr) { + Queue result; + result.vmId = vmId; + result.indirectLevel = indirectLevel; + result.doorbell = doorbell; + result.base = base; + result.size = size; + result.rptr = base; + result.wptr = base + size; + return result; + } +}; + +struct ComputePipe { + Device *device; + Scheduler scheduler; + + using CommandHandler = bool (ComputePipe::*)(Queue &); + CommandHandler commandHandlers[255]; + Queue queues[8]; + Registers::ComputeConfig computeConfig; + + ComputePipe(int index); + + bool processAllRings(); + void processRing(Queue &queue); + void mapQueue(int queueId, Queue queue); + + bool setShReg(Queue &queue); + bool unknownPacket(Queue &queue); + bool handleNop(Queue &queue); +}; + +struct GraphicsPipe { + Device *device; + Scheduler scheduler; + + std::uint64_t ceCounter = 0; + std::uint64_t deCounter = 0; + std::uint64_t displayListPatchBase = 0; + std::uint64_t drawIndexIndirPatchBase = 0; + std::uint64_t gdsPartitionBases[2]{}; + std::uint64_t cePartitionBases[2]{}; + std::uint64_t vgtIndexBase = 0; + std::uint32_t vgtIndexBufferSize = 0; + + std::uint32_t constantMemory[(48 * 1024) / sizeof(std::uint32_t)]{}; + + Registers::ShaderConfig sh; + Registers::Context context; + Registers::UConfig uConfig; + + Queue deQueues[3]; + Queue ceQueue; + + using CommandHandler = bool (GraphicsPipe::*)(Queue &); + CommandHandler commandHandlers[3][255]; + + GraphicsPipe(int index); + + void setCeQueue(Queue queue); + void setDeQueue(Queue queue, int ring); + + bool processAllRings(); + void processRing(Queue &queue); + + bool drawPreamble(Queue &queue); + 
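+  // Handlers are indexed as commandHandlers[processor][opcode]: processor 0
+  // serves the constant engine (indirectLevel == -1), 1 the DE ring and the
+  // first indirect level, and 2 the deepest indirect level. processRing
+  // dispatches type-3 packets roughly as
+  //   auto op = rx::getBits(header, 15, 8);
+  //   if (!(this->*commandHandlers[cp][op])(queue))
+  //     return; // rptr stays put, so the packet is retried on the next pass
+  // which is how wait-style packets (waitRegMem, waitOnCeCounter) block.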
bool indexBufferSize(Queue &queue); + bool handleNop(Queue &queue); + bool contextControl(Queue &queue); + bool acquireMem(Queue &queue); + bool releaseMem(Queue &queue); + bool dispatchDirect(Queue &queue); + bool dispatchIndirect(Queue &queue); + bool writeData(Queue &queue); + bool memSemaphore(Queue &queue); + bool waitRegMem(Queue &queue); + bool indirectBuffer(Queue &queue); + bool condWrite(Queue &queue); + bool eventWrite(Queue &queue); + bool eventWriteEop(Queue &queue); + bool eventWriteEos(Queue &queue); + bool dmaData(Queue &queue); + bool setBase(Queue &queue); + bool clearState(Queue &queue); + bool setPredication(Queue &queue); + bool drawIndirect(Queue &queue); + bool drawIndexIndirect(Queue &queue); + bool indexBase(Queue &queue); + bool drawIndex2(Queue &queue); + bool indexType(Queue &queue); + bool drawIndexAuto(Queue &queue); + bool numInstances(Queue &queue); + bool drawIndexMultiAuto(Queue &queue); + bool drawIndexOffset2(Queue &queue); + bool pfpSyncMe(Queue &queue); + bool setCeDeCounters(Queue &queue); + bool waitOnCeCounter(Queue &queue); + bool waitOnDeCounterDiff(Queue &queue); + bool incrementCeCounter(Queue &queue); + bool incrementDeCounter(Queue &queue); + bool loadConstRam(Queue &queue); + bool writeConstRam(Queue &queue); + bool dumpConstRam(Queue &queue); + bool setConfigReg(Queue &queue); + bool setShReg(Queue &queue); + bool setUConfigReg(Queue &queue); + bool setContextReg(Queue &queue); + + bool unknownPacket(Queue &queue); + + std::uint32_t *getMmRegister(std::uint32_t dwAddress); +}; +} // namespace amdgpu \ No newline at end of file diff --git a/rpcsx-gpu2/Registers.cpp b/rpcsx-gpu2/Registers.cpp new file mode 100644 index 00000000..6a834ff3 --- /dev/null +++ b/rpcsx-gpu2/Registers.cpp @@ -0,0 +1,52 @@ +#include "Registers.hpp" + +amdgpu::Registers::Context amdgpu::Registers::Context::Default = [] { + amdgpu::Registers::Context result{}; + result.paScScreenScissor.bottom = 0x4000; + result.paScScreenScissor.right = 0x4000; + + result.paScWindowScissor.top = 0x8000; + result.paScWindowScissor.bottom = 0x4000; + result.paScWindowScissor.right = 0x4000; + + for (auto &clipRect : result.paScClipRect) { + clipRect.bottom = 0x4000; + clipRect.right = 0x4000; + } + + result.unk_8c = 0xaa99aaaa; + result.paScGenericScissor.top = 0x8000; + result.paScGenericScissor.bottom = 0x4000; + result.paScGenericScissor.right = 0x4000; + + for (auto &vportScissor : result.paScVportScissor) { + vportScissor.top = 0x8000; + vportScissor.bottom = 0x4000; + vportScissor.right = 0x4000; + } + + for (auto &vportZ : result.paScVportZ) { + vportZ.min = 0.0f; + vportZ.max = 1.0f; + } + + result.unk_d4 = 0x2a00161a; + result.spiPsInControl = 2; + result.paClClipCntl = 0x0009'0000; + result.paSuScModeCntl.polyMode = gnm::PolyMode::Dual; + result.vgtGsPerEs = 256; + result.vgtEsPerGs = 128; + result.vgtGsPerVs = 2; + result.iaMultiVgtParam = 0xff; + result.unk_2f7 = 0x00001000; + result.paSuVtxCntl.pixCenterHalf = true; + result.paSuVtxCntl.roundMode = gnm::RoundMode::RoundToEven; + result.paClGbVertClipAdj = 1.0f; + result.paClGbVertDiscAdj = 1.0f; + result.paClGbHorzClipAdj = 1.0f; + result.paClGbHorzDiscAdj = 1.0f; + result.unk_316 = 0xe; + result.vgtOutDeallocCntl = 0x10; + return result; +}(); + diff --git a/rpcsx-gpu2/Registers.hpp b/rpcsx-gpu2/Registers.hpp new file mode 100644 index 00000000..65adffe7 --- /dev/null +++ b/rpcsx-gpu2/Registers.hpp @@ -0,0 +1,931 @@ +#pragma once + +#include "amdgpu/tiler.hpp" +#include "gnm/constants.hpp" +#include +#include +#include + 
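+// This header models the emulated GPU's MMIO register space as typed
+// overlays: each block below (Config, ShaderConfig, Context, UConfig,
+// Counters) is a union of Register<Offset, T> members, where Offset is the
+// register's dword offset inside the block and the padding base aligns the
+// payload to it. The same bytes are therefore reachable either through the
+// typed member or through a raw dword index, which is what the PM4 SET_*_REG
+// handlers rely on when they memcpy packet payloads at an offset.
+// Illustrative use (not part of the patch):
+//   Registers::Context ctx = Registers::Context::Default;
+//   reinterpret_cast<std::uint32_t *>(&ctx)[0x200] = 0x7; // raw dword write
+//   assert(ctx.dbDepthControl.depthEnable);               // typed read-back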
+namespace amdgpu { +enum class Engine { + ME, + PFP, + CE, +}; + +enum class EventIndex { + OTHER, + ZPASS_DONE, + SAMAPE_PIPELINE_STAT, + SAMPLE_STREAM_OUT_STATS, + CS_VS_PS_PARTIAL_FLUSH, + ANY_EOP_TIMESTAMP, + CS_PS_EOS, +}; + +enum class ProtectionFaultAccess : std::uint32_t { + Read = 0, + Write = 1, +}; + +namespace detail { +#pragma pack(push, 1) +template struct Padding { +private: + std::uint32_t _[Count]; +}; +} // namespace detail + +template +struct Register : detail::Padding, ImplT { + Register() = default; + Register(const Register &) = default; + Register &operator=(const Register &) = default; + + Register &operator=(const ImplT &newValue) { + *static_cast(this) = newValue; + return *this; + } +}; + +template + requires(std::is_integral_v || std::is_floating_point_v || + std::is_enum_v) +struct Register : detail::Padding { + ImplT value; + + Register() = default; + Register(const Register &) = default; + Register &operator=(const Register &) = default; + Register &operator=(ImplT newValue) { + value = newValue; + return *this; + } + + operator ImplT() { return value; } +}; + +struct CbColorAttrib { + union { + struct { + std::uint32_t tileModeIndex : 5; + std::uint32_t fmaskTileModeIndex : 4; + std::uint32_t : 3; + std::uint32_t numSamples : 3; + std::uint32_t numFragments : 2; + std::uint32_t forceDstAlpha1 : 1; + }; + + std::uint32_t raw; + }; +}; + +struct CbColorView { + union { + struct { + std::uint32_t sliceStart : 11; + std::uint32_t : 2; + std::uint32_t sliceMax : 11; + }; + std::uint32_t raw; + }; +}; + +struct CbColorControl { + union { + struct { + std::uint32_t : 3; + std::uint32_t degammaEnable : 1; + gnm::CbMode mode : 3; + std::uint32_t : 9; + std::uint32_t rop3 : 8; + }; + std::uint32_t raw; + }; +}; + +struct CbShaderMask { + union { + struct { + std::uint32_t output0Enable : 4; + std::uint32_t output1Enable : 4; + std::uint32_t output2Enable : 4; + std::uint32_t output3Enable : 4; + std::uint32_t output4Enable : 4; + std::uint32_t output5Enable : 4; + std::uint32_t output6Enable : 4; + std::uint32_t output7Enable : 4; + }; + std::uint32_t raw; + }; +}; + +struct CbTargetMask { + union { + struct { + std::uint32_t target0Enable : 4; + std::uint32_t target1Enable : 4; + std::uint32_t target2Enable : 4; + std::uint32_t target3Enable : 4; + std::uint32_t target4Enable : 4; + std::uint32_t target5Enable : 4; + std::uint32_t target6Enable : 4; + std::uint32_t target7Enable : 4; + }; + std::uint32_t raw; + }; +}; + +enum class CbCompSwap : std::uint32_t { + Std, + Alt, + StdRev, + AltRev, +}; + +struct CbColorInfo { + union { + struct { + std::uint32_t endian : 2; + gnm::DataFormat dfmt : 5; + std::uint32_t linearGeneral : 1; + gnm::NumericFormat nfmt : 3; + CbCompSwap compSwap : 2; + std::uint32_t fastClear : 1; + std::uint32_t compression : 1; + std::uint32_t blendClamp : 1; + std::uint32_t blendBypass : 1; + std::uint32_t simpleFloat : 1; + std::uint32_t roundMode : 1; + std::uint32_t cmaskIsLinear : 1; + std::uint32_t blendOptDontRdDst : 3; + std::uint32_t blendOptDiscardPixel : 3; + }; + + std::uint32_t raw; + }; +}; + +struct CbColor { + std::uint32_t base; + std::uint32_t pitch; + std::uint32_t slice; + CbColorView view; + CbColorInfo info; + CbColorAttrib attrib; + std::uint32_t dccBase; + std::uint32_t cmask; + std::uint32_t cmaskSlice : 14; + std::uint32_t fmask; + std::uint32_t fmaskSlice; + std::uint32_t clearWord0; + std::uint32_t clearWord1; + std::uint32_t clearWord2; + std::uint32_t clearWord3; +}; + +struct PaClVport { + float xScale; + float 
xOffset; + float yScale; + float yOffset; + float zScale; + float zOffset; +}; + +struct PaScVportZ { + float min; + float max; +}; + +struct PaScRect { + std::uint16_t left; + std::uint16_t top; + std::uint16_t right; + std::uint16_t bottom; +}; + +struct SpiShaderPgm { + std::uint32_t rsrc3; + std::uint64_t address; + + union { + struct { + std::uint32_t vgprs : 6; + std::uint32_t sgprs : 4; + std::uint32_t priority : 2; + std::uint32_t floatMode : 8; + std::uint32_t priv : 1; + std::uint32_t dx10Clamp : 1; + std::uint32_t debugMode : 1; + std::uint32_t ieeeMode : 1; + }; + + struct { + std::uint32_t : 24; + std::uint32_t cuGroupEnable : 1; + } es; + + struct { + std::uint32_t : 24; + std::uint32_t cuGroupEnable : 1; + } gs; + + struct { + std::uint32_t : 24; + std::uint32_t vgprCompCnt : 2; + } ls; + + struct { + std::uint32_t : 24; + std::uint32_t cuGroupDisable : 1; + } ps; + + struct { + std::uint32_t : 24; + std::uint32_t vgprCompCnt : 2; + std::uint32_t cuGroupEnable : 1; + } vs; + + std::uint8_t getVGprCount() const { return (vgprs + 1) * 4; } + std::uint8_t getSGprCount() const { return (sgprs + 1) * 8; } + + std::uint32_t raw; + } rsrc1; + + union { + struct { + std::uint32_t scratchEn : 1; + std::uint32_t userSgpr : 5; + std::uint32_t trapPresent : 1; + }; + + struct { + std::uint32_t : 7; + std::uint32_t ocLdsEn : 1; + std::uint32_t soBase0En : 1; + std::uint32_t soBase1En : 1; + std::uint32_t soBase2En : 1; + std::uint32_t soBase3En : 1; + std::uint32_t soEn : 1; + std::uint32_t excpEn : 7; + } vs; + + struct { + std::uint32_t : 7; + std::uint32_t ocLdsEn : 1; + std::uint32_t excpEn : 7; + } es; + + struct { + std::uint32_t : 7; + std::uint32_t excpEn : 7; + } gs; + + struct { + std::uint32_t : 7; + std::uint32_t ocLdsEn : 1; + std::uint32_t tgSizeEn : 1; + std::uint32_t excpEn : 7; + } hs; + + struct { + std::uint32_t : 7; + std::uint32_t ldsSize : 9; + std::uint32_t excpEn : 7; + } ls; + std::uint32_t raw; + } rsrc2; + + std::array userData; +}; + +struct VmProtectionFault { + std::uint32_t protection : 8; + std::uint32_t : 4; + std::uint32_t client : 8; + std::uint32_t : 4; + ProtectionFaultAccess rw : 1; + std::uint32_t vmid : 4; + std::uint32_t : 3; +}; + +enum class LsStage : std::uint32_t { + LsOff, + LsOn, + CsOn, +}; + +enum class EsStage : std::uint32_t { + EsOff, + EsDs, + EsReal, +}; + +enum class VsStage : std::uint32_t { + VsReal, + VsDs, + VsCopy, +}; + +struct VgtShaderStagesEn { + union { + struct { + LsStage lsEn : 2; + bool hsEn : 1; + EsStage esEn : 2; + bool gsEn : 1; + VsStage vsEn : 2; + bool dynamicHs : 1; + }; + std::uint32_t raw; + }; +}; + +struct FbInfo { + std::uint16_t base; // address >> 24 + std::uint16_t unk; +}; + +struct DbDepthControl { + union { + struct { + bool stencilEnable : 1; + bool depthEnable : 1; + bool depthWriteEnable : 1; + bool depthBoundsEnable : 1; + gnm::CompareFunc zFunc : 3; + bool backFaceEnable : 1; + gnm::CompareFunc stencilFunc : 3; + std::uint32_t : 9; + gnm::CompareFunc stencilFuncBackFace : 3; + std::uint32_t : 7; + bool enableColorWritesOnDepthFail : 1; + bool disableColorWritesOnDepthPass : 1; + }; + + std::uint32_t raw; + }; +}; + +struct DbZInfo { + union { + struct { + gnm::ZFormat format : 2; + std::uint32_t numSamples : 2; + std::uint32_t : 16; + std::uint32_t tileModeIndex : 3; + std::uint32_t : 4; + bool allowExpClear : 1; + std::uint32_t readSize : 1; // 0 - 256 bit, 1 - 512 bit + bool tileSurfaceEnable : 1; + std::uint32_t : 1; + bool zRangePrecision : 1; + }; + + std::uint32_t raw; + }; +}; + +struct 
DbRenderControl { + union { + struct { + bool depthClearEnable : 1; + bool stencilClearEnable : 1; + bool depthCopy : 1; + bool stencilCopy : 1; + bool resummarizeEnable : 1; + bool stencilCompressDisable : 1; + bool depthCompressDisable : 1; + bool copyCentroid : 1; + std::uint32_t copySample : 4; + }; + + std::uint32_t raw; + }; +}; + +struct CbBlendControl { + union { + struct { + gnm::BlendMultiplier colorSrcBlend : 5; + gnm::BlendFunc colorCombFcn : 3; + gnm::BlendMultiplier colorDstBlend : 5; + std::uint32_t : 3; + gnm::BlendMultiplier alphaSrcBlend : 5; + gnm::BlendFunc alphaCombFcn : 3; + gnm::BlendMultiplier alphaDstBlend : 5; + + bool separateAlphaBlend : 1; + bool enable : 1; + bool disableRop3 : 1; + }; + + std::uint32_t raw; + }; +}; + +struct PaSuScModeCntl { + union { + struct { + bool cullFront : 1; + bool cullBack : 1; + gnm::Face face : 1; + gnm::PolyMode polyMode : 2; + gnm::PolyModePtype polyModeFrontPtype : 3; + gnm::PolyModePtype polyModeBackPtype : 3; + bool polyOffsetFrontEnable : 1; + bool polyOffsetBackEnable : 1; + bool polyOffsetParaEnable : 1; + std::uint32_t : 2; + bool vtxWindowOffsetEnable : 1; + std::uint32_t : 2; + bool provokingVtxLast : 1; + bool perspCorrDis : 1; + bool multiPrimIbEna : 1; + }; + + std::uint32_t raw; + }; +}; + +struct PaSuVtxCntl { + union { + struct { + bool pixCenterHalf : 1; + gnm::RoundMode roundMode : 2; + gnm::QuantMode quantMode : 3; + }; + + std::uint32_t raw; + }; +}; + +struct SpiPsInput { + union { + struct { + bool perspSampleEna : 1; + bool perspCenterEna : 1; + bool perspCentroidEna : 1; + bool perspPullModelEna : 1; + bool linearSampleEna : 1; + bool linearCenterEna : 1; + bool linearCentroidEna : 1; + bool lineStippleTexEna : 1; + bool posXFloatEna : 1; + bool posYFloatEna : 1; + bool posZFloatEna : 1; + bool posWFloatEna : 1; + bool frontFaceEna : 1; + bool ancillaryEna : 1; + bool sampleCoverageEna : 1; + bool posFixedPtEna : 1; + }; + + std::uint32_t raw; + }; +}; + +enum class SpiPsDefaultVal : std::uint8_t { + X0_Y0_Z0_W0, + X0_Y0_Z0_W1, + X1_Y1_Z1_W0, + X1_Y1_Z1_W1, +}; + +struct SpiPsInputCntl { + union { + struct { + std::uint32_t offset : 4; + bool useDefaultVal : 1; + std::uint32_t : 3; + SpiPsDefaultVal defaultVal : 2; + bool flatShade : 1; + std::uint32_t : 2; + std::uint32_t cylWrap : 4; + bool ptSpriteTex : 1; + }; + + std::uint32_t raw; + }; +}; +struct Registers { + static constexpr auto kRegisterCount = 0xf000; + + struct Config { + static constexpr auto kMmioOffset = 0x2000; + + Register<0xad, std::array> cpPrtLodStatsCntls; + Register<0x1c0> cpRbRptr; + Register<0x1bf> cpRb1Rptr; + Register<0x1be> cpRb2Rptr; + Register<0x232> vgtEsGsRingSize; + Register<0x233> vgtGsVsRingSize; + Register<0x262> vgtTfRingSize; + Register<0x26e> vgtTfMemoryBase; + Register<0x3c0, std::array> sqBufRsrcWords; + Register<0x3c4, std::array> sqImgRsrcWords; + Register<0x3cc, std::array> sqImgSampWords; + Register<0x644, std::array> gbTileModes; + Register<0x664, std::array> gbMacroTileModes; + }; + + struct ComputeConfig { + static constexpr auto kMmioOffset = 0x2e00; + + std::uint32_t computeDispatchInitiator; + std::uint32_t _pad0[6]; + std::uint32_t computeNumThreadX; + std::uint32_t computeNumThreadY; + std::uint32_t computeNumThreadZ; + std::uint32_t _pad1[2]; + std::uint32_t computePgmLo; + std::uint32_t computePgmHi; + std::uint32_t _pad2[4]; + std::uint32_t computePgmRsrc1; + std::uint32_t computePgmRsrc2; + std::uint32_t _pad3[1]; + std::uint32_t computeResourceLimits; + std::uint32_t computeStaticThreadMgmtSe0; + 
std::uint32_t computeStaticThreadMgmtSe1; + std::uint32_t computeTmpRingSize; + std::uint32_t _pad4[39]; + std::array userData; + }; + + struct ShaderConfig { + static constexpr auto kMmioOffset = 0x2c00; + + union { + Register<0x7, SpiShaderPgm> spiShaderPgmPs; + Register<0x47, SpiShaderPgm> spiShaderPgmVs; + Register<0x87, SpiShaderPgm> spiShaderPgmGs; + Register<0xc7, SpiShaderPgm> spiShaderPgmEs; + Register<0x107, SpiShaderPgm> spiShaderPgmHs; + Register<0x147, SpiShaderPgm> spiShaderPgmLs; + Register<0x200, ComputeConfig> compute; + }; + }; + + struct Context { + static constexpr auto kMmioOffset = 0xa000; + static Context Default; + + union { + Register<0x0, DbRenderControl> dbRenderControl; + Register<0x1> dbCountControl; + Register<0x2> dbDepthView; + Register<0x3> dbRenderOverride; + Register<0x4> dbRenderOverride2; + Register<0x5> dbHTileDataBase; + Register<0x8, float> dbDepthBoundsMin; + Register<0x9, float> dbDepthBoundsMax; + Register<0xa> dbStencilClear; + Register<0xb, float> dbDepthClear; + Register<0xc, PaScRect> paScScreenScissor; + Register<0xf> dbDepthInfo; + Register<0x10, DbZInfo> dbZInfo; + Register<0x11> dbStencilInfo; + Register<0x12> dbZReadBase; + Register<0x13> dbStencilReadBase; + Register<0x14> dbZWriteBase; + Register<0x15> dbStencilWriteBase; + Register<0x16> dbDepthSize; + Register<0x17> dbDepthSlice; + Register<0x20> taBcBaseAddr; + Register<0x80> paScWindowOffset; + Register<0x81, PaScRect> paScWindowScissor; + Register<0x83> paScClipRectRule; + Register<0x84, std::array> paScClipRect; + Register<0x8c> unk_8c; + Register<0x8d> paSuHardwareScreenOffset; + Register<0x8e, CbTargetMask> cbTargetMask; + Register<0x8f, CbShaderMask> cbShaderMask; + Register<0x90, PaScRect> paScGenericScissor; + Register<0x94, std::array> paScVportScissor; + Register<0xb4, std::array> paScVportZ; + Register<0xd4> unk_d4; + Register<0xd8> cpPerfMonCntxCntl; + Register<0x100> vgtMaxVtxIndx; + Register<0x101> vgtMinVtxIndx; + Register<0x102> vgtIndxOffset; + Register<0x103> vgtMultiPrimIbResetIndx; + Register<0x105, float> cbBlendRed; + Register<0x106, float> cbBlendGreen; + Register<0x107, float> cbBlendBlue; + Register<0x108, float> cbBlendAlpha; + Register<0x10b> dbStencilControl; + Register<0x10c> dbStencilRefMask; + Register<0x10d> dbStencilRefMaskBf; + Register<0x10f, std::array> paClVports; + Register<0x16f> paClUcp0X; + Register<0x170> paClUcp0Y; + Register<0x171> paClUcp0Z; + Register<0x172> paClUcp0W; + Register<0x191, std::array> spiPsInputCntl; + Register<0x1b1> spiVsOutConfig; + Register<0x1b3, SpiPsInput> spiPsInputEna; + Register<0x1b4, SpiPsInput> spiPsInputAddr; + Register<0x1b6> spiPsInControl; + Register<0x1b8> spiBarycCntl; + Register<0x1ba> spiTmpRingSize; + Register<0x1c3> spiShaderPosFormat; + Register<0x1c4> spiShaderZFormat; + Register<0x1c5> spiShaderColFormat; + Register<0x1e0, std::array> cbBlendControl; + Register<0x1f9> vgtDmaBaseHi; + Register<0x1fa> vgtDmaBase; + Register<0x1fc> vgtDrawInitiator; + Register<0x1fd> vgtImmedData; + Register<0x200, DbDepthControl> dbDepthControl; + Register<0x201> dbEqaa; + Register<0x202, CbColorControl> cbColorControl; + Register<0x203> dbShaderControl; + Register<0x204> paClClipCntl; + Register<0x205, PaSuScModeCntl> paSuScModeCntl; + Register<0x206> paClVteCntl; + Register<0x207> paClVsOutCntl; + Register<0x280> paSuPointSize; + Register<0x281> paSuPointMinmax; + Register<0x282> paSuLineCntl; + Register<0x284> vgtOutputPathCntl; + Register<0x286> vgtHosMaxTessLevel; + Register<0x287> vgtHosMinTessLevel; + 
Register<0x290> vgtGsMode; + Register<0x291> vgtGsOnChipCntl; + Register<0x292> paScModeCntl0; + Register<0x293> paScModeCntl1; + Register<0x295> vgtGsPerEs; + Register<0x296> vgtEsPerGs; + Register<0x297> vgtGsPerVs; + Register<0x298, std::array> vgtGsVsRingOffsets; + Register<0x29b> vgtGsOutPrimType; + Register<0x29d> vgtDmaSize; + Register<0x29e> vgtDmaMaxSize; + Register<0x29f> vgtDmaIndexType; + Register<0x2a1> vgtPrimitiveIdEn; + Register<0x2a2> vgtDmaNumInstances; + Register<0x2a4> vgtEventInitiator; + Register<0x2a5> vgtMultiPrimIbResetEn; + Register<0x2a8> vgtInstanceStepRate0; + Register<0x2a9> vgtInstanceStepRate1; + Register<0x2aa> iaMultiVgtParam; + Register<0x2ab> vgtEsGsRingItemSize; + Register<0x2ac> vgtGsVsRingItemSize; + Register<0x2ad> vgtReuseOff; + Register<0x2ae> vgtVtxCntEn; + Register<0x2af> dbHTileSurface; + Register<0x2b0> dbSResultsCompareState0; + Register<0x2b1> dbSResultsCompareState1; + Register<0x2b4> vgtStrmOutBufferSize0; + Register<0x2b5> vgtStrmOutVtxStride0; + Register<0x2b8> vgtStrmOutBufferSize1; + Register<0x2b9> vgtStrmOutVtxStride1; + Register<0x2bc> vgtStrmOutBufferSize2; + Register<0x2bd> vgtStrmOutVtxStride2; + Register<0x2c0> vgtStrmOutBufferSize3; + Register<0x2c1> vgtStrmOutVtxStride3; + Register<0x2ca> vgtStrmOutDrawOpaqueOffset; + Register<0x2cb> vgtStrmOutDrawOpaqueBufferFilledSize; + Register<0x2cc> vgtStrmOutDrawOpaqueVertexStride; + Register<0x2ce> vgtGsMaxVertOut; + Register<0x2d5, VgtShaderStagesEn> vgtShaderStagesEn; + Register<0x2d6> vgtLsHsConfig; + Register<0x2d7, std::array> vgtGsVertItemSizes; + Register<0x2db> vgtTfParam; + Register<0x2dc> dbAlphaToMask; + Register<0x2dd> vgtDispatchDrawIndex; + Register<0x2de> paSuPolyOffsetDbFmtCntl; + Register<0x2df> paSuPolyOffsetClamp; + Register<0x2e0> paSuPolyOffsetFrontScale; + Register<0x2e1> paSuPolyOffsetFrontOffset; + Register<0x2e2> paSuPolyOffsetBackScale; + Register<0x2e3> paSuPolyOffsetBackOffset; + Register<0x2e4> vgtGsInstanceCnt; + Register<0x2e5> vgtStrmOutConfig; + Register<0x2e6> vgtStrmOutBufferConfig; + Register<0x2f5> paScCentroidPriority0; + Register<0x2f6> paScCentroidPriority1; + Register<0x2f7> unk_2f7; + Register<0x2f8> paScAaConfig; + Register<0x2f9, PaSuVtxCntl> paSuVtxCntl; + Register<0x2fa, float> paClGbVertClipAdj; + Register<0x2fb, float> paClGbVertDiscAdj; + Register<0x2fc, float> paClGbHorzClipAdj; + Register<0x2fd, float> paClGbHorzDiscAdj; + Register<0x2fe, std::array> paScAaSampleLocsPixelX0Y0; + Register<0x302, std::array> paScAaSampleLocsPixelX1Y0; + Register<0x306, std::array> paScAaSampleLocsPixelX0Y1; + Register<0x30a, std::array> paScAaSampleLocsPixelX1Y1; + Register<0x30e> paScAaMaskX0Y0_X1Y0; + Register<0x30f> paScAaMaskX0Y1_X1Y1; + Register<0x316> unk_316; + Register<0x317> vgtOutDeallocCntl; + Register<0x318, std::array> cbColor; + }; + }; + + struct UConfig { + static constexpr auto kMmioOffset = 0xc000; + + union { + Register<0x3f> cpStrmOutCntl; + Register<0x79> cpCoherBaseHi; + Register<0x7d> cpCoherSize; + Register<0x7e> cpCoherBase; + Register<0x8b> cpDmaReadTags; + Register<0x8c> cpCoherSizeHi; + Register<0x200> grbmGfxIndex; + Register<0x242, gnm::PrimitiveType> vgtPrimitiveType; + Register<0x243, gnm::IndexType> vgtIndexType; + Register<0x24c> vgtNumIndices; + Register<0x24d> vgtNumInstances; + Register<0x340, std::array> sqThreadTraceUserdata; + Register<0x41d> gdsOaCntl; + Register<0x41e> gdsOaCounter; + Register<0x41f> gdsOaAddress; + }; + }; + + struct Counters { + static constexpr auto kMmioOffset = 0xd000; + + union { + 
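+      // The 64-bit CP counters occupy two consecutive dwords and are typed
+      // std::uint64_t, while the SPI counters keep explicit Hi/Lo dword
+      // halves; the *Select registers from 0x800 onwards configure what each
+      // counter samples.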
Register<0x0, std::uint64_t> cpgPerfCounter1; + Register<0x2, std::uint64_t> cpgPerfCounter0; + Register<0x4, std::uint64_t> cpcPerfCounter1; + Register<0x6, std::uint64_t> cpcPerfCounter0; + Register<0x8, std::uint64_t> cpfPerfCounter1; + Register<0xa, std::uint64_t> cpfPerfCounter0; + Register<0x80, std::array> wdPerfCounters; + Register<0x88, std::array> iaPerfCounters; + Register<0x90, std::array> vgtPerfCounters; + Register<0x100, std::array> paSuPerfCounters; + Register<0x140, std::array> paScPerfCounters; + Register<0x180> spiPerfCounter0Hi; + Register<0x181> spiPerfCounter0Lo; + Register<0x182> spiPerfCounter1Hi; + Register<0x183> spiPerfCounter1Lo; + Register<0x184> spiPerfCounter2Hi; + Register<0x185> spiPerfCounter2Lo; + Register<0x186> spiPerfCounter3Hi; + Register<0x187> spiPerfCounter3Lo; + Register<0x188> spiPerfCounter4Hi; + Register<0x189> spiPerfCounter4Lo; + Register<0x18a> spiPerfCounter5Hi; + Register<0x18b> spiPerfCounter5Lo; + Register<0x1c0, std::array> sqPerfCounters; + Register<0x240, std::array> sxPerfCounters; + Register<0x280, std::array> gdsPerfCounters; + Register<0x2c0, std::array> taPerfCounters; + Register<0x300, std::array> tdPerfCounters; + Register<0x340, std::array> tcpPerfCounters; + Register<0x380, std::array> tccPerfCounters; + Register<0x390, std::array> tcaPerfCounters; + Register<0x3a0, std::array> tcsPerfCounters; + Register<0x406, std::array> cbPerfCounters; + Register<0x440, std::array> dbPerfCounters; + Register<0x800> cpgPerfCounter1Select; + Register<0x801> cpgPerfCounter0Select1; + Register<0x802> cpgPerfCounter0Select; + Register<0x803> cpcPerfCounter1Select; + Register<0x804> cpcPerfCounter0Select1; + Register<0x805> cpfPerfCounter1Select; + Register<0x806> cpfPerfCounter0Select1; + Register<0x807> cpfPerfCounter0Select; + Register<0x808> cpPerfMonCntl; + Register<0x809> cpcPerfCounter0Select; + Register<0x880> wdPerfCounter0Select; + Register<0x881> wdPerfCounter1Select; + Register<0x882> wdPerfCounter2Select; + Register<0x883> wdPerfCounter3Select; + Register<0x884> iaPerfCounter0Select; + Register<0x885> iaPerfCounter1Select; + Register<0x886> iaPerfCounter2Select; + Register<0x887> iaPerfCounter3Select; + Register<0x888> iaPerfCounter0Select1; + Register<0x88c> vgtPerfCounter0Select; + Register<0x88d> vgtPerfCounter1Select; + Register<0x88e> vgtPerfCounter2Select; + Register<0x88f> vgtPerfCounter3Select; + Register<0x890> vgtPerfCounter0Select1; + Register<0x891> vgtPerfCounter1Select1; + Register<0x900> paSuPerfCounter0Select; + Register<0x901> paSuPerfCounter0Select1; + Register<0x902> paSuPerfCounter1Select; + Register<0x903> paSuPerfCounter1Select1; + Register<0x904> paSuPerfCounter2Select; + Register<0x905> paSuPerfCounter3Select; + Register<0x940> paScPerfCounter0Select; + Register<0x941> paScPerfCounter0Select1; + Register<0x942> paScPerfCounter1Select; + Register<0x943> paScPerfCounter2Select; + Register<0x944> paScPerfCounter3Select; + Register<0x945> paScPerfCounter4Select; + Register<0x946> paScPerfCounter5Select; + Register<0x947> paScPerfCounter6Select; + Register<0x948> paScPerfCounter7Select; + Register<0x980> spiPerfCounter0Select; + Register<0x981> spiPerfCounter1Select; + Register<0x982> spiPerfCounter2Select; + Register<0x983> spiPerfCounter3Select; + Register<0x984> spiPerfCounter0Select1; + Register<0x985> spiPerfCounter1Select1; + Register<0x986> spiPerfCounter2Select1; + Register<0x987> spiPerfCounter3Select1; + Register<0x988> spiPerfCounter4Select; + Register<0x989> spiPerfCounter5Select; + Register<0x98a> 
spiPerfCounterBins; + Register<0x9c0, std::array> sqPerfCountersSelect; + Register<0x9e0> sqPerfCounterCtrl; + Register<0xa40> sxPerfCounter0Select; + Register<0xa41> sxPerfCounter1Select; + Register<0xa42> sxPerfCounter2Select; + Register<0xa43> sxPerfCounter3Select; + Register<0xa44> sxPerfCounter0Select1; + Register<0xa45> sxPerfCounter1Select1; + Register<0xa80> gdsPerfCounter0Select; + Register<0xa81> gdsPerfCounter1Select; + Register<0xa82> gdsPerfCounter2Select; + Register<0xa83> gdsPerfCounter3Select; + Register<0xa84> gdsPerfCounter0Select1; + Register<0xac0> taPerfCounter0Select; + Register<0xac1> taPerfCounter0Select1; + Register<0xac2> taPerfCounter1Select; + Register<0xb00> tdPerfCounter0Select; + Register<0xb01> tdPerfCounter0Select1; + Register<0xb02> tdPerfCounter1Select; + Register<0xb40> tcpPerfCounter0Select; + Register<0xb41> tcpPerfCounter0Select1; + Register<0xb42> tcpPerfCounter1Select; + Register<0xb43> tcpPerfCounter1Select1; + Register<0xb44> tcpPerfCounter2Select; + Register<0xb45> tcpPerfCounter3Select; + Register<0xb80> tccPerfCounter0Select; + Register<0xb81> tccPerfCounter0Select1; + Register<0xb82> tccPerfCounter1Select; + Register<0xb83> tccPerfCounter1Select1; + Register<0xb84> tccPerfCounter2Select; + Register<0xb85> tccPerfCounter3Select; + Register<0xb90> tcaPerfCounter0Select; + Register<0xb91> tcaPerfCounter0Select1; + Register<0xb92> tcaPerfCounter1Select; + Register<0xb93> tcaPerfCounter1Select1; + Register<0xb94> tcaPerfCounter2Select; + Register<0xb95> tcaPerfCounter3Select; + Register<0xba0> tcsPerfCounter0Select; + Register<0xba1> tcsPerfCounter0Select1; + Register<0xba2> tcsPerfCounter1Select; + Register<0xba3> tcsPerfCounter2Select; + Register<0xba4> tcsPerfCounter3Select; + Register<0xc00> cbPerfCounterFilter; + Register<0xc01> cbPerfCounter0Select; + Register<0xc02> cbPerfCounter0Select1; + Register<0xc03> cbPerfCounter1Select; + Register<0xc04> cbPerfCounter2Select; + Register<0xc05> cbPerfCounter3Select; + Register<0xc40> dbPerfCounter0Select; + Register<0xc41> dbPerfCounter0Select1; + Register<0xc42> dbPerfCounter1Select; + Register<0xc43> dbPerfCounter1Select1; + Register<0xc44> dbPerfCounter2Select; + Register<0xc46> dbPerfCounter3Select; + }; + }; + + union { + Register<0x50c, std::uint32_t> vmContext0ProtectionIntrCtl; + Register<0x50d, std::uint32_t> vmContext1ProtectionIntrCtl; + Register<0x536, VmProtectionFault> vmContext0ProtectionFault; + Register<0x537, VmProtectionFault> vmContext1ProtectionFault; + Register<0x53e, std::uint32_t> + vmContext0ProtectionFaultPage; // address >> 12 + Register<0x53f, std::uint32_t> + vmContext1ProtectionFaultPage; // address >> 12 + Register<0x809, FbInfo> fbInfo; + Register<0xf82, std::uint32_t> ihRptr; + Register<0xf83, std::uint32_t> ihWptr; + + Register config; + Register sh; + + Register<0x3045> cpRbWptr; + Register<0x3064> cpRb1Wptr; + Register<0x3069> cpRb2Wptr; + Register<0x3049> cpIntCntl; + Register<0x304a> cpIntStatus; + Register<0x306a, std::array> cpIntCntlRings; + Register<0x306d, std::array> cpIntStatusRings; + Register<0x324b> cpHqdQueuePriority; + Register<0x324c> cpHqdQuantum; + + Register context; + Register uconfig; + Register counters; + + std::uint32_t raw[kRegisterCount]; + }; +}; + +#pragma pack(pop) +} // namespace amdgpu \ No newline at end of file diff --git a/rpcsx-gpu2/Renderer.cpp b/rpcsx-gpu2/Renderer.cpp new file mode 100644 index 00000000..bc521208 --- /dev/null +++ b/rpcsx-gpu2/Renderer.cpp @@ -0,0 +1,1273 @@ +#include "Renderer.hpp" +#include "Device.hpp" 
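+// Renderer.cpp lowers parsed Gnm draw/flip state onto Vulkan. It relies on
+// VK_EXT_shader_object and dynamic rendering: each draw binds shader objects
+// and re-emits all dynamic state instead of baking pipeline objects, and the
+// recompiled GCN shaders, image views and samplers are fetched through the
+// per-draw Cache tag.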
+#include "gnm/descriptors.hpp" +#include "rx/MemoryTable.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace shader; + +namespace gnm { +VkRect2D toVkRect2D(amdgpu::PaScRect rect) { + return { + .offset = + { + .x = rect.left, + .y = rect.top, + }, + .extent = + { + .width = static_cast(rect.right - rect.left), + .height = static_cast(rect.bottom - rect.top), + }, + }; +} + +amdgpu::PaScRect intersection(amdgpu::PaScRect rect, amdgpu::PaScRect scissor) { + amdgpu::PaScRect result{ + .left = std::max(rect.left, scissor.left), + .top = std::max(rect.top, scissor.top), + .right = std::min(rect.right, scissor.right), + .bottom = std::min(rect.bottom, scissor.bottom), + }; + + result.top = std::min(result.top, result.bottom); + result.bottom = std::max(result.top, result.bottom); + result.left = std::min(result.left, result.right); + result.right = std::max(result.left, result.right); + return result; +} +} // namespace gnm + +struct MemoryTableSlot { + std::uint64_t address; + union { + struct { + std::uint64_t size : 40; + std::uint64_t flags : 4; + }; + std::uint64_t sizeAndFlags; + }; + std::uint64_t deviceAddress; +}; +struct MemoryTable { + std::uint32_t count; + std::uint32_t pad; + MemoryTableSlot slots[]; +}; + +static VkShaderEXT getPrimTypeRectGeomShader(amdgpu::Cache &cache) { + static VkShaderEXT shader = VK_NULL_HANDLE; + if (shader != VK_NULL_HANDLE) { + return shader; + } + + VkShaderCreateInfoEXT createInfo{ + .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT, + .stage = VK_SHADER_STAGE_GEOMETRY_BIT, + .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT, + .codeSize = sizeof(spirv_rect_list_geom), + .pCode = spirv_rect_list_geom, + .pName = "main", + .setLayoutCount = + static_cast(cache.getGraphicsDescriptorSetLayouts().size()), + .pSetLayouts = cache.getGraphicsDescriptorSetLayouts().data()}; + + VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &createInfo, + vk::context->allocator, &shader)); + return shader; +} + +static VkShaderEXT getFillRedFragShader(amdgpu::Cache &cache) { + static VkShaderEXT shader = VK_NULL_HANDLE; + if (shader != VK_NULL_HANDLE) { + return shader; + } + + VkShaderCreateInfoEXT createInfo{ + .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT, + .codeSize = sizeof(spirv_fill_red_frag), + .pCode = spirv_fill_red_frag, + .pName = "main", + .setLayoutCount = + static_cast(cache.getGraphicsDescriptorSetLayouts().size()), + .pSetLayouts = cache.getGraphicsDescriptorSetLayouts().data()}; + + VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &createInfo, + vk::context->allocator, &shader)); + return shader; +} + +static VkShaderEXT getFlipVertexShader(amdgpu::Cache &cache) { + static VkShaderEXT shader = VK_NULL_HANDLE; + if (shader != VK_NULL_HANDLE) { + return shader; + } + + VkShaderCreateInfoEXT createInfo{ + .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT, + .codeSize = sizeof(spirv_flip_vert), + .pCode = spirv_flip_vert, + .pName = "main", + .setLayoutCount = + static_cast(cache.getGraphicsDescriptorSetLayouts().size()), + .pSetLayouts = cache.getGraphicsDescriptorSetLayouts().data()}; + + VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &createInfo, + vk::context->allocator, &shader)); + return shader; +} + +static VkShaderEXT getFlipFragmentShader(amdgpu::Cache &cache) { + static 
VkShaderEXT shader = VK_NULL_HANDLE; + if (shader != VK_NULL_HANDLE) { + return shader; + } + + VkShaderCreateInfoEXT createInfo{ + .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT, + .codeSize = sizeof(spirv_flip_frag), + .pCode = spirv_flip_frag, + .pName = "main", + .setLayoutCount = + static_cast(cache.getGraphicsDescriptorSetLayouts().size()), + .pSetLayouts = cache.getGraphicsDescriptorSetLayouts().data()}; + + VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &createInfo, + vk::context->allocator, &shader)); + return shader; +} + +static VkPrimitiveTopology toVkPrimitiveType(gnm::PrimitiveType type) { + switch (type) { + case gnm::PrimitiveType::PointList: + return VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + case gnm::PrimitiveType::LineList: + return VK_PRIMITIVE_TOPOLOGY_LINE_LIST; + case gnm::PrimitiveType::LineStrip: + return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP; + case gnm::PrimitiveType::TriList: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + case gnm::PrimitiveType::TriFan: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN; + case gnm::PrimitiveType::TriStrip: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP; + case gnm::PrimitiveType::Patch: + return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; + case gnm::PrimitiveType::LineListAdjacency: + return VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY; + case gnm::PrimitiveType::LineStripAdjacency: + return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY; + case gnm::PrimitiveType::TriListAdjacency: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY; + case gnm::PrimitiveType::TriStripAdjacency: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY; + case gnm::PrimitiveType::LineLoop: + return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP; // FIXME + + case gnm::PrimitiveType::RectList: + case gnm::PrimitiveType::QuadList: + case gnm::PrimitiveType::QuadStrip: + case gnm::PrimitiveType::Polygon: + return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + + default: + rx::die("toVkPrimitiveType: unexpected primitive type %u", + static_cast(type)); + } +} + +struct ShaderResources : eval::Evaluator { + amdgpu::Cache::Tag *cacheTag; + shader::eval::Evaluator evaluator; + std::map slotResources; + std::span userSgprs; + + rx::MemoryTableWithPayload bufferMemoryTable; + std::vector> resourceSlotToAddress; + std::vector samplerResources; + std::vector imageResources[3]; + + using Evaluator::eval; + + ShaderResources() = default; + + void loadResources(shader::gcn::Resources &res, + std::span userSgprs) { + this->userSgprs = userSgprs; + for (auto &pointer : res.pointers) { + auto pointerBase = eval(pointer.base).zExtScalar(); + auto pointerOffset = eval(pointer.offset).zExtScalar(); + + if (!pointerBase || !pointerOffset) { + res.dump(); + rx::die("failed to evaluate pointer"); + } + + bufferMemoryTable.map(*pointerBase, + *pointerBase + *pointerOffset + pointer.size, + Access::Read); + resourceSlotToAddress.push_back({pointer.resourceSlot, *pointerBase}); + } + + for (auto &bufferRes : res.buffers) { + auto word0 = eval(bufferRes.words[0]).zExtScalar(); + auto word1 = eval(bufferRes.words[1]).zExtScalar(); + auto word2 = eval(bufferRes.words[2]).zExtScalar(); + auto word3 = eval(bufferRes.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate V#"); + } + + gnm::VBuffer buffer{}; + std::memcpy(reinterpret_cast(&buffer), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 1, &*word1, + 
sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 3, &*word3, + sizeof(std::uint32_t)); + + bufferMemoryTable.map(buffer.address(), buffer.address() + buffer.size(), + bufferRes.access); + resourceSlotToAddress.push_back( + {bufferRes.resourceSlot, buffer.address()}); + } + + for (auto &texture : res.textures) { + auto word0 = eval(texture.words[0]).zExtScalar(); + auto word1 = eval(texture.words[1]).zExtScalar(); + auto word2 = eval(texture.words[2]).zExtScalar(); + auto word3 = eval(texture.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate 128 bit T#"); + } + + gnm::TBuffer buffer{}; + std::memcpy(reinterpret_cast(&buffer), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 1, &*word1, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 3, &*word3, + sizeof(std::uint32_t)); + + if (texture.words[4] != nullptr) { + auto word4 = eval(texture.words[4]).zExtScalar(); + auto word5 = eval(texture.words[5]).zExtScalar(); + auto word6 = eval(texture.words[6]).zExtScalar(); + auto word7 = eval(texture.words[7]).zExtScalar(); + + if (!word4 || !word5 || !word6 || !word7) { + res.dump(); + rx::die("failed to evaluate 256 bit T#"); + } + + std::memcpy(reinterpret_cast(&buffer) + 4, &*word4, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 5, &*word5, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 6, &*word6, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 7, &*word7, + sizeof(std::uint32_t)); + } + + std::vector *resources = nullptr; + + switch (buffer.type) { + case gnm::TextureType::Array1D: + case gnm::TextureType::Dim1D: + resources = &imageResources[0]; + break; + case gnm::TextureType::Dim2D: + case gnm::TextureType::Array2D: + case gnm::TextureType::Msaa2D: + case gnm::TextureType::MsaaArray2D: + case gnm::TextureType::Cube: + resources = &imageResources[1]; + break; + case gnm::TextureType::Dim3D: + resources = &imageResources[2]; + break; + } + + rx::dieIf(resources == nullptr, + "ShaderResources: unexpected texture type %u", + static_cast(buffer.type)); + + slotResources[texture.resourceSlot] = resources->size(); + resources->push_back(cacheTag->getImageView( + amdgpu::ImageViewKey::createFrom(buffer), texture.access)); + } + + for (auto &sampler : res.samplers) { + auto word0 = eval(sampler.words[0]).zExtScalar(); + auto word1 = eval(sampler.words[1]).zExtScalar(); + auto word2 = eval(sampler.words[2]).zExtScalar(); + auto word3 = eval(sampler.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate S#"); + } + + gnm::SSampler sSampler{}; + std::memcpy(reinterpret_cast(&sSampler), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&sSampler) + 1, &*word1, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&sSampler) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&sSampler) + 3, &*word3, + sizeof(std::uint32_t)); + + if (sampler.unorm) { + sSampler.force_unorm_coords = true; + } + + slotResources[sampler.resourceSlot] = samplerResources.size(); + samplerResources.push_back( + cacheTag->getSampler(amdgpu::SamplerKey::createFrom(sSampler))); + } + } + + void buildMemoryTable(MemoryTable &memoryTable) { + memoryTable.count = 0; + + for (auto p : 
bufferMemoryTable) { + auto size = p.endAddress - p.beginAddress; + auto buffer = cacheTag->getBuffer(p.beginAddress, size, p.payload); + + auto memoryTableSlot = memoryTable.count; + memoryTable.slots[memoryTable.count++] = { + .address = p.beginAddress, + .size = size, + .flags = static_cast(p.payload), + .deviceAddress = buffer.deviceAddress, + }; + + for (auto [slot, address] : resourceSlotToAddress) { + if (address >= p.beginAddress && address < p.endAddress) { + slotResources[slot] = memoryTableSlot; + } + } + } + } + + std::uint32_t getResourceSlot(std::uint32_t id) { + if (auto it = slotResources.find(id); it != slotResources.end()) { + return it->second; + } + return -1; + } + + template T readPointer(std::uint64_t address) { + T result{}; + cacheTag->readMemory(&result, address, sizeof(result)); + return result; + } + + eval::Value eval(ir::InstructionId instId, + std::span operands) override { + if (instId == ir::amdgpu::POINTER) { + auto type = operands[0].getAsValue(); + auto loadSize = *operands[1].getAsInt32(); + auto base = eval(operands[2]).zExtScalar(); + auto offset = eval(operands[3]).zExtScalar(); + + if (!base || !offset) { + rx::die("failed to evaluate pointer dependency"); + } + + eval::Value result; + auto address = *base + *offset; + + switch (loadSize) { + case 1: + result = readPointer(address); + break; + case 2: + result = readPointer(address); + break; + case 4: + result = readPointer(address); + break; + case 8: + result = readPointer(address); + break; + case 12: + result = readPointer(address); + break; + case 16: + result = readPointer(address); + break; + case 32: + result = readPointer>(address); + break; + default: + rx::die("unexpected pointer load size"); + } + + return result; + } + + if (instId == ir::amdgpu::VBUFFER) { + rx::die("resource depends on buffer value"); + } + + if (instId == ir::amdgpu::TBUFFER) { + rx::die("resource depends on texture value"); + } + + if (instId == ir::amdgpu::SAMPLER) { + rx::die("resource depends on sampler value"); + } + + if (instId == ir::amdgpu::USER_SGPR) { + auto index = static_cast(*operands[1].getAsInt32()); + rx::dieIf(index >= userSgprs.size(), "out of user sgprs"); + return userSgprs[index]; + } + + if (instId == ir::amdgpu::IMM) { + auto address = static_cast(*operands[1].getAsInt64()); + + std::uint32_t result; + cacheTag->readMemory(&result, address, sizeof(result)); + return result; + } + + return Evaluator::eval(instId, operands); + } +}; + +void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, + std::uint32_t vertexCount, std::uint32_t firstInstance, + std::uint32_t instanceCount, std::uint64_t indiciesAddress, + std::uint32_t indexCount) { + if (pipe.uConfig.vgtPrimitiveType == gnm::PrimitiveType::None) { + return; + } + + auto cacheTag = pipe.device->getCacheTag(vmId, pipe.scheduler); + auto targetMask = pipe.context.cbTargetMask.raw; + + VkRenderingAttachmentInfo colorAttachments[8]{}; + VkBool32 colorBlendEnable[8]{}; + VkColorBlendEquationEXT colorBlendEquation[8]{}; + VkColorComponentFlags colorWriteMask[8]{}; + VkViewport viewPorts[8]{}; + VkRect2D viewPortScissors[8]{}; + unsigned renderTargets = 0; + + for (auto &cbColor : pipe.context.cbColor) { + if (targetMask == 0) { + break; + } + + auto viewPortScissor = pipe.context.paScScreenScissor; + // viewPortScissor = gnm::intersection( + // viewPortScissor, pipe.context.paScVportScissor[renderTargets]); + // viewPortScissor = + // gnm::intersection(viewPortScissor, pipe.context.paScWindowScissor); + // viewPortScissor = + 
// gnm::intersection(viewPortScissor, pipe.context.paScGenericScissor); + + auto viewPortRect = gnm::toVkRect2D(viewPortScissor); + + viewPorts[renderTargets].x = viewPortRect.offset.x; + viewPorts[renderTargets].y = viewPortRect.offset.y; + viewPorts[renderTargets].width = viewPortRect.extent.width; + viewPorts[renderTargets].height = viewPortRect.extent.height; + viewPorts[renderTargets].minDepth = + pipe.context.paScVportZ[renderTargets].min; + viewPorts[renderTargets].maxDepth = + pipe.context.paScVportZ[renderTargets].max; + + auto vkViewPortScissor = gnm::toVkRect2D(viewPortScissor); + viewPortScissors[renderTargets] = vkViewPortScissor; + + ImageViewKey renderTargetInfo{}; + renderTargetInfo.type = gnm::TextureType::Dim2D; + renderTargetInfo.pitch = vkViewPortScissor.extent.width; + renderTargetInfo.address = cbColor.base << 8; + renderTargetInfo.extent.width = vkViewPortScissor.extent.width; + renderTargetInfo.extent.height = vkViewPortScissor.extent.height; + renderTargetInfo.extent.depth = 1; + renderTargetInfo.dfmt = cbColor.info.dfmt; + renderTargetInfo.nfmt = cbColor.info.nfmt; + renderTargetInfo.mipCount = 1; + renderTargetInfo.arrayLayerCount = 1; + + renderTargetInfo.tileMode = + cbColor.info.linearGeneral + ? TileMode{.raw = 0} + : getDefaultTileModes()[/*cbColor.attrib.tileModeIndex*/ + 13]; + + // std::printf("draw to %lx\n", renderTargetInfo.address); + + auto access = Access::None; + + if (!cbColor.info.fastClear) { + access |= Access::Read; + } + if (targetMask & 0xf) { + access |= Access::Write; + } + + auto imageView = cacheTag.getImageView(renderTargetInfo, access); + + colorAttachments[renderTargets] = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = imageView.handle, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .loadOp = cbColor.info.fastClear ? VK_ATTACHMENT_LOAD_OP_CLEAR + : VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + + .clearValue = + { + .color = + { + .uint32 = + { + cbColor.clearWord0, + cbColor.clearWord1, + cbColor.clearWord2, + }, + }, + }, + }; + + auto &blendControl = pipe.context.cbBlendControl[renderTargets]; + + colorBlendEnable[renderTargets] = blendControl.enable; + colorBlendEquation[renderTargets] = VkColorBlendEquationEXT{ + .srcColorBlendFactor = gnm::toVkBlendFactor(blendControl.colorSrcBlend), + .dstColorBlendFactor = gnm::toVkBlendFactor(blendControl.colorDstBlend), + .colorBlendOp = gnm::toVkBlendOp(blendControl.colorCombFcn), + .srcAlphaBlendFactor = + blendControl.separateAlphaBlend + ? gnm::toVkBlendFactor(blendControl.alphaSrcBlend) + : gnm::toVkBlendFactor(blendControl.colorSrcBlend), + .dstAlphaBlendFactor = + blendControl.separateAlphaBlend + ? gnm::toVkBlendFactor(blendControl.alphaDstBlend) + : gnm::toVkBlendFactor(blendControl.colorDstBlend), + .alphaBlendOp = blendControl.separateAlphaBlend + ? gnm::toVkBlendOp(blendControl.alphaCombFcn) + : gnm::toVkBlendOp(blendControl.colorCombFcn), + }; + + colorWriteMask[renderTargets] = + ((targetMask & 1) ? VK_COLOR_COMPONENT_R_BIT : 0) | + ((targetMask & 2) ? VK_COLOR_COMPONENT_G_BIT : 0) | + ((targetMask & 4) ? VK_COLOR_COMPONENT_B_BIT : 0) | + ((targetMask & 8) ? 
VK_COLOR_COMPONENT_A_BIT : 0); + + renderTargets++; + targetMask >>= 4; + } + + // if (pipe.context.cbTargetMask == 0) { + // return; + // } + + // auto cache = pipe.device->getCache(vmId); + + if (indiciesAddress == 0) { + indexCount = vertexCount; + } + + auto indexBuffer = cacheTag.getIndexBuffer(indiciesAddress, indexCount, + pipe.uConfig.vgtPrimitiveType, + pipe.uConfig.vgtIndexType); + + auto stages = Cache::kGraphicsStages; + VkShaderEXT shaders[stages.size()]{}; + + auto pipelineLayout = cacheTag.getGraphicsPipelineLayout(); + auto descriptorSets = cacheTag.createGraphicsDescriptorSets(); + + std::vector descriptorBuffers; + auto &memoryTableBuffer = cacheTag.getCache()->getMemoryTableBuffer(); + std::uint64_t memoryTableAddress = memoryTableBuffer.getAddress(); + auto memoryTable = std::bit_cast(memoryTableBuffer.getData()); + + std::uint64_t gdsAddress = cacheTag.getCache()->getGdsBuffer().getAddress(); + ShaderResources shaderResources; + shaderResources.cacheTag = &cacheTag; + + struct MemoryTableConfigSlot { + std::uint32_t bufferIndex; + std::uint32_t configIndex; + std::uint32_t resourceSlot; + }; + std::vector memoryTableConfigSlots; + + auto addShader = [&](const SpiShaderPgm &pgm, shader::gcn::Stage stage) { + shader::gcn::Environment env{ + .vgprCount = pgm.rsrc1.getVGprCount(), + .sgprCount = pgm.rsrc1.getSGprCount(), + .userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr), + .supportsBarycentric = vk::context->supportsBarycentric, + .supportsInt8 = vk::context->supportsInt8, + .supportsInt64Atomics = vk::context->supportsInt64Atomics, + // .supportsBarycentric = false, + // .supportsInt8 = false, + // .supportsInt64Atomics = false, + }; + + auto shader = cacheTag.getShader({ + .address = pgm.address << 8, + .stage = stage, + .env = env, + }); + + shaderResources.loadResources( + shader.info->resources, + std::span(pgm.userData.data(), pgm.rsrc2.userSgpr)); + + const auto &configSlots = shader.info->configSlots; + + auto configSize = configSlots.size() * sizeof(std::uint32_t); + auto configBuffer = cacheTag.getInternalBuffer(configSize); + + auto configPtr = reinterpret_cast(configBuffer.data); + + shader::gcn::PsVGprInput + psVgprInput[static_cast(shader::gcn::PsVGprInput::Count)]; + std::size_t psVgprInputs = 0; + + if (stage == shader::gcn::Stage::Ps) { + SpiPsInput spiInputAddr = pipe.context.spiPsInputAddr; + + if (spiInputAddr.perspSampleEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspSample; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspSample; + } + if (spiInputAddr.perspCenterEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspCenter; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspCenter; + } + if (spiInputAddr.perspCentroidEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspCentroid; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspCentroid; + } + if (spiInputAddr.perspPullModelEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IW; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JW; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::_1W; + } + if (spiInputAddr.linearSampleEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearSample; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearSample; + } + if (spiInputAddr.linearCenterEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearCenter; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearCenter; + } 
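+      // psVgprInput is filled in declaration order of SpiPsInput, which is
+      // assumed to match the hardware VGPR input order; PsInputVGpr config
+      // slots below index into this table by position. The ancillary,
+      // sample-coverage and fixed-point-position inputs are recognised but
+      // still die() as unimplemented.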
+ if (spiInputAddr.linearCentroidEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearCentroid; + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearCentroid; + } + if (spiInputAddr.posXFloatEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::X; + } + if (spiInputAddr.posYFloatEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Y; + } + if (spiInputAddr.posZFloatEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Z; + } + if (spiInputAddr.posWFloatEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::W; + } + if (spiInputAddr.frontFaceEna) { + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::FrontFace; + } + if (spiInputAddr.ancillaryEna) { + rx::die("unimplemented ancillary fs input"); + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Ancillary; + } + if (spiInputAddr.sampleCoverageEna) { + rx::die("unimplemented sample coverage fs input"); + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::SampleCoverage; + } + if (spiInputAddr.posFixedPtEna) { + rx::die("unimplemented pos fixed fs input"); + psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::PosFixed; + } + } + + for (std::size_t index = 0; const auto &slot : configSlots) { + switch (slot.type) { + case shader::gcn::ConfigType::Imm: + cacheTag.readMemory(&configPtr[index], slot.data, + sizeof(std::uint32_t)); + break; + case shader::gcn::ConfigType::UserSgpr: + configPtr[index] = pgm.userData[slot.data]; + break; + case shader::gcn::ConfigType::ViewPortOffsetX: + configPtr[index] = std::bit_cast( + pipe.context.paClVports[0].xOffset / (viewPorts[0].width / 2.f) - + 1); + break; + case shader::gcn::ConfigType::ViewPortOffsetY: + configPtr[index] = std::bit_cast( + pipe.context.paClVports[0].yOffset / (viewPorts[0].height / 2.f) - + 1); + break; + case shader::gcn::ConfigType::ViewPortOffsetZ: + configPtr[index] = + std::bit_cast(pipe.context.paClVports[0].zOffset); + break; + case shader::gcn::ConfigType::ViewPortScaleX: + configPtr[index] = std::bit_cast( + pipe.context.paClVports[0].xScale / (viewPorts[0].width / 2.f)); + break; + case shader::gcn::ConfigType::ViewPortScaleY: + configPtr[index] = std::bit_cast( + pipe.context.paClVports[0].yScale / (viewPorts[0].height / 2.f)); + break; + case shader::gcn::ConfigType::ViewPortScaleZ: + configPtr[index] = + std::bit_cast(pipe.context.paClVports[0].zScale); + break; + case shader::gcn::ConfigType::PsInputVGpr: + if (slot.data > psVgprInputs) { + configPtr[index] = ~0; + } else { + configPtr[index] = + std::bit_cast(psVgprInput[slot.data]); + } + break; + case shader::gcn::ConfigType::VsPrimType: + if (indexBuffer.handle == VK_NULL_HANDLE && + pipe.uConfig.vgtPrimitiveType != indexBuffer.primType) { + configPtr[index] = + static_cast(pipe.uConfig.vgtPrimitiveType.value); + } else { + configPtr[index] = 0; + } + break; + + case shader::gcn::ConfigType::ResourceSlot: + memoryTableConfigSlots.push_back({ + .bufferIndex = static_cast(descriptorBuffers.size()), + .configIndex = static_cast(index), + .resourceSlot = static_cast(slot.data), + }); + break; + + case shader::gcn::ConfigType::MemoryTable: + if (slot.data == 0) { + configPtr[index] = static_cast(memoryTableAddress); + } else { + configPtr[index] = + static_cast(memoryTableAddress >> 32); + } + break; + case shader::gcn::ConfigType::Gds: + if (slot.data == 0) { + configPtr[index] = static_cast(gdsAddress); + } else { + configPtr[index] = static_cast(gdsAddress >> 32); + } + break; + + case shader::gcn::ConfigType::CbCompSwap: + 
configPtr[index] = std::bit_cast( + pipe.context.cbColor[slot.data].info.compSwap); + break; + } + + ++index; + } + + VkDescriptorBufferInfo bufferInfo{ + .buffer = configBuffer.handle, + .offset = configBuffer.offset, + .range = configSize, + }; + + auto stageIndex = Cache::getStageIndex(shader.stage); + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSets[stageIndex], + .dstBinding = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .pBufferInfo = &bufferInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + + shaders[stageIndex] = shader.handle + ? shader.handle + : getFillRedFragShader(*cacheTag.getCache()); + descriptorBuffers.push_back(configPtr); + }; + + if (pipe.context.vgtShaderStagesEn.vsEn == amdgpu::VsStage::VsReal) { + addShader(pipe.sh.spiShaderPgmVs, shader::gcn::Stage::VsVs); + } + + if (true) { + addShader(pipe.sh.spiShaderPgmPs, shader::gcn::Stage::Ps); + } else { + shaders[Cache::getStageIndex(VK_SHADER_STAGE_FRAGMENT_BIT)] = + getFillRedFragShader(*cacheTag.getCache()); + } + + if (pipe.uConfig.vgtPrimitiveType == gnm::PrimitiveType::RectList) { + shaders[Cache::getStageIndex(VK_SHADER_STAGE_GEOMETRY_BIT)] = + getPrimTypeRectGeomShader(*cacheTag.getCache()); + } + + if (indiciesAddress == 0) { + vertexCount = indexBuffer.indexCount; + } + + auto commandBuffer = pipe.scheduler.getCommandBuffer(); + + VkRenderingInfo renderInfo{ + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = gnm::toVkRect2D(pipe.context.paScScreenScissor), + .layerCount = 1, + .colorAttachmentCount = renderTargets, + .pColorAttachments = colorAttachments, + // .pDepthAttachment = &depthAttachment, + // .pStencilAttachment = &stencilAttachment, + }; + + vkCmdBeginRendering(commandBuffer, &renderInfo); + vkCmdSetRasterizerDiscardEnable(commandBuffer, VK_FALSE); + + vkCmdSetViewportWithCount(commandBuffer, renderTargets, viewPorts); + vkCmdSetScissorWithCount(commandBuffer, renderTargets, viewPortScissors); + + vk::CmdSetColorBlendEnableEXT(commandBuffer, 0, renderTargets, + colorBlendEnable); + vk::CmdSetColorBlendEquationEXT(commandBuffer, 0, renderTargets, + colorBlendEquation); + + vk::CmdSetDepthClampEnableEXT(commandBuffer, VK_FALSE); + vkCmdSetDepthCompareOp(commandBuffer, + gnm::toVkCompareOp(pipe.context.dbDepthControl.zFunc)); + vkCmdSetDepthTestEnable(commandBuffer, pipe.context.dbDepthControl.depthEnable + ? VK_TRUE + : VK_FALSE); + vkCmdSetDepthWriteEnable( + commandBuffer, + pipe.context.dbDepthControl.depthWriteEnable ? VK_TRUE : VK_FALSE); + vkCmdSetDepthBounds(commandBuffer, pipe.context.dbDepthBoundsMin, + pipe.context.dbDepthBoundsMax); + vkCmdSetDepthBoundsTestEnable( + commandBuffer, + pipe.context.dbDepthControl.depthBoundsEnable ? 
VK_TRUE : VK_FALSE); + // vkCmdSetStencilOp(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, + // VK_STENCIL_OP_KEEP, VK_STENCIL_OP_KEEP, + // VK_STENCIL_OP_KEEP, VK_COMPARE_OP_ALWAYS); + + vkCmdSetDepthBiasEnable(commandBuffer, VK_FALSE); + vkCmdSetDepthBias(commandBuffer, 0, 1, 1); + vkCmdSetPrimitiveRestartEnable(commandBuffer, VK_FALSE); + + vk::CmdSetAlphaToOneEnableEXT(commandBuffer, VK_FALSE); + + vk::CmdSetLogicOpEnableEXT(commandBuffer, VK_FALSE); + vk::CmdSetLogicOpEXT(commandBuffer, VK_LOGIC_OP_AND); + vk::CmdSetPolygonModeEXT(commandBuffer, VK_POLYGON_MODE_FILL); + vk::CmdSetRasterizationSamplesEXT(commandBuffer, VK_SAMPLE_COUNT_1_BIT); + VkSampleMask sampleMask = ~0; + vk::CmdSetSampleMaskEXT(commandBuffer, VK_SAMPLE_COUNT_1_BIT, &sampleMask); + vk::CmdSetTessellationDomainOriginEXT( + commandBuffer, VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT); + vk::CmdSetAlphaToCoverageEnableEXT(commandBuffer, VK_FALSE); + vk::CmdSetVertexInputEXT(commandBuffer, 0, nullptr, 0, nullptr); + vk::CmdSetColorWriteMaskEXT(commandBuffer, 0, renderTargets, colorWriteMask); + + vkCmdSetStencilCompareMask(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0); + vkCmdSetStencilWriteMask(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0); + vkCmdSetStencilReference(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0); + + VkCullModeFlags cullMode = VK_CULL_MODE_NONE; + if (pipe.context.paSuScModeCntl.cullBack) { + cullMode |= VK_CULL_MODE_BACK_BIT; + } + if (pipe.context.paSuScModeCntl.cullFront) { + cullMode |= VK_CULL_MODE_FRONT_BIT; + } + + vkCmdSetCullMode(commandBuffer, cullMode); + vkCmdSetFrontFace(commandBuffer, + gnm::toVkFrontFace(pipe.context.paSuScModeCntl.face)); + + vkCmdSetPrimitiveTopology(commandBuffer, + toVkPrimitiveType(pipe.uConfig.vgtPrimitiveType)); + vkCmdSetStencilTestEnable(commandBuffer, VK_FALSE); + + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipelineLayout, 0, descriptorSets.size(), + descriptorSets.data(), 0, nullptr); + + vk::CmdBindShadersEXT(commandBuffer, stages.size(), stages.data(), shaders); + + shaderResources.buildMemoryTable(*memoryTable); + + for (auto &sampler : shaderResources.samplerResources) { + uint32_t index = &sampler - shaderResources.samplerResources.data(); + + VkDescriptorImageInfo samplerInfo{.sampler = sampler.handle}; + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSets[0], + .dstBinding = Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLER), + .dstArrayElement = index, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER, + .pImageInfo = &samplerInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + } + + for (auto &imageResources : shaderResources.imageResources) { + auto dim = (&imageResources - shaderResources.imageResources) + 1; + for (auto &image : imageResources) { + uint32_t index = &image - imageResources.data(); + + VkDescriptorImageInfo imageInfo{ + .imageView = image.handle, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSets[0], + .dstBinding = static_cast(Cache::getDescriptorBinding( + VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, dim)), + .dstArrayElement = index, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .pImageInfo = &imageInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + } + } + + for (auto &mtConfig : 
memoryTableConfigSlots) { + auto config = descriptorBuffers[mtConfig.bufferIndex]; + config[mtConfig.configIndex] = + shaderResources.getResourceSlot(mtConfig.resourceSlot); + } + + if (indexBuffer.handle != VK_NULL_HANDLE) { + vkCmdBindIndexBuffer(commandBuffer, indexBuffer.handle, indexBuffer.offset, + gnm::toVkIndexType(indexBuffer.indexType)); + vkCmdDrawIndexed(commandBuffer, indexCount, instanceCount, 0, firstVertex, + firstInstance); + } else { + vkCmdDraw(commandBuffer, vertexCount, instanceCount, firstVertex, + firstInstance); + } + + vkCmdEndRendering(commandBuffer); + pipe.scheduler.submit(); + pipe.scheduler.then([=, cacheTag = std::move(cacheTag), + shaderResources = std::move(shaderResources)] {}); +} + +static void +transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, + VkImageLayout oldLayout, VkImageLayout newLayout, + const VkImageSubresourceRange &subresourceRange) { + VkImageMemoryBarrier barrier{}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image; + barrier.subresourceRange = subresourceRange; + + auto layoutToStageAccess = [](VkImageLayout layout) + -> std::pair { + switch (layout) { + case VK_IMAGE_LAYOUT_UNDEFINED: + case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: + case VK_IMAGE_LAYOUT_GENERAL: + return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0}; + + case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: + return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT}; + + case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: + return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT}; + + case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: + return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT}; + + case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: + return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT}; + + case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: + return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT}; + + default: + std::abort(); + } + }; + + auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout); + auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout); + + barrier.srcAccessMask = sourceAccess; + barrier.dstAccessMask = destinationAccess; + + vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0, + nullptr, 0, nullptr, 1, &barrier); +} + +void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer, + VkExtent2D targetExtent, std::uint64_t address, + VkImageView target, VkExtent2D imageExtent, + CbCompSwap compSwap, TileMode tileMode, gnm::DataFormat dfmt, + gnm::NumericFormat nfmt) { + auto pipelineLayout = cacheTag.getGraphicsPipelineLayout(); + auto descriptorSets = cacheTag.createGraphicsDescriptorSets(); + + ImageViewKey framebuffer{}; + framebuffer.type = gnm::TextureType::Dim2D; + framebuffer.pitch = imageExtent.width; + framebuffer.address = address; + framebuffer.extent.width = imageExtent.width; + framebuffer.extent.height = imageExtent.height; + framebuffer.extent.depth = 1; + framebuffer.dfmt = dfmt; + framebuffer.nfmt = nfmt; + framebuffer.mipCount = 1; + framebuffer.arrayLayerCount = 1; + framebuffer.tileMode = tileMode; + + switch (compSwap) { + case CbCompSwap::Std: + framebuffer.R = 
gnm::Swizzle::R; + framebuffer.G = gnm::Swizzle::G; + framebuffer.B = gnm::Swizzle::B; + framebuffer.A = gnm::Swizzle::A; + break; + case CbCompSwap::Alt: + framebuffer.R = gnm::Swizzle::B; + framebuffer.G = gnm::Swizzle::G; + framebuffer.B = gnm::Swizzle::R; + framebuffer.A = gnm::Swizzle::A; + break; + case CbCompSwap::StdRev: + framebuffer.R = gnm::Swizzle::A; + framebuffer.G = gnm::Swizzle::B; + framebuffer.B = gnm::Swizzle::G; + framebuffer.A = gnm::Swizzle::R; + break; + case CbCompSwap::AltRev: + framebuffer.R = gnm::Swizzle::A; + framebuffer.G = gnm::Swizzle::R; + framebuffer.B = gnm::Swizzle::G; + framebuffer.A = gnm::Swizzle::B; + break; + } + + SamplerKey framebufferSampler = { + .magFilter = VK_FILTER_LINEAR, + .minFilter = VK_FILTER_LINEAR, + }; + + auto imageView = cacheTag.getImageView(framebuffer, Access::Read); + auto sampler = cacheTag.getSampler(framebufferSampler); + + cacheTag.submitAndWait(); + + VkDescriptorImageInfo imageInfo{ + .sampler = sampler.handle, + .imageView = imageView.handle, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + VkWriteDescriptorSet writeDescSet[]{ + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSets[0], + .dstBinding = + Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 2), + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .pImageInfo = &imageInfo, + }, + { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSets[0], + .dstBinding = Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLER), + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER, + .pImageInfo = &imageInfo, + }}; + + vkUpdateDescriptorSets(vk::context->device, std::size(writeDescSet), + writeDescSet, 0, nullptr); + + VkRenderingAttachmentInfo colorAttachments[1]{{ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = target, + .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .clearValue = {}, + }}; + VkBool32 colorBlendEnable[1]{VK_FALSE}; + VkColorBlendEquationEXT colorBlendEquation[1]{}; + VkColorComponentFlags colorWriteMask[1]{ + VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT}; + VkViewport viewPorts[1]{ + { + .width = float(targetExtent.width), + .height = float(targetExtent.height), + }, + }; + + VkRect2D viewPortScissors[1]{{ + {}, + targetExtent, + }}; + + VkRenderingInfo renderInfo{ + .sType = VK_STRUCTURE_TYPE_RENDERING_INFO, + .renderArea = + { + .offset = {}, + .extent = targetExtent, + }, + .layerCount = 1, + .colorAttachmentCount = 1, + .pColorAttachments = colorAttachments, + }; + + vkCmdBeginRendering(commandBuffer, &renderInfo); + vkCmdSetRasterizerDiscardEnable(commandBuffer, VK_FALSE); + + vkCmdSetViewportWithCount(commandBuffer, 1, viewPorts); + vkCmdSetScissorWithCount(commandBuffer, 1, viewPortScissors); + + vk::CmdSetColorBlendEnableEXT(commandBuffer, 0, 1, colorBlendEnable); + vk::CmdSetColorBlendEquationEXT(commandBuffer, 0, 1, colorBlendEquation); + + vk::CmdSetDepthClampEnableEXT(commandBuffer, VK_FALSE); + vkCmdSetDepthTestEnable(commandBuffer, VK_FALSE); + vkCmdSetDepthWriteEnable(commandBuffer, VK_FALSE); + vkCmdSetDepthBounds(commandBuffer, 0.0f, 1.0f); + vkCmdSetDepthBoundsTestEnable(commandBuffer, VK_FALSE); + + vkCmdSetDepthBiasEnable(commandBuffer, VK_FALSE); + vkCmdSetDepthBias(commandBuffer, 0, 1, 1); + vkCmdSetPrimitiveRestartEnable(commandBuffer, VK_FALSE); + + 
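+  // flip() is a plain full-screen textured pass, so everything past the
+  // viewport/scissor setup is pinned to defaults: blending, depth, stencil
+  // and logic ops are disabled, and rasterization is single-sampled fill mode.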
vk::CmdSetAlphaToOneEnableEXT(commandBuffer, VK_FALSE); + + vk::CmdSetLogicOpEnableEXT(commandBuffer, VK_FALSE); + vk::CmdSetLogicOpEXT(commandBuffer, VK_LOGIC_OP_AND); + vk::CmdSetPolygonModeEXT(commandBuffer, VK_POLYGON_MODE_FILL); + vk::CmdSetRasterizationSamplesEXT(commandBuffer, VK_SAMPLE_COUNT_1_BIT); + VkSampleMask sampleMask = ~0; + vk::CmdSetSampleMaskEXT(commandBuffer, VK_SAMPLE_COUNT_1_BIT, &sampleMask); + vk::CmdSetTessellationDomainOriginEXT( + commandBuffer, VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT); + vk::CmdSetAlphaToCoverageEnableEXT(commandBuffer, VK_FALSE); + vk::CmdSetVertexInputEXT(commandBuffer, 0, nullptr, 0, nullptr); + vk::CmdSetColorWriteMaskEXT(commandBuffer, 0, 1, colorWriteMask); + + vkCmdSetStencilCompareMask(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0); + vkCmdSetStencilWriteMask(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0); + vkCmdSetStencilReference(commandBuffer, VK_STENCIL_FACE_FRONT_AND_BACK, 0); + + vkCmdSetCullMode(commandBuffer, VK_CULL_MODE_NONE); + vkCmdSetFrontFace(commandBuffer, VK_FRONT_FACE_CLOCKWISE); + + vkCmdSetPrimitiveTopology(commandBuffer, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST); + vkCmdSetStencilTestEnable(commandBuffer, VK_FALSE); + + auto stages = Cache::kGraphicsStages; + VkShaderEXT shaders[stages.size()]{}; + + shaders[Cache::getStageIndex(VK_SHADER_STAGE_VERTEX_BIT)] = + getFlipVertexShader(*cacheTag.getCache()); + + shaders[Cache::getStageIndex(VK_SHADER_STAGE_FRAGMENT_BIT)] = + getFlipFragmentShader(*cacheTag.getCache()); + + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipelineLayout, 0, descriptorSets.size(), + descriptorSets.data(), 0, nullptr); + + vk::CmdBindShadersEXT(commandBuffer, stages.size(), stages.data(), shaders); + + vkCmdDraw(commandBuffer, 6, 1, 0, 0); + + vkCmdEndRendering(commandBuffer); + + // { + // VkImageMemoryBarrier barrier{ + // .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + // .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT, + // .dstAccessMask = VK_ACCESS_NONE, + // .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + // .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + // .image = imageView.imageHandle, + // .subresourceRange = + // { + // .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + // .levelCount = 1, + // .layerCount = 1, + // }, + // }; + + // vkCmdPipelineBarrier(commandBuffer, + // VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + // VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, 0, nullptr, + // 0, nullptr, 1, &barrier); + // } +} diff --git a/rpcsx-gpu2/Renderer.hpp b/rpcsx-gpu2/Renderer.hpp new file mode 100644 index 00000000..20102091 --- /dev/null +++ b/rpcsx-gpu2/Renderer.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "Cache.hpp" +#include "Pipe.hpp" +#include +#include + +namespace amdgpu { +void draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, + std::uint32_t vertexCount, std::uint32_t firstInstance, + std::uint32_t instanceCount, std::uint64_t indiciesAddress, + std::uint32_t indexCount); +void flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer, + VkExtent2D targetExtent, std::uint64_t address, VkImageView target, + VkExtent2D imageExtent, CbCompSwap compSwap, TileMode tileMode, + gnm::DataFormat dfmt, gnm::NumericFormat nfmt); +} // namespace amdgpu diff --git a/rpcsx-gpu2/lib/CMakeLists.txt b/rpcsx-gpu2/lib/CMakeLists.txt new file mode 100644 index 00000000..d31dba75 --- /dev/null +++ b/rpcsx-gpu2/lib/CMakeLists.txt @@ -0,0 +1,4 @@ +add_subdirectory(amdgpu-tiler) +add_subdirectory(gcn-shader) +add_subdirectory(vk) +add_subdirectory(gnm) diff 
--git a/rpcsx-gpu2/lib/amdgpu-tiler/CMakeLists.txt b/rpcsx-gpu2/lib/amdgpu-tiler/CMakeLists.txt new file mode 100644 index 00000000..f6eacaa6 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/CMakeLists.txt @@ -0,0 +1,22 @@ +add_precompiled_vulkan_spirv(amdgpu_tiler_vulkan_shaders + shaders/tiler1d.comp.glsl + shaders/tiler2d.comp.glsl + shaders/tilerLinear.comp.glsl + shaders/detiler1d.comp.glsl + shaders/detiler2d.comp.glsl + shaders/detilerLinear.comp.glsl +) + +add_library(amdgpu_tiler STATIC src/tiler.cpp) +target_include_directories(amdgpu_tiler PUBLIC include) + +add_library(amdgpu_tiler_cpu STATIC src/tiler_cpu.cpp) +add_library(amdgpu_tiler_vulkan STATIC src/tiler_vulkan.cpp) + +target_link_libraries(amdgpu_tiler PUBLIC gnm) +target_link_libraries(amdgpu_tiler_cpu PUBLIC amdgpu_tiler) +target_link_libraries(amdgpu_tiler_vulkan PUBLIC amdgpu_tiler amdgpu_tiler_vulkan_shaders vk) + +add_library(amdgpu::tiler ALIAS amdgpu_tiler) +add_library(amdgpu::tiler::cpu ALIAS amdgpu_tiler_cpu) +add_library(amdgpu::tiler::vulkan ALIAS amdgpu_tiler_vulkan) diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp new file mode 100644 index 00000000..d53d6f56 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp @@ -0,0 +1,505 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace amdgpu { +inline constexpr uint32_t kMicroTileWidth = 8; +inline constexpr uint32_t kMicroTileHeight = 8; +inline constexpr uint32_t kDramRowSize = 0x400; +inline constexpr uint32_t kPipeInterleaveBytes = 256; + +enum ArrayMode { + kArrayModeLinearGeneral = 0x00000000, + kArrayModeLinearAligned = 0x00000001, + kArrayMode1dTiledThin = 0x00000002, + kArrayMode1dTiledThick = 0x00000003, + kArrayMode2dTiledThin = 0x00000004, + kArrayModeTiledThinPrt = 0x00000005, + kArrayMode2dTiledThinPrt = 0x00000006, + kArrayMode2dTiledThick = 0x00000007, + kArrayMode2dTiledXThick = 0x00000008, + kArrayModeTiledThickPrt = 0x00000009, + kArrayMode2dTiledThickPrt = 0x0000000a, + kArrayMode3dTiledThinPrt = 0x0000000b, + kArrayMode3dTiledThin = 0x0000000c, + kArrayMode3dTiledThick = 0x0000000d, + kArrayMode3dTiledXThick = 0x0000000e, + kArrayMode3dTiledThickPrt = 0x0000000f, +}; + +enum MicroTileMode { + kMicroTileModeDisplay = 0x00000000, + kMicroTileModeThin = 0x00000001, + kMicroTileModeDepth = 0x00000002, + kMicroTileModeRotated = 0x00000003, + kMicroTileModeThick = 0x00000004, +}; + +enum PipeConfig { + kPipeConfigP8_32x32_8x16 = 0x0000000a, + kPipeConfigP8_32x32_16x16 = 0x0000000c, + kPipeConfigP16 = 0x00000012, +}; + +enum TileSplit { + kTileSplit64B = 0x00000000, + kTileSplit128B = 0x00000001, + kTileSplit256B = 0x00000002, + kTileSplit512B = 0x00000003, + kTileSplit1KB = 0x00000004, + kTileSplit2KB = 0x00000005, + kTileSplit4KB = 0x00000006, +}; + +enum SampleSplit { + kSampleSplit1 = 0x00000000, + kSampleSplit2 = 0x00000001, + kSampleSplit4 = 0x00000002, + kSampleSplit8 = 0x00000003, +}; + +enum NumBanks { + kNumBanks2 = 0x00000000, + kNumBanks4 = 0x00000001, + kNumBanks8 = 0x00000002, + kNumBanks16 = 0x00000003, +}; + +enum BankWidth { + kBankWidth1 = 0x00000000, + kBankWidth2 = 0x00000001, + kBankWidth4 = 0x00000002, + kBankWidth8 = 0x00000003, +}; + +enum BankHeight { + kBankHeight1 = 0x00000000, + kBankHeight2 = 0x00000001, + kBankHeight4 = 0x00000002, + kBankHeight8 = 0x00000003, +}; + +enum MacroTileAspect { + kMacroTileAspect1 = 0x00000000, + kMacroTileAspect2 = 0x00000001, + kMacroTileAspect4 = 0x00000002, + 
kMacroTileAspect8 = 0x00000003,
+};
+
+// Decoded view over a 32-bit tile-mode word; the setters mask the value into
+// place and return *this so fields can be chained.
+struct TileMode {
+  std::uint32_t raw;
+
+  constexpr ArrayMode arrayMode() const {
+    return ArrayMode((raw & 0x0000003c) >> 2);
+  }
+  constexpr PipeConfig pipeConfig() const {
+    return PipeConfig((raw & 0x000007c0) >> 6);
+  }
+  constexpr TileSplit tileSplit() const {
+    return TileSplit((raw & 0x00003800) >> 11);
+  }
+  constexpr MicroTileMode microTileMode() const {
+    return MicroTileMode((raw & 0x01c00000) >> 22);
+  }
+  constexpr SampleSplit sampleSplit() const {
+    return SampleSplit((raw & 0x06000000) >> 25);
+  }
+  constexpr std::uint32_t altPipeConfig() const {
+    return (raw & 0xf8000000) >> 27;
+  }
+
+  constexpr TileMode &arrayMode(ArrayMode mode) {
+    raw = (raw & ~0x0000003c) |
+          ((static_cast<std::uint32_t>(mode) << 2) & 0x0000003c);
+    return *this;
+  }
+  constexpr TileMode &pipeConfig(PipeConfig mode) {
+    raw = (raw & ~0x000007c0) |
+          ((static_cast<std::uint32_t>(mode) << 6) & 0x000007c0);
+    return *this;
+  }
+  constexpr TileMode &tileSplit(TileSplit mode) {
+    raw = (raw & ~0x00003800) |
+          ((static_cast<std::uint32_t>(mode) << 11) & 0x00003800);
+    return *this;
+  }
+  constexpr TileMode &microTileMode(MicroTileMode mode) {
+    raw = (raw & ~0x01c00000) |
+          ((static_cast<std::uint32_t>(mode) << 22) & 0x01c00000);
+    return *this;
+  }
+  constexpr TileMode &sampleSplit(SampleSplit mode) {
+    raw = (raw & ~0x06000000) |
+          ((static_cast<std::uint32_t>(mode) << 25) & 0x06000000);
+    return *this;
+  }
+};
+
+struct MacroTileMode {
+  std::uint32_t raw;
+
+  constexpr std::uint32_t bankWidth() const { return (raw & 0x00000003) >> 0; }
+  constexpr std::uint32_t bankHeight() const { return (raw & 0x0000000c) >> 2; }
+  constexpr MacroTileAspect macroTileAspect() const {
+    return MacroTileAspect((raw & 0x00000030) >> 4);
+  }
+  constexpr std::uint32_t numBanks() const { return (raw & 0x000000c0) >> 6; }
+
+  constexpr std::uint32_t altBankHeight() const {
+    return (raw & 0x00000300) >> 8;
+  }
+  constexpr std::uint32_t altMacroTileAspect() const {
+    return (raw & 0x00000c00) >> 10;
+  }
+  constexpr std::uint32_t altNumBanks() const {
+    return (raw & 0x00003000) >> 12;
+  }
+};
+
+struct SurfaceInfo {
+  std::uint32_t width;
+  std::uint32_t height;
+  std::uint32_t depth;
+  std::uint32_t pitch;
+  int arrayLayerCount;
+  int numFragments;
+  int bitsPerElement;
+  std::uint64_t totalSize;
+
+  struct SubresourceInfo {
+    std::uint32_t dataWidth;
+    std::uint32_t dataHeight;
+    std::uint32_t dataDepth;
+    std::uint64_t offset;
+    std::uint64_t tiledSize;
+    std::uint64_t linearSize;
+  };
+
+  SubresourceInfo subresources[16];
+
+  void setSubresourceInfo(int mipLevel, const SubresourceInfo &subresource) {
+    subresources[mipLevel] = subresource;
+  }
+
+  const SubresourceInfo &getSubresourceInfo(int mipLevel) const {
+    return subresources[mipLevel];
+  }
+};
+
+constexpr uint32_t getMicroTileThickness(ArrayMode arrayMode) {
+  switch (arrayMode) {
+  case kArrayMode1dTiledThick:
+  case kArrayMode2dTiledThick:
+  case kArrayMode3dTiledThick:
+  case kArrayModeTiledThickPrt:
+  case kArrayMode2dTiledThickPrt:
+  case kArrayMode3dTiledThickPrt:
+    return 4;
+  case kArrayMode2dTiledXThick:
+  case kArrayMode3dTiledXThick:
+    return 8;
+  case kArrayModeLinearGeneral:
+  case kArrayModeLinearAligned:
+  case kArrayMode1dTiledThin:
+  case kArrayMode2dTiledThin:
+  case kArrayModeTiledThinPrt:
+  case kArrayMode2dTiledThinPrt:
+  case kArrayMode3dTiledThinPrt:
+  case kArrayMode3dTiledThin:
+    return 1;
+  }
+
+  std::abort();
+}
+
+constexpr bool isMacroTiled(ArrayMode arrayMode) {
+  switch (arrayMode) {
+  case kArrayModeLinearGeneral:
+  case kArrayModeLinearAligned:
+  case kArrayMode1dTiledThin:
+  case kArrayMode1dTiledThick:
+    return false;
+  case kArrayMode2dTiledThin:
+  case kArrayModeTiledThinPrt:
+  case kArrayMode2dTiledThinPrt:
+  case kArrayMode2dTiledThick:
+  case kArrayMode2dTiledXThick:
+  case kArrayModeTiledThickPrt:
+  case kArrayMode2dTiledThickPrt:
+  case kArrayMode3dTiledThinPrt:
+  case kArrayMode3dTiledThin:
+  case kArrayMode3dTiledThick:
+  case kArrayMode3dTiledXThick:
+  case kArrayMode3dTiledThickPrt:
+    return true;
+  }
+
+  std::abort();
+}
+
+constexpr bool isPrt(ArrayMode arrayMode) {
+  switch (arrayMode) {
+  case kArrayModeLinearGeneral:
+  case kArrayModeLinearAligned:
+  case kArrayMode1dTiledThin:
+  case kArrayMode1dTiledThick:
+  case kArrayMode2dTiledThin:
+  case kArrayMode2dTiledThick:
+  case kArrayMode2dTiledXThick:
+  case kArrayMode3dTiledThin:
+  case kArrayMode3dTiledThick:
+  case kArrayMode3dTiledXThick:
+    return false;
+
+  case kArrayModeTiledThinPrt:
+  case kArrayMode2dTiledThinPrt:
+  case kArrayModeTiledThickPrt:
+  case kArrayMode2dTiledThickPrt:
+  case kArrayMode3dTiledThinPrt:
+  case kArrayMode3dTiledThickPrt:
+    return true;
+  }
+
+  std::abort();
+}
+
+// Default mode tables; the renderer indexes these with the tile-mode /
+// macro-tile-mode index taken from the register state.
+constexpr std::array<MacroTileMode, 16> getDefaultMacroTileModes() {
+  return {{
+      {.raw = 0x26e8},
+      {.raw = 0x26d4},
+      {.raw = 0x21d0},
+      {.raw = 0x21d0},
+      {.raw = 0x2080},
+      {.raw = 0x2040},
+      {.raw = 0x1000},
+      {.raw = 0x0000},
+      {.raw = 0x36ec},
+      {.raw = 0x26e8},
+      {.raw = 0x21d4},
+      {.raw = 0x20d0},
+      {.raw = 0x1080},
+      {.raw = 0x1040},
+      {.raw = 0x0000},
+      {.raw = 0x0000},
+  }};
+}
+
+constexpr std::array<TileMode, 32> getDefaultTileModes() {
+  return {{
+      {.raw = 0x90800310}, {.raw = 0x90800b10}, {.raw = 0x90801310},
+      {.raw = 0x90801b10}, {.raw = 0x90802310}, {.raw = 0x90800308},
+      {.raw = 0x90801318}, {.raw = 0x90802318}, {.raw = 0x90000304},
+      {.raw = 0x90000308}, {.raw = 0x92000310}, {.raw = 0x92000294},
+      {.raw = 0x92000318}, {.raw = 0x90400308}, {.raw = 0x92400310},
+      {.raw = 0x924002b0}, {.raw = 0x92400294}, {.raw = 0x92400318},
+      {.raw = 0x9240032c}, {.raw = 0x9100030c}, {.raw = 0x9100031c},
+      {.raw = 0x910002b4}, {.raw = 0x910002a4}, {.raw = 0x91000328},
+      {.raw = 0x910002bc}, {.raw = 0x91000320}, {.raw = 0x910002b8},
+      {.raw = 0x90c00308}, {.raw = 0x92c00310}, {.raw = 0x92c00294},
+      {.raw = 0x92c00318}, {.raw = 0x00000000},
+  }};
+}
+
+constexpr std::uint32_t getElementIndex(std::uint32_t x, std::uint32_t y,
+                                        std::uint32_t z,
+                                        std::uint32_t bitsPerElement,
+                                        MicroTileMode microTileMode,
+                                        ArrayMode arrayMode) {
+  std::uint32_t elem = 0;
+
+  if (microTileMode == kMicroTileModeDisplay) {
+    switch (bitsPerElement) {
+    case 8:
+      elem |= ((x >> 0) & 0x1) << 0;
+      elem |= ((x >> 1) & 0x1) << 1;
+      elem |= ((x >> 2) & 0x1) << 2;
+      elem |= ((y >> 1) & 0x1) << 3;
+      elem |= ((y >> 0) & 0x1) << 4;
+      elem |= ((y >> 2) & 0x1) << 5;
+      break;
+    case 16:
+      elem |= ((x >> 0) & 0x1) << 0;
+      elem |= ((x >> 1) & 0x1) << 1;
+      elem |= ((x >> 2) & 0x1) << 2;
+      elem |= ((y >> 0) & 0x1) << 3;
+      elem |= ((y >> 1) & 0x1) << 4;
+      elem |= ((y >> 2) & 0x1) << 5;
+      break;
+    case 32:
+      elem |= ((x >> 0) & 0x1) << 0;
+      elem |= ((x >> 1) & 0x1) << 1;
+      elem |= ((y >> 0) & 0x1) << 2;
+      elem |= ((x >> 2) & 0x1) << 3;
+      elem |= ((y >> 1) & 0x1) << 4;
+      elem |= ((y >> 2) & 0x1) << 5;
+      break;
+    case 64:
+      elem |= ((x >> 0) & 0x1) << 0;
+      elem |= ((y >> 0) & 0x1) << 1;
+      elem |= ((x >> 1) & 0x1) << 2;
+      elem |= ((x >> 2) & 0x1) << 3;
+      elem |= ((y >> 1) & 0x1) << 4;
+      elem |= ((y >> 2) & 0x1) << 5;
+      break;
+    default:
+      std::abort();
+    }
+  } else if (microTileMode == kMicroTileModeThin ||
+             microTileMode == kMicroTileModeDepth) {
+    elem |= ((x >> 0) &
0x1) << 0; + elem |= ((y >> 0) & 0x1) << 1; + elem |= ((x >> 1) & 0x1) << 2; + elem |= ((y >> 1) & 0x1) << 3; + elem |= ((x >> 2) & 0x1) << 4; + elem |= ((y >> 2) & 0x1) << 5; + + switch (arrayMode) { + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledXThick: + elem |= ((z >> 2) & 0x1) << 8; + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThick: + case kArrayMode3dTiledThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThickPrt: + elem |= ((z >> 0) & 0x1) << 6; + elem |= ((z >> 1) & 0x1) << 7; + default: + break; + } + } else if (microTileMode == kMicroTileModeThick) { + switch (arrayMode) { + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledXThick: + elem |= ((z >> 2) & 0x1) << 8; + + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThick: + case kArrayMode3dTiledThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThickPrt: + if (bitsPerElement == 8 || bitsPerElement == 16) { + elem |= ((x >> 0) & 0x1) << 0; + elem |= ((y >> 0) & 0x1) << 1; + elem |= ((x >> 1) & 0x1) << 2; + elem |= ((y >> 1) & 0x1) << 3; + elem |= ((z >> 0) & 0x1) << 4; + elem |= ((z >> 1) & 0x1) << 5; + elem |= ((x >> 2) & 0x1) << 6; + elem |= ((y >> 2) & 0x1) << 7; + } else if (bitsPerElement == 32) { + elem |= ((x >> 0) & 0x1) << 0; + elem |= ((y >> 0) & 0x1) << 1; + elem |= ((x >> 1) & 0x1) << 2; + elem |= ((z >> 0) & 0x1) << 3; + elem |= ((y >> 1) & 0x1) << 4; + elem |= ((z >> 1) & 0x1) << 5; + elem |= ((x >> 2) & 0x1) << 6; + elem |= ((y >> 2) & 0x1) << 7; + } else if (bitsPerElement == 64 || bitsPerElement == 128) { + elem |= ((x >> 0) & 0x1) << 0; + elem |= ((y >> 0) & 0x1) << 1; + elem |= ((z >> 0) & 0x1) << 2; + elem |= ((x >> 1) & 0x1) << 3; + elem |= ((y >> 1) & 0x1) << 4; + elem |= ((z >> 1) & 0x1) << 5; + elem |= ((x >> 2) & 0x1) << 6; + elem |= ((y >> 2) & 0x1) << 7; + } else { + std::abort(); + } + break; + default: + std::abort(); + } + } + return elem; +} + +constexpr uint32_t getPipeIndex(uint32_t x, uint32_t y, PipeConfig pipeCfg) { + uint32_t pipe = 0; + switch (pipeCfg) { + case kPipeConfigP8_32x32_8x16: + pipe |= (((x >> 4) ^ (y >> 3) ^ (x >> 5)) & 0x1) << 0; + pipe |= (((x >> 3) ^ (y >> 4)) & 0x1) << 1; + pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2; + break; + case kPipeConfigP8_32x32_16x16: + pipe |= (((x >> 3) ^ (y >> 3) ^ (x >> 4)) & 0x1) << 0; + pipe |= (((x >> 4) ^ (y >> 4)) & 0x1) << 1; + pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2; + break; + case kPipeConfigP16: + pipe |= (((x >> 3) ^ (y >> 3) ^ (x >> 4)) & 0x1) << 0; + pipe |= (((x >> 4) ^ (y >> 4)) & 0x1) << 1; + pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2; + pipe |= (((x >> 6) ^ (y >> 5)) & 0x1) << 3; + break; + default: + std::abort(); + } + return pipe; +} + +constexpr uint32_t getBankIndex(std::uint32_t x, std::uint32_t y, + std::uint32_t bank_width, + std::uint32_t bank_height, + std::uint32_t num_banks, + std::uint32_t num_pipes) { + std::uint32_t x_shift_offset = std::countr_zero(bank_width * num_pipes); + std::uint32_t y_shift_offset = std::countr_zero(bank_height); + std::uint32_t xs = x >> x_shift_offset; + std::uint32_t ys = y >> y_shift_offset; + std::uint32_t bank = 0; + switch (num_banks) { + case 2: + bank |= (((xs >> 3) ^ (ys >> 3)) & 0x1) << 0; + break; + case 4: + bank |= (((xs >> 3) ^ (ys >> 4)) & 0x1) << 0; + bank |= (((xs >> 4) ^ (ys >> 3)) & 0x1) << 1; + break; + case 8: + bank |= (((xs >> 3) ^ (ys >> 5)) & 0x1) << 0; + bank |= (((xs >> 4) ^ (ys >> 4) ^ (ys >> 5)) & 0x1) << 1; + bank |= (((xs >> 5) ^ (ys >> 
3)) & 0x1) << 2;
+    break;
+  case 16:
+    bank |= (((xs >> 3) ^ (ys >> 6)) & 0x1) << 0;
+    bank |= (((xs >> 4) ^ (ys >> 5) ^ (ys >> 6)) & 0x1) << 1;
+    bank |= (((xs >> 5) ^ (ys >> 4)) & 0x1) << 2;
+    bank |= (((xs >> 6) ^ (ys >> 3)) & 0x1) << 3;
+    break;
+  default:
+    std::abort();
+  }
+
+  return bank;
+}
+
+constexpr std::uint32_t getPipeCount(PipeConfig pipeConfig) {
+  switch (pipeConfig) {
+  case kPipeConfigP8_32x32_8x16:
+  case kPipeConfigP8_32x32_16x16:
+    return 8;
+  case kPipeConfigP16:
+    return 16;
+  default:
+    std::abort();
+  }
+}
+
+SurfaceInfo computeSurfaceInfo(TileMode tileMode, gnm::TextureType type,
+                               gnm::DataFormat dfmt, std::uint32_t width,
+                               std::uint32_t height, std::uint32_t depth,
+                               std::uint32_t pitch, int baseArrayLayer,
+                               int arrayCount, int baseMipLevel, int mipCount,
+                               bool pow2pad);
+SurfaceInfo computeSurfaceInfo(const gnm::TBuffer &tbuffer, TileMode tileMode);
+} // namespace amdgpu
diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_cpu.hpp b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_cpu.hpp
new file mode 100644
index 00000000..2f060810
--- /dev/null
+++ b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_cpu.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "gnm/constants.hpp"
+#include "tiler.hpp"
+#include <cstdint>
+
+namespace amdgpu {
+std::uint64_t getTiledOffset(gnm::TextureType texType, bool isPow2Padded,
+                             int numFragments, gnm::DataFormat dfmt,
+                             amdgpu::TileMode tileMode,
+                             amdgpu::MacroTileMode macroTileMode, int mipLevel,
+                             int arraySlice, int width, int height, int depth,
+                             int pitch, int x, int y, int z, int fragmentIndex);
+}
diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp
new file mode 100644
index 00000000..658dc7d5
--- /dev/null
+++ b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp
@@ -0,0 +1,24 @@
+#pragma once
+#include "tiler.hpp"
+#include <cstdint>
+#include <memory>
+
+namespace amdgpu {
+struct GpuTiler {
+  struct Impl;
+  GpuTiler();
+  ~GpuTiler();
+
+  void detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
+              amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress,
+              std::uint64_t dstLinearAddress, int mipLevel, int baseArray,
+              int arrayCount);
+  void tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
+            amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress,
+            std::uint64_t dstTiledAddress, int mipLevel, int baseArray,
+            int arrayCount);
+
+private:
+  std::unique_ptr<Impl> mImpl;
+};
+} // namespace amdgpu
diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl
new file mode 100644
index 00000000..2f65b404
--- /dev/null
+++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl
@@ -0,0 +1,76 @@
+#version 460
+
+#extension GL_GOOGLE_include_directive : enable
+#extension GL_EXT_shader_explicit_arithmetic_types : enable
+#extension GL_EXT_shader_atomic_int64 : enable
+#extension GL_EXT_shader_atomic_float : enable
+#extension GL_EXT_shader_image_load_formatted : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_EXT_shared_memory_block : enable
+#extension GL_EXT_scalar_block_layout : enable
+#extension GL_EXT_null_initializer : enable
+#extension GL_EXT_buffer_reference2 : enable
+#extension GL_EXT_buffer_reference_uvec2 : enable
+
+#include "tiler.glsl"
+
+void main() {
+  uvec3 pos = gl_GlobalInvocationID;
+  uint64_t tiledSliceOffset = 0;
+  uint64_t linearSliceOffset = 0;
+  if (config.tiledSurfaceSize != 0) {
+    tiledSliceOffset = pos.z * 
config.tiledSurfaceSize; + linearSliceOffset = pos.z * config.linearSurfaceSize; + pos.z = 0; + } + + uint64_t tiledByteOffset = getTiledBitOffset1D( + config.tileMode, + pos, + config.dataSize, + config.bitsPerElement + ) / 8; + + tiledByteOffset += tiledSliceOffset; + + uint64_t linearByteOffset = computeLinearElementByteOffset( + pos, + 0, + config.dataSize.x, + config.dataSize.x * config.dataSize.y, + config.bitsPerElement, + 1 << config.numFragments + ); + + linearByteOffset += linearSliceOffset; + + switch ((config.bitsPerElement + 7) / 8) { + case 1: + buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; + break; + + case 2: + buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data; + break; + + case 4: + buffer_reference_uint32_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint32_t(config.srcAddress + tiledByteOffset).data; + break; + + case 8: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + break; + + case 16: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data; + break; + + case 32: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 16).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 24).data; + break; + } +} diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl new file mode 100644 index 00000000..2f65b404 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl @@ -0,0 +1,76 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_shader_explicit_arithmetic_types : enable +#extension GL_EXT_shader_atomic_int64 : enable +#extension GL_EXT_shader_atomic_float : enable +#extension GL_EXT_shader_image_load_formatted : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_EXT_shared_memory_block : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_EXT_null_initializer : enable +#extension GL_EXT_buffer_reference2 : enable +#extension GL_EXT_buffer_reference_uvec2 : enable + +#include "tiler.glsl" + +void main() { + uvec3 pos = gl_GlobalInvocationID; + uint64_t tiledSliceOffset = 0; + uint64_t linearSliceOffset = 0; + if (config.tiledSurfaceSize != 0) { + tiledSliceOffset = pos.z * config.tiledSurfaceSize; + linearSliceOffset = pos.z * config.linearSurfaceSize; + pos.z = 0; + } + + uint64_t tiledByteOffset = getTiledBitOffset1D( + config.tileMode, + pos, + config.dataSize, + config.bitsPerElement + ) / 8; + + tiledByteOffset += tiledSliceOffset; + + uint64_t linearByteOffset = computeLinearElementByteOffset( + pos, + 0, + config.dataSize.x, + 
config.dataSize.x * config.dataSize.y, + config.bitsPerElement, + 1 << config.numFragments + ); + + linearByteOffset += linearSliceOffset; + + switch ((config.bitsPerElement + 7) / 8) { + case 1: + buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; + break; + + case 2: + buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data; + break; + + case 4: + buffer_reference_uint32_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint32_t(config.srcAddress + tiledByteOffset).data; + break; + + case 8: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + break; + + case 16: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data; + break; + + case 32: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 16).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 24).data; + break; + } +} diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl new file mode 100644 index 00000000..c2780682 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl @@ -0,0 +1,76 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_shader_explicit_arithmetic_types : enable +#extension GL_EXT_shader_atomic_int64 : enable +#extension GL_EXT_shader_atomic_float : enable +#extension GL_EXT_shader_image_load_formatted : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_EXT_shared_memory_block : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_EXT_null_initializer : enable +#extension GL_EXT_buffer_reference2 : enable +#extension GL_EXT_buffer_reference_uvec2 : enable + +#include "tiler.glsl" + +void main() { + uvec3 pos = gl_GlobalInvocationID; + uint64_t tiledSliceOffset = 0; + uint64_t linearSliceOffset = 0; + if (config.tiledSurfaceSize != 0) { + tiledSliceOffset = pos.z * config.tiledSurfaceSize; + linearSliceOffset = pos.z * config.linearSurfaceSize; + pos.z = 0; + } + + uint64_t tiledByteOffset = computeLinearOffset( + config.bitsPerElement, + config.dataSize.y, + config.dataSize.x, + pos + ) / 8; + + tiledByteOffset += tiledSliceOffset; + + uint64_t linearByteOffset = computeLinearElementByteOffset( + pos, + 0, + config.dataSize.x, + config.dataSize.x * config.dataSize.y, + config.bitsPerElement, + 1 << config.numFragments + ); + + linearByteOffset += linearSliceOffset; + + switch ((config.bitsPerElement + 7) / 8) { + case 1: + buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; + break; + + 
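+  // Each case below moves one element of (config.bitsPerElement + 7) / 8
+  // bytes between the tiled and linear addresses; e.g. a 32-bit element
+  // takes case 4 and moves as a single buffer_reference_uint32_t. 16- and
+  // 32-byte elements have no matching GLSL scalar type, so they are copied
+  // as two or four 64-bit words.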
case 2: + buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data; + break; + + case 4: + buffer_reference_uint32_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint32_t(config.srcAddress + tiledByteOffset).data; + break; + + case 8: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + break; + + case 16: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data; + break; + + case 32: + buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 16).data; + buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 24).data; + break; + } +} diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl new file mode 100644 index 00000000..04c9dbd0 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl @@ -0,0 +1,716 @@ + +#define FOR_ALL_BASE_TYPES(OP) \ + OP(int8_t) \ + OP(uint8_t) \ + OP(int16_t) \ + OP(uint16_t) \ + OP(float16_t) \ + OP(int32_t) \ + OP(uint32_t) \ + OP(float32_t) \ + OP(int64_t) \ + OP(uint64_t) \ + OP(float64_t) \ + +#define DEFINE_BUFFER_REFERENCE(TYPE) \ + layout(buffer_reference) buffer buffer_reference_##TYPE { \ + TYPE data; \ + }; \ + +FOR_ALL_BASE_TYPES(DEFINE_BUFFER_REFERENCE) + +#define U32ARRAY_FETCH_BITS(ARRAY, START, BITCOUNT) ((ARRAY[(START) >> 5] >> ((START) & 31)) & ((1 << (BITCOUNT)) - 1)) +#define U64ARRAY_FETCH_BITS(ARRAY, START, BITCOUNT) ((ARRAY[(START) >> 6] >> ((START) & 63)) & ((uint64_t(1) << (BITCOUNT)) - 1)) + +uint64_t tbuffer_base(u64vec4 tbuffer) { + return U64ARRAY_FETCH_BITS(tbuffer, 0, 38); +} +uint32_t tbuffer_mtype_L2(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 38, 2)); +} +uint32_t tbuffer_min_lod(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 40, 12)); +} +uint32_t tbuffer_dfmt(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 52, 6)); +} +uint32_t tbuffer_nfmt(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 58, 4)); +} +uint32_t tbuffer_mtype_l1(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 62, 2) | (U64ARRAY_FETCH_BITS(tbuffer, 122, 1) << 2)); +} +uint32_t tbuffer_width(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 64, 14)); +} +uint32_t tbuffer_height(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 78, 14)); +} +uint32_t tbuffer_perfMod(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 92, 3)); +} +bool tbuffer_interlaced(u64vec4 tbuffer) { + return U64ARRAY_FETCH_BITS(tbuffer, 95, 1) != 0; +} +uint32_t tbuffer_dst_sel_x(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 96, 3)); +} +uint32_t tbuffer_dst_sel_y(u64vec4 tbuffer) { + return 
uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 99, 3)); +} +uint32_t tbuffer_dst_sel_z(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 102, 3)); +} +uint32_t tbuffer_dst_sel_w(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 105, 3)); +} +uint32_t tbuffer_base_level(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 108, 4)); +} +uint32_t tbuffer_last_level(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 112, 4)); +} +uint32_t tbuffer_tiling_idx(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 116, 5)); +} +bool tbuffer_pow2pad(u64vec4 tbuffer) { + return U64ARRAY_FETCH_BITS(tbuffer, 121, 1) != 0; +} +uint32_t tbuffer_type(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 124, 4)); +} +uint32_t tbuffer_depth(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 128, 13)); +} +uint32_t tbuffer_pitch(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 141, 14)); +} +uint32_t tbuffer_base_array(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 160, 13)); +} +uint32_t tbuffer_last_array(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 173, 13)); +} +uint32_t tbuffer_min_lod_warn(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 192, 12)); +} +uint32_t tbuffer_counter_bank_id(u64vec4 tbuffer) { + return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 204, 8)); +} +bool tbuffer_LOD_hdw_cnt_en(u64vec4 tbuffer) { + return U64ARRAY_FETCH_BITS(tbuffer, 212, 1) != 0; +} + +const int kTextureType1D = 8; +const int kTextureType2D = 9; +const int kTextureType3D = 10; +const int kTextureTypeCube = 11; +const int kTextureTypeArray1D = 12; +const int kTextureTypeArray2D = 13; +const int kTextureTypeMsaa2D = 14; +const int kTextureTypeMsaaArray2D = 15; + +const uint32_t kMicroTileWidth = 8; +const uint32_t kMicroTileHeight = 8; +const uint32_t kDramRowSize = 0x400; +const uint32_t kPipeInterleaveBytes = 256; + + +const uint32_t kDataFormatInvalid = 0x00000000; +const uint32_t kDataFormat8 = 0x00000001; +const uint32_t kDataFormat16 = 0x00000002; +const uint32_t kDataFormat8_8 = 0x00000003; +const uint32_t kDataFormat32 = 0x00000004; +const uint32_t kDataFormat16_16 = 0x00000005; +const uint32_t kDataFormat10_11_11 = 0x00000006; +const uint32_t kDataFormat11_11_10 = 0x00000007; +const uint32_t kDataFormat10_10_10_2 = 0x00000008; +const uint32_t kDataFormat2_10_10_10 = 0x00000009; +const uint32_t kDataFormat8_8_8_8 = 0x0000000a; +const uint32_t kDataFormat32_32 = 0x0000000b; +const uint32_t kDataFormat16_16_16_16 = 0x0000000c; +const uint32_t kDataFormat32_32_32 = 0x0000000d; +const uint32_t kDataFormat32_32_32_32 = 0x0000000e; +const uint32_t kDataFormat5_6_5 = 0x00000010; +const uint32_t kDataFormat1_5_5_5 = 0x00000011; +const uint32_t kDataFormat5_5_5_1 = 0x00000012; +const uint32_t kDataFormat4_4_4_4 = 0x00000013; +const uint32_t kDataFormat8_24 = 0x00000014; +const uint32_t kDataFormat24_8 = 0x00000015; +const uint32_t kDataFormatX24_8_32 = 0x00000016; +const uint32_t kDataFormatGB_GR = 0x00000020; +const uint32_t kDataFormatBG_RG = 0x00000021; +const uint32_t kDataFormat5_9_9_9 = 0x00000022; +const uint32_t kDataFormatBc1 = 0x00000023; +const uint32_t kDataFormatBc2 = 0x00000024; +const uint32_t kDataFormatBc3 = 0x00000025; +const uint32_t kDataFormatBc4 = 0x00000026; +const uint32_t kDataFormatBc5 = 0x00000027; +const uint32_t kDataFormatBc6 = 0x00000028; +const uint32_t kDataFormatBc7 = 0x00000029; +const uint32_t kDataFormatFmask8_S2_F1 = 
0x0000002C; +const uint32_t kDataFormatFmask8_S4_F1 = 0x0000002D; +const uint32_t kDataFormatFmask8_S8_F1 = 0x0000002E; +const uint32_t kDataFormatFmask8_S2_F2 = 0x0000002F; +const uint32_t kDataFormatFmask8_S4_F2 = 0x00000030; +const uint32_t kDataFormatFmask8_S4_F4 = 0x00000031; +const uint32_t kDataFormatFmask16_S16_F1 = 0x00000032; +const uint32_t kDataFormatFmask16_S8_F2 = 0x00000033; +const uint32_t kDataFormatFmask32_S16_F2 = 0x00000034; +const uint32_t kDataFormatFmask32_S8_F4 = 0x00000035; +const uint32_t kDataFormatFmask32_S8_F8 = 0x00000036; +const uint32_t kDataFormatFmask64_S16_F4 = 0x00000037; +const uint32_t kDataFormatFmask64_S16_F8 = 0x00000038; +const uint32_t kDataFormat4_4 = 0x00000039; +const uint32_t kDataFormat6_5_5 = 0x0000003A; +const uint32_t kDataFormat1 = 0x0000003B; +const uint32_t kDataFormat1Reversed = 0x0000003C; + +const uint32_t kNumericFormatUNorm = 0x00000000; +const uint32_t kNumericFormatSNorm = 0x00000001; +const uint32_t kNumericFormatUScaled = 0x00000002; +const uint32_t kNumericFormatSScaled = 0x00000003; +const uint32_t kNumericFormatUInt = 0x00000004; +const uint32_t kNumericFormatSInt = 0x00000005; +const uint32_t kNumericFormatSNormNoZero = 0x00000006; +const uint32_t kNumericFormatFloat = 0x00000007; +const uint32_t kNumericFormatSrgb = 0x00000009; +const uint32_t kNumericFormatUBNorm = 0x0000000A; +const uint32_t kNumericFormatUBNormNoZero = 0x0000000B; +const uint32_t kNumericFormatUBInt = 0x0000000C; +const uint32_t kNumericFormatUBScaled = 0x0000000D; + +const uint32_t kArrayModeLinearGeneral = 0x00000000; +const uint32_t kArrayModeLinearAligned = 0x00000001; +const uint32_t kArrayMode1dTiledThin = 0x00000002; +const uint32_t kArrayMode1dTiledThick = 0x00000003; +const uint32_t kArrayMode2dTiledThin = 0x00000004; +const uint32_t kArrayModeTiledThinPrt = 0x00000005; +const uint32_t kArrayMode2dTiledThinPrt = 0x00000006; +const uint32_t kArrayMode2dTiledThick = 0x00000007; +const uint32_t kArrayMode2dTiledXThick = 0x00000008; +const uint32_t kArrayModeTiledThickPrt = 0x00000009; +const uint32_t kArrayMode2dTiledThickPrt = 0x0000000a; +const uint32_t kArrayMode3dTiledThinPrt = 0x0000000b; +const uint32_t kArrayMode3dTiledThin = 0x0000000c; +const uint32_t kArrayMode3dTiledThick = 0x0000000d; +const uint32_t kArrayMode3dTiledXThick = 0x0000000e; +const uint32_t kArrayMode3dTiledThickPrt = 0x0000000f; + +const uint32_t kMicroTileModeDisplay = 0x00000000; +const uint32_t kMicroTileModeThin = 0x00000001; +const uint32_t kMicroTileModeDepth = 0x00000002; +const uint32_t kMicroTileModeRotated = 0x00000003; +const uint32_t kMicroTileModeThick = 0x00000004; + +const uint32_t kPipeConfigP8_32x32_8x16 = 0x0000000a; +const uint32_t kPipeConfigP8_32x32_16x16 = 0x0000000c; +const uint32_t kPipeConfigP16 = 0x00000012; + + + +uint32_t getMicroTileThickness(uint32_t arrayMode) { + switch (arrayMode) { + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThick: + case kArrayMode3dTiledThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThickPrt: + return 4; + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledXThick: + return 8; + case kArrayModeLinearGeneral: + case kArrayModeLinearAligned: + case kArrayMode1dTiledThin: + case kArrayMode2dTiledThin: + case kArrayModeTiledThinPrt: + case kArrayMode2dTiledThinPrt: + case kArrayMode3dTiledThinPrt: + case kArrayMode3dTiledThin: + return 1; + } + + return 1; +} + +bool isMacroTiled(uint32_t arrayMode) { + switch (arrayMode) { + case kArrayModeLinearGeneral: + case 
kArrayModeLinearAligned: + case kArrayMode1dTiledThin: + case kArrayMode1dTiledThick: + return false; + case kArrayMode2dTiledThin: + case kArrayModeTiledThinPrt: + case kArrayMode2dTiledThinPrt: + case kArrayMode2dTiledThick: + case kArrayMode2dTiledXThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThinPrt: + case kArrayMode3dTiledThin: + case kArrayMode3dTiledThick: + case kArrayMode3dTiledXThick: + case kArrayMode3dTiledThickPrt: + return true; + } + + return false; +} + +bool isPrt(uint32_t arrayMode) { + switch (arrayMode) { + case kArrayModeLinearGeneral: + case kArrayModeLinearAligned: + case kArrayMode1dTiledThin: + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThin: + case kArrayMode2dTiledThick: + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledThin: + case kArrayMode3dTiledThick: + case kArrayMode3dTiledXThick: + return false; + + case kArrayModeTiledThinPrt: + case kArrayMode2dTiledThinPrt: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThinPrt: + case kArrayMode3dTiledThickPrt: + return true; + } + + return false; +} + +int getTexelsPerElement(uint32_t dfmt) { + switch (dfmt) { + case kDataFormatBc1: + case kDataFormatBc2: + case kDataFormatBc3: + case kDataFormatBc4: + case kDataFormatBc5: + case kDataFormatBc6: + case kDataFormatBc7: + return 16; + case kDataFormat1: + case kDataFormat1Reversed: + return 8; + case kDataFormatGB_GR: + case kDataFormatBG_RG: + return 2; + default: + return 1; + } +} + +int getBitsPerElement(uint32_t dfmt) { + switch (dfmt) { + case kDataFormatInvalid: + return 0; + case kDataFormat8: + return 8; + case kDataFormat16: + return 16; + case kDataFormat8_8: + return 16; + case kDataFormat32: + return 32; + case kDataFormat16_16: + return 32; + case kDataFormat10_11_11: + return 32; + case kDataFormat11_11_10: + return 32; + case kDataFormat10_10_10_2: + return 32; + case kDataFormat2_10_10_10: + return 32; + case kDataFormat8_8_8_8: + return 32; + case kDataFormat32_32: + return 64; + case kDataFormat16_16_16_16: + return 64; + case kDataFormat32_32_32: + return 96; + case kDataFormat32_32_32_32: + return 128; + case kDataFormat5_6_5: + return 16; + case kDataFormat1_5_5_5: + return 16; + case kDataFormat5_5_5_1: + return 16; + case kDataFormat4_4_4_4: + return 16; + case kDataFormat8_24: + return 32; + case kDataFormat24_8: + return 32; + case kDataFormatX24_8_32: + return 64; + case kDataFormatGB_GR: + return 16; + case kDataFormatBG_RG: + return 16; + case kDataFormat5_9_9_9: + return 32; + case kDataFormatBc1: + return 4; + case kDataFormatBc2: + return 8; + case kDataFormatBc3: + return 8; + case kDataFormatBc4: + return 4; + case kDataFormatBc5: + return 8; + case kDataFormatBc6: + return 8; + case kDataFormatBc7: + return 8; + case kDataFormatFmask8_S2_F1: + return 8; + case kDataFormatFmask8_S4_F1: + return 8; + case kDataFormatFmask8_S8_F1: + return 8; + case kDataFormatFmask8_S2_F2: + return 8; + case kDataFormatFmask8_S4_F2: + return 8; + case kDataFormatFmask8_S4_F4: + return 8; + case kDataFormatFmask16_S16_F1: + return 16; + case kDataFormatFmask16_S8_F2: + return 16; + case kDataFormatFmask32_S16_F2: + return 32; + case kDataFormatFmask32_S8_F4: + return 32; + case kDataFormatFmask32_S8_F8: + return 32; + case kDataFormatFmask64_S16_F4: + return 64; + case kDataFormatFmask64_S16_F8: + return 64; + case kDataFormat4_4: + return 8; + case kDataFormat6_5_5: + return 16; + case kDataFormat1: + return 1; + case kDataFormat1Reversed: + 
return 1; + } + + return -1; +} + +int getTotalBitsPerElement(uint32_t dfmt) { + return getBitsPerElement(dfmt) * getTexelsPerElement(dfmt); +} + +int getNumComponentsPerElement(uint32_t dfmt) { + switch (dfmt) { + case kDataFormatInvalid: + return 0; + case kDataFormat8: + return 1; + case kDataFormat16: + return 1; + case kDataFormat8_8: + return 2; + case kDataFormat32: + return 1; + case kDataFormat16_16: + return 2; + case kDataFormat10_11_11: + return 3; + case kDataFormat11_11_10: + return 3; + case kDataFormat10_10_10_2: + return 4; + case kDataFormat2_10_10_10: + return 4; + case kDataFormat8_8_8_8: + return 4; + case kDataFormat32_32: + return 2; + case kDataFormat16_16_16_16: + return 4; + case kDataFormat32_32_32: + return 3; + case kDataFormat32_32_32_32: + return 4; + case kDataFormat5_6_5: + return 3; + case kDataFormat1_5_5_5: + return 4; + case kDataFormat5_5_5_1: + return 4; + case kDataFormat4_4_4_4: + return 4; + case kDataFormat8_24: + return 2; + case kDataFormat24_8: + return 2; + case kDataFormatX24_8_32: + return 2; + case kDataFormatGB_GR: + return 3; + case kDataFormatBG_RG: + return 3; + case kDataFormat5_9_9_9: + return 3; + case kDataFormatBc1: + return 4; + case kDataFormatBc2: + return 4; + case kDataFormatBc3: + return 4; + case kDataFormatBc4: + return 1; + case kDataFormatBc5: + return 2; + case kDataFormatBc6: + return 3; + case kDataFormatBc7: + return 4; + case kDataFormatFmask8_S2_F1: + return 2; + case kDataFormatFmask8_S4_F1: + return 2; + case kDataFormatFmask8_S8_F1: + return 2; + case kDataFormatFmask8_S2_F2: + return 2; + case kDataFormatFmask8_S4_F2: + return 2; + case kDataFormatFmask8_S4_F4: + return 2; + case kDataFormatFmask16_S16_F1: + return 2; + case kDataFormatFmask16_S8_F2: + return 2; + case kDataFormatFmask32_S16_F2: + return 2; + case kDataFormatFmask32_S8_F4: + return 2; + case kDataFormatFmask32_S8_F8: + return 2; + case kDataFormatFmask64_S16_F4: + return 2; + case kDataFormatFmask64_S16_F8: + return 2; + case kDataFormat4_4: + return 2; + case kDataFormat6_5_5: + return 3; + case kDataFormat1: + return 1; + case kDataFormat1Reversed: + return 1; + } + + return -1; +} + +uint32_t tileMode_getArrayMode(uint32_t tileMode) { + return (tileMode & 0x0000003c) >> 2; +} +uint32_t tileMode_getPipeConfig(uint32_t tileMode) { + return (tileMode & 0x000007c0) >> 6; +} +uint32_t tileMode_getTileSplit(uint32_t tileMode) { + return (tileMode & 0x00003800) >> 11; +} +uint32_t tileMode_getMicroTileMode(uint32_t tileMode) { + return (tileMode & 0x01c00000) >> 22; +} +uint32_t tileMode_getSampleSplit(uint32_t tileMode) { + return (tileMode & 0x06000000) >> 25; +} + +uint32_t bit_ceil(uint32_t x) { + x = x - 1; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return x + 1; +} + +uint32_t getElementIndex(uvec3 pos, uint32_t bitsPerElement, uint32_t microTileMode, uint32_t arrayMode) { + uint32_t elem = 0; + + if (microTileMode == kMicroTileModeDisplay) { + switch (bitsPerElement) { + case 8: + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.x >> 1) & 0x1) << 1; + elem |= ((pos.x >> 2) & 0x1) << 2; + elem |= ((pos.y >> 1) & 0x1) << 3; + elem |= ((pos.y >> 0) & 0x1) << 4; + elem |= ((pos.y >> 2) & 0x1) << 5; + break; + case 16: + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.x >> 1) & 0x1) << 1; + elem |= ((pos.x >> 2) & 0x1) << 2; + elem |= ((pos.y >> 0) & 0x1) << 3; + elem |= ((pos.y >> 1) & 0x1) << 4; + elem |= ((pos.y >> 2) & 0x1) << 5; + break; + case 32: + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.x 
>> 1) & 0x1) << 1; + elem |= ((pos.y >> 0) & 0x1) << 2; + elem |= ((pos.x >> 2) & 0x1) << 3; + elem |= ((pos.y >> 1) & 0x1) << 4; + elem |= ((pos.y >> 2) & 0x1) << 5; + break; + case 64: + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.y >> 0) & 0x1) << 1; + elem |= ((pos.x >> 1) & 0x1) << 2; + elem |= ((pos.x >> 2) & 0x1) << 3; + elem |= ((pos.y >> 1) & 0x1) << 4; + elem |= ((pos.y >> 2) & 0x1) << 5; + break; + } + } else if (microTileMode == kMicroTileModeThin || + microTileMode == kMicroTileModeDepth) { + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.y >> 0) & 0x1) << 1; + elem |= ((pos.x >> 1) & 0x1) << 2; + elem |= ((pos.y >> 1) & 0x1) << 3; + elem |= ((pos.x >> 2) & 0x1) << 4; + elem |= ((pos.y >> 2) & 0x1) << 5; + + switch (arrayMode) { + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledXThick: + elem |= ((pos.z >> 2) & 0x1) << 8; + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThick: + case kArrayMode3dTiledThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThickPrt: + elem |= ((pos.z >> 0) & 0x1) << 6; + elem |= ((pos.z >> 1) & 0x1) << 7; + default: + break; + } + } else if (microTileMode == kMicroTileModeThick) { + switch (arrayMode) { + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledXThick: + elem |= ((pos.z >> 2) & 0x1) << 8; + + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThick: + case kArrayMode3dTiledThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThickPrt: + if (bitsPerElement == 8 || bitsPerElement == 16) { + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.y >> 0) & 0x1) << 1; + elem |= ((pos.x >> 1) & 0x1) << 2; + elem |= ((pos.y >> 1) & 0x1) << 3; + elem |= ((pos.z >> 0) & 0x1) << 4; + elem |= ((pos.z >> 1) & 0x1) << 5; + elem |= ((pos.x >> 2) & 0x1) << 6; + elem |= ((pos.y >> 2) & 0x1) << 7; + } else if (bitsPerElement == 32) { + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.y >> 0) & 0x1) << 1; + elem |= ((pos.x >> 1) & 0x1) << 2; + elem |= ((pos.z >> 0) & 0x1) << 3; + elem |= ((pos.y >> 1) & 0x1) << 4; + elem |= ((pos.z >> 1) & 0x1) << 5; + elem |= ((pos.x >> 2) & 0x1) << 6; + elem |= ((pos.y >> 2) & 0x1) << 7; + } else if (bitsPerElement == 64 || bitsPerElement == 128) { + elem |= ((pos.x >> 0) & 0x1) << 0; + elem |= ((pos.y >> 0) & 0x1) << 1; + elem |= ((pos.z >> 0) & 0x1) << 2; + elem |= ((pos.x >> 1) & 0x1) << 3; + elem |= ((pos.y >> 1) & 0x1) << 4; + elem |= ((pos.z >> 1) & 0x1) << 5; + elem |= ((pos.x >> 2) & 0x1) << 6; + elem |= ((pos.y >> 2) & 0x1) << 7; + } + break; + } + } + return elem; +} + +uint64_t computeLinearElementByteOffset( + uvec3 pos, uint32_t fragmentIndex, uint32_t pitch, + uint32_t slicePitchElems, uint32_t bitsPerElement, + uint32_t numFragmentsPerPixel) { + uint64_t absoluteElementIndex = pos.z * slicePitchElems + pos.y * pitch + pos.x; + return ((absoluteElementIndex * bitsPerElement * numFragmentsPerPixel) + + (bitsPerElement * fragmentIndex)) / 8; +} + +uint64_t computeLinearOffset(uint32_t bitsPerElement, uint height, uint pitch, uvec3 pos) { + uint paddedHeight = height; + uint paddedWidth = pitch; + + if (bitsPerElement == 1) { + bitsPerElement *= 8; + paddedWidth = max((paddedWidth + 7) / 8, 1); + } + + uint64_t tiledRowSizeBits = uint64_t(bitsPerElement) * paddedWidth; + uint64_t tiledSliceBits = uint64_t(paddedWidth) * paddedHeight * bitsPerElement; + return tiledSliceBits * pos.z + tiledRowSizeBits * pos.y + bitsPerElement * pos.x; +} + +uint64_t getTiledBitOffset1D(uint32_t tileMode, uvec3 
pos, uvec2 dataSize, uint32_t bitsPerElement) { + uint32_t arrayMode = tileMode_getArrayMode(tileMode); + + uint32_t paddedWidth = dataSize.x; + uint32_t paddedHeight = dataSize.y; + + int tileThickness = (arrayMode == kArrayMode1dTiledThick) ? 4 : 1; + + uint64_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement + 7) / 8; + uint32_t tilesPerRow = paddedWidth / kMicroTileWidth; + uint32_t tilesPerSlice = max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1); + + uint64_t elementIndex = getElementIndex(pos, bitsPerElement, + tileMode_getMicroTileMode(tileMode), arrayMode); + + uint64_t sliceOffset = (pos.z / tileThickness) * tilesPerSlice * tileBytes; + + uint64_t tileRowIndex = pos.y / kMicroTileHeight; + uint64_t tileColumnIndex = pos.x / kMicroTileWidth; + uint64_t tileOffset = + (tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes; + + uint64_t elementOffset = elementIndex * bitsPerElement; + return (sliceOffset + tileOffset) * 8 + elementOffset; +} + +layout(binding=0) uniform Config { + uint64_t srcAddress; + uint64_t dstAddress; + uvec2 dataSize; + uint32_t tileMode; + uint32_t numFragments; + uint32_t bitsPerElement; + uint32_t tiledSurfaceSize; + uint32_t linearSurfaceSize; +} config; diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl new file mode 100644 index 00000000..db92aae0 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl @@ -0,0 +1,76 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_shader_explicit_arithmetic_types : enable +#extension GL_EXT_shader_atomic_int64 : enable +#extension GL_EXT_shader_atomic_float : enable +#extension GL_EXT_shader_image_load_formatted : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_EXT_shared_memory_block : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_EXT_null_initializer : enable +#extension GL_EXT_buffer_reference2 : enable +#extension GL_EXT_buffer_reference_uvec2 : enable + +#include "tiler.glsl" + +void main() { + uvec3 pos = gl_GlobalInvocationID; + uint64_t tiledSliceOffset = 0; + uint64_t linearSliceOffset = 0; + if (config.tiledSurfaceSize != 0) { + tiledSliceOffset = pos.z * config.tiledSurfaceSize; + linearSliceOffset = pos.z * config.linearSurfaceSize; + pos.z = 0; + } + + uint64_t tiledByteOffset = getTiledBitOffset1D( + config.tileMode, + pos, + config.dataSize, + config.bitsPerElement + ) / 8; + + tiledByteOffset += tiledSliceOffset; + + uint64_t linearByteOffset = computeLinearElementByteOffset( + pos, + 0, + config.dataSize.x, + config.dataSize.x * config.dataSize.y, + config.bitsPerElement, + 1 << config.numFragments + ); + + linearByteOffset += linearSliceOffset; + + switch ((config.bitsPerElement + 7) / 8) { + case 1: + buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; + break; + + case 2: + buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data; + break; + + case 4: + buffer_reference_uint32_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint32_t(config.srcAddress + linearByteOffset).data; + break; + + case 8: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + break; + + case 16: + buffer_reference_uint64_t(config.dstAddress + 
tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data; + break; + + case 32: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 16).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 24).data; + break; + } +} diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl new file mode 100644 index 00000000..db92aae0 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl @@ -0,0 +1,76 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_shader_explicit_arithmetic_types : enable +#extension GL_EXT_shader_atomic_int64 : enable +#extension GL_EXT_shader_atomic_float : enable +#extension GL_EXT_shader_image_load_formatted : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_EXT_shared_memory_block : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_EXT_null_initializer : enable +#extension GL_EXT_buffer_reference2 : enable +#extension GL_EXT_buffer_reference_uvec2 : enable + +#include "tiler.glsl" + +void main() { + uvec3 pos = gl_GlobalInvocationID; + uint64_t tiledSliceOffset = 0; + uint64_t linearSliceOffset = 0; + if (config.tiledSurfaceSize != 0) { + tiledSliceOffset = pos.z * config.tiledSurfaceSize; + linearSliceOffset = pos.z * config.linearSurfaceSize; + pos.z = 0; + } + + uint64_t tiledByteOffset = getTiledBitOffset1D( + config.tileMode, + pos, + config.dataSize, + config.bitsPerElement + ) / 8; + + tiledByteOffset += tiledSliceOffset; + + uint64_t linearByteOffset = computeLinearElementByteOffset( + pos, + 0, + config.dataSize.x, + config.dataSize.x * config.dataSize.y, + config.bitsPerElement, + 1 << config.numFragments + ); + + linearByteOffset += linearSliceOffset; + + switch ((config.bitsPerElement + 7) / 8) { + case 1: + buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; + break; + + case 2: + buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data; + break; + + case 4: + buffer_reference_uint32_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint32_t(config.srcAddress + linearByteOffset).data; + break; + + case 8: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + break; + + case 16: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data; + break; + + case 32: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = 
buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 16).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 24).data; + break; + } +} diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl new file mode 100644 index 00000000..35013d57 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl @@ -0,0 +1,76 @@ +#version 460 + +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_shader_explicit_arithmetic_types : enable +#extension GL_EXT_shader_atomic_int64 : enable +#extension GL_EXT_shader_atomic_float : enable +#extension GL_EXT_shader_image_load_formatted : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_EXT_shared_memory_block : enable +#extension GL_EXT_scalar_block_layout : enable +#extension GL_EXT_null_initializer : enable +#extension GL_EXT_buffer_reference2 : enable +#extension GL_EXT_buffer_reference_uvec2 : enable + +#include "tiler.glsl" + +void main() { + uvec3 pos = gl_GlobalInvocationID; + uint64_t tiledSliceOffset = 0; + uint64_t linearSliceOffset = 0; + if (config.tiledSurfaceSize != 0) { + tiledSliceOffset = pos.z * config.tiledSurfaceSize; + linearSliceOffset = pos.z * config.linearSurfaceSize; + pos.z = 0; + } + + uint64_t tiledByteOffset = computeLinearOffset( + config.bitsPerElement, + config.dataSize.y, + config.dataSize.x, + pos + ) / 8; + + tiledByteOffset += tiledSliceOffset; + + uint64_t linearByteOffset = computeLinearElementByteOffset( + pos, + 0, + config.dataSize.x, + config.dataSize.x * config.dataSize.y, + config.bitsPerElement, + 1 << config.numFragments + ); + + linearByteOffset += linearSliceOffset; + + switch ((config.bitsPerElement + 7) / 8) { + case 1: + buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; + break; + + case 2: + buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data; + break; + + case 4: + buffer_reference_uint32_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint32_t(config.srcAddress + linearByteOffset).data; + break; + + case 8: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + break; + + case 16: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data; + break; + + case 32: + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data; + buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + 
linearByteOffset + 16).data;
+    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 24).data;
+    break;
+  }
+}
diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp
new file mode 100644
index 00000000..7404340a
--- /dev/null
+++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp
@@ -0,0 +1,387 @@
+#include "gnm/constants.hpp"
+#include <amdgpu/tiler.hpp>
+#include <algorithm>
+#include <bit>
+
+using namespace amdgpu;
+
+static constexpr SurfaceInfo
+computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
+                     gnm::DataFormat dfmt, std::uint32_t width,
+                     std::uint32_t height, std::uint32_t depth,
+                     std::uint32_t pitch, int baseArrayLayer, int arrayCount,
+                     int baseMipLevel, int mipCount, bool pow2pad) {
+  bool isCubemap = type == gnm::TextureType::Cube;
+  bool isVolume = type == gnm::TextureType::Dim3D;
+
+  auto bitsPerFragment = getBitsPerElement(dfmt);
+  std::uint32_t arraySliceCount = depth;
+
+  if (isCubemap) {
+    arraySliceCount *= 6;
+  } else if (isVolume) {
+    arraySliceCount = 1;
+  }
+
+  int numFragments = (type == gnm::TextureType::Msaa2D ||
+                      type == gnm::TextureType::MsaaArray2D)
+                         ? (baseArrayLayer + arrayCount - 1)
+                         : 0;
+
+  auto numFragmentsPerPixel = 1 << numFragments;
+  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
+
+  auto bitsPerElement = bitsPerFragment;
+  depth = isVolume ? depth : 1;
+
+  if (isBlockCompressed) {
+    switch (bitsPerFragment) {
+    case 1:
+      bitsPerElement *= 8;
+      break;
+    case 4:
+    case 8:
+      bitsPerElement *= 16;
+      break;
+    case 16:
+      std::abort();
+      break;
+
+    default:
+      std::abort();
+      break;
+    }
+  }
+
+  if (pow2pad) {
+    arraySliceCount = std::bit_ceil(arraySliceCount);
+  }
+
+  std::uint64_t surfaceOffset = 0;
+  std::uint64_t surfaceSize = 0;
+
+  SurfaceInfo result;
+  result.width = width;
+  result.height = height;
+  result.depth = depth;
+  result.pitch = pitch;
+  result.numFragments = numFragments;
+  result.bitsPerElement = bitsPerElement;
+  result.arrayLayerCount = arraySliceCount;
+
+  auto thickness = getMicroTileThickness(arrayMode);
+
+  for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
+    std::uint32_t elemWidth = std::max<std::uint32_t>(width >> mipLevel, 1);
+    std::uint32_t elemPitch = std::max<std::uint32_t>(pitch >> mipLevel, 1);
+    std::uint32_t elemHeight = std::max<std::uint32_t>(height >> mipLevel, 1);
+    std::uint32_t elemDepth = std::max<std::uint32_t>(depth >> mipLevel, 1);
+
+    std::uint32_t linearPitch = elemPitch;
+    std::uint32_t linearWidth = elemWidth;
+    std::uint32_t linearHeight = elemHeight;
+    std::uint32_t linearDepth = elemDepth;
+
+    if (isBlockCompressed) {
+      switch (bitsPerFragment) {
+      case 1:
+        linearWidth = std::max<std::uint32_t>((linearWidth + 7) / 8, 1);
+        linearPitch = std::max<std::uint32_t>((linearPitch + 7) / 8, 1);
+        break;
+      case 4:
+      case 8:
+        linearWidth = std::max<std::uint32_t>((linearWidth + 3) / 4, 1);
+        linearPitch = std::max<std::uint32_t>((linearPitch + 3) / 4, 1);
+        linearHeight = std::max<std::uint32_t>((linearHeight + 3) / 4, 1);
+        break;
+      case 16:
+        std::abort();
+        break;
+
+      default:
+        std::abort();
+        break;
+      }
+    }
+
+    if (pow2pad) {
+      linearPitch = std::bit_ceil(linearPitch);
+      linearWidth = std::bit_ceil(linearWidth);
+      linearHeight = std::bit_ceil(linearHeight);
+      linearDepth = std::bit_ceil(linearDepth);
+    }
+
+    if (mipLevel > 0 && pitch > 0) {
+      linearPitch = linearWidth;
+    }
+
+    std::uint32_t paddedPitch =
+        (linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
+    std::uint32_t paddedHeight =
+        (linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
+    std::uint32_t paddedDepth = linearDepth;
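+    // The mip level is rounded up to the 8x8 micro tile grid here: e.g. a
+    // 100x60 mip is padded to 104x64 elements before slice sizes are
+    // computed. For non-cubemap surfaces, depth is likewise padded to the
+    // micro tile thickness (1, 4 or 8) just below.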
+ + if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) { + if (isCubemap) { + linearDepth = std::bit_ceil(linearDepth); + } + + paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1); + } + + std::uint32_t tempPitch = paddedPitch; + std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) * + paddedHeight * bitsPerElement * + numFragmentsPerPixel; + logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; + + uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; + while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) { + tempPitch += kMicroTileWidth; + logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight * + bitsPerElement * numFragmentsPerPixel; + logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; + physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; + } + + surfaceSize = logicalSliceSizeBytes * paddedDepth; + auto linearSize = + linearDepth * + (linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel + + 7) / + 8; + + result.setSubresourceInfo(mipLevel, { + .dataWidth = linearPitch, + .dataHeight = linearHeight, + .dataDepth = linearDepth, + .offset = surfaceOffset, + .tiledSize = surfaceSize, + .linearSize = linearSize, + }); + + surfaceOffset += arraySliceCount * surfaceSize; + } + + result.totalSize = surfaceOffset; + return result; +} + +static constexpr SurfaceInfo computeTextureLinearInfo( + ArrayMode arrayMode, gnm::TextureType type, gnm::DataFormat dfmt, + std::uint32_t width, std::uint32_t height, std::uint32_t depth, + std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel, + int mipCount, bool pow2pad) { + bool isCubemap = type == gnm::TextureType::Cube; + bool isVolume = type == gnm::TextureType::Dim3D; + + auto bitsPerFragment = getBitsPerElement(dfmt); + std::uint32_t arraySliceCount = depth; + + if (isCubemap) { + arraySliceCount *= 6; + } else if (isVolume) { + arraySliceCount = 1; + } + + int numFragments = (type == gnm::TextureType::Msaa2D || + type == gnm::TextureType::MsaaArray2D) + ? (baseArrayLayer + arrayCount - 1) + : 0; + + auto numFragmentsPerPixel = 1 << numFragments; + auto isBlockCompressed = getTexelsPerElement(dfmt) > 1; + + auto bitsPerElement = bitsPerFragment; + depth = isVolume ? 
depth : 1;
+
+  if (isBlockCompressed) {
+    switch (bitsPerFragment) {
+    case 1:
+      bitsPerElement *= 8;
+      break;
+    case 4:
+    case 8:
+      bitsPerElement *= 16;
+      break;
+    case 16:
+      std::abort();
+      break;
+
+    default:
+      std::abort();
+      break;
+    }
+  }
+
+  if (pow2pad) {
+    arraySliceCount = std::bit_ceil(arraySliceCount);
+  }
+
+  std::uint64_t surfaceOffset = 0;
+  std::uint64_t surfaceSize = 0;
+
+  SurfaceInfo result;
+  result.width = width;
+  result.height = height;
+  result.depth = depth;
+  result.pitch = pitch;
+  result.numFragments = numFragments;
+  result.bitsPerElement = bitsPerElement;
+  result.arrayLayerCount = arraySliceCount;
+
+  for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
+    std::uint32_t elemWidth = std::max<std::uint32_t>(width >> mipLevel, 1);
+    std::uint32_t elemPitch = std::max<std::uint32_t>(pitch >> mipLevel, 1);
+    std::uint32_t elemHeight = std::max<std::uint32_t>(height >> mipLevel, 1);
+    std::uint32_t elemDepth = std::max<std::uint32_t>(depth >> mipLevel, 1);
+
+    std::uint32_t linearPitch = elemPitch;
+    std::uint32_t linearWidth = elemWidth;
+    std::uint32_t linearHeight = elemHeight;
+    std::uint32_t linearDepth = elemDepth;
+
+    if (isBlockCompressed) {
+      switch (bitsPerFragment) {
+      case 1:
+        linearWidth = std::max<std::uint32_t>((linearWidth + 7) / 8, 1);
+        linearPitch = std::max<std::uint32_t>((linearPitch + 7) / 8, 1);
+        break;
+      case 4:
+      case 8:
+        linearWidth = std::max<std::uint32_t>((linearWidth + 3) / 4, 1);
+        linearPitch = std::max<std::uint32_t>((linearPitch + 3) / 4, 1);
+        linearHeight = std::max<std::uint32_t>((linearHeight + 3) / 4, 1);
+        break;
+      case 16:
+        std::abort();
+        break;
+
+      default:
+        std::abort();
+        break;
+      }
+    }
+
+    if (pow2pad) {
+      linearPitch = std::bit_ceil(linearPitch);
+      linearWidth = std::bit_ceil(linearWidth);
+      linearHeight = std::bit_ceil(linearHeight);
+      linearDepth = std::bit_ceil(linearDepth);
+    }
+
+    if (mipLevel > 0 && pitch > 0) {
+      linearPitch = linearWidth;
+    }
+
+    if (arrayMode == kArrayModeLinearGeneral) {
+      surfaceSize = (static_cast<std::uint64_t>(linearPitch) * linearHeight *
+                         bitsPerElement * numFragmentsPerPixel +
+                     7) /
+                    8;
+      surfaceSize *= linearDepth;
+
+      result.setSubresourceInfo(mipLevel, {
+                                              .dataWidth = linearPitch,
+                                              .dataHeight = linearHeight,
+                                              .dataDepth = linearDepth,
+                                              .offset = surfaceOffset,
+                                              .tiledSize = surfaceSize,
+                                              .linearSize = surfaceSize,
+                                          });
+    } else {
+      if (mipLevel > 0 && pitch > 0) {
+        linearPitch = linearWidth;
+      }
+
+      auto pitchAlign = std::max(8UL, 64UL / ((bitsPerElement + 7) / 8UL));
+      std::uint32_t paddedPitch =
+          (linearPitch + pitchAlign - 1) & ~(pitchAlign - 1);
+      std::uint32_t paddedHeight = linearHeight;
+      std::uint32_t paddedDepth = linearDepth;
+
+      if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
+        if (isCubemap) {
+          linearDepth = std::bit_ceil(linearDepth);
+        }
+
+        auto thickness = getMicroTileThickness(arrayMode);
+        paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
+      }
+
+      std::uint32_t pixelsPerPipeInterleave =
+          kPipeInterleaveBytes / ((bitsPerElement + 7) / 8);
+      std::uint32_t sliceAlignInPixel =
+          pixelsPerPipeInterleave < 64 ? 
64 : pixelsPerPipeInterleave;
+      auto pixelsPerSlice = static_cast<std::uint64_t>(paddedPitch) *
+                            paddedHeight * numFragmentsPerPixel;
+      while (pixelsPerSlice % sliceAlignInPixel) {
+        paddedPitch += pitchAlign;
+        pixelsPerSlice = static_cast<std::uint64_t>(paddedPitch) *
+                         paddedHeight * numFragmentsPerPixel;
+      }
+
+      surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth;
+
+      result.setSubresourceInfo(mipLevel, {
+                                              .dataWidth = paddedPitch,
+                                              .dataHeight = paddedHeight,
+                                              .dataDepth = paddedDepth,
+                                              .offset = surfaceOffset,
+                                              .tiledSize = surfaceSize,
+                                              .linearSize = surfaceSize,
+                                          });
+    }
+
+    surfaceOffset += arraySliceCount * surfaceSize;
+  }
+
+  result.totalSize = surfaceOffset;
+  return result;
+}
+
+SurfaceInfo amdgpu::computeSurfaceInfo(
+    TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt,
+    std::uint32_t width, std::uint32_t height, std::uint32_t depth,
+    std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
+    int mipCount, bool pow2pad) {
+  switch (tileMode.arrayMode()) {
+  case kArrayModeLinearGeneral:
+  case kArrayModeLinearAligned:
+    return computeTextureLinearInfo(
+        tileMode.arrayMode(), type, dfmt, width, height, depth, pitch,
+        baseArrayLayer, arrayCount, baseMipLevel, mipCount, pow2pad);
+
+  case kArrayMode1dTiledThin:
+  case kArrayMode1dTiledThick:
+    return computeTexture1dInfo(tileMode.arrayMode(), type, dfmt, width, height,
+                                depth, pitch, baseArrayLayer, arrayCount,
+                                baseMipLevel, mipCount, pow2pad);
+
+  case kArrayMode2dTiledThin:
+  case kArrayMode2dTiledThick:
+  case kArrayMode2dTiledXThick:
+  case kArrayMode3dTiledThin:
+  case kArrayMode3dTiledThick:
+  case kArrayMode3dTiledXThick:
+  case kArrayModeTiledThinPrt:
+  case kArrayModeTiledThickPrt:
+  case kArrayMode2dTiledThinPrt:
+  case kArrayMode2dTiledThickPrt:
+  case kArrayMode3dTiledThinPrt:
+  case kArrayMode3dTiledThickPrt:
+    std::abort();
+  }
+
+  std::abort();
+}
+
+SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer,
+                                       TileMode tileMode) {
+  return computeSurfaceInfo(
+      tileMode, tbuffer.type, tbuffer.dfmt, tbuffer.width + 1,
+      tbuffer.height + 1, tbuffer.depth + 1, tbuffer.pitch + 1,
+      tbuffer.base_array, tbuffer.last_array - tbuffer.base_array + 1,
+      tbuffer.base_level, tbuffer.last_level - tbuffer.base_level + 1,
+      tbuffer.pow2pad != 0);
+}
diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp
new file mode 100644
index 00000000..206def23
--- /dev/null
+++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp
@@ -0,0 +1,441 @@
+#include "amdgpu/tiler_cpu.hpp"
+#include "amdgpu/tiler.hpp"
+#include "gnm/gnm.hpp"
+
+constexpr std::uint64_t
+getTiledOffset1D(gnm::TextureType texType, bool isPow2Padded,
+                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode, int mipLevel,
+                 int arraySlice, int numFragments, int width, int height,
+                 int depth, int pitch, int x, int y, int z) {
+
+  using namespace amdgpu;
+  bool isCubemap = texType == gnm::TextureType::Cube;
+  bool isVolume = texType == gnm::TextureType::Dim3D;
+
+  auto bitsPerFragment = getBitsPerElement(dfmt);
+  uint32_t arraySliceCount = depth;
+
+  if (isCubemap) {
+    arraySliceCount *= 6;
+  } else if (isVolume) {
+    arraySliceCount = 1;
+  }
+
+  auto numFragmentsPerPixel = 1 << numFragments;
+  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
+  auto arrayMode = tileMode.arrayMode();
+
+  auto bitsPerElement = bitsPerFragment;
+  auto paddedWidth = std::max((mipLevel != 0 ? 
pitch : width) >> mipLevel, 1); + auto paddedHeight = std::max(height >> mipLevel, 1); + + auto tileThickness = (arrayMode == amdgpu::kArrayMode1dTiledThick) ? 4 : 1; + + if (isBlockCompressed) { + switch (bitsPerFragment) { + case 1: + bitsPerElement *= 8; + paddedWidth = std::max((paddedWidth + 7) / 8, 1); + break; + case 4: + case 8: + bitsPerElement *= 16; + paddedWidth = std::max((paddedWidth + 3) / 4, 1); + paddedHeight = std::max((paddedHeight + 3) / 4, 1); + break; + case 16: + std::abort(); + break; + + default: + std::abort(); + break; + } + } + + if (isPow2Padded) { + arraySliceCount = std::bit_ceil(arraySliceCount); + paddedWidth = std::bit_ceil(unsigned(paddedWidth)); + paddedHeight = std::bit_ceil(unsigned(paddedHeight)); + } + + uint64_t finalSurfaceOffset = 0; + uint64_t finalSurfaceSize = 0; + + auto thickness = getMicroTileThickness(arrayMode); + + for (int i = 0; i <= mipLevel; i++) { + finalSurfaceOffset += arraySliceCount * finalSurfaceSize; + + std::uint32_t elemWidth = + std::max((i > 0 ? pitch : width) >> i, 1); + std::uint32_t elemHeight = std::max(height >> i, 1); + std::uint32_t elemDepth = + std::max((isVolume ? depth : 1) >> i, 1); + + if (isBlockCompressed) { + switch (bitsPerFragment) { + case 1: + elemWidth = std::max((elemWidth + 7) / 8, 1); + break; + case 4: + case 8: + elemWidth = std::max((elemWidth + 3) / 4, 1); + elemHeight = std::max((elemHeight + 3) / 4, 1); + break; + case 16: + std::abort(); + break; + + default: + std::abort(); + break; + } + } + + if (isPow2Padded) { + elemWidth = std::bit_ceil(elemWidth); + elemHeight = std::bit_ceil(elemHeight); + elemDepth = std::bit_ceil(elemDepth); + } + + elemWidth = (elemWidth + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1); + elemHeight = (elemHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1); + elemDepth = (elemDepth + thickness - 1) & ~(thickness - 1); + + std::uint32_t tempPitch = elemWidth; + std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) * + elemHeight * bitsPerElement * + numFragmentsPerPixel; + logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; + + uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; + while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) { + tempPitch += 8; + logicalSliceSizeBytes = std::uint64_t(tempPitch) * elemHeight * + bitsPerElement * numFragmentsPerPixel; + logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; + physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; + } + + finalSurfaceSize = logicalSliceSizeBytes * elemDepth; + } + + finalSurfaceOffset += finalSurfaceSize * (uint64_t)arraySlice; + + auto tileBytes = + (kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement + + 7) / + 8; + auto tilesPerRow = paddedWidth / kMicroTileWidth; + auto tilesPerSlice = + std::max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1U); + + uint64_t elementIndex = getElementIndex(x, y, z, bitsPerElement, + tileMode.microTileMode(), arrayMode); + + uint64_t sliceOffset = (z / tileThickness) * tilesPerSlice * tileBytes; + + uint64_t tileRowIndex = y / kMicroTileHeight; + uint64_t tileColumnIndex = x / kMicroTileWidth; + uint64_t tileOffset = + (tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes; + + uint64_t elementOffset = elementIndex * bitsPerElement; + uint64_t finalOffset = (sliceOffset + tileOffset) * 8 + elementOffset; + + return finalOffset + finalSurfaceOffset * 8; +} + +constexpr std::uint64_t getTiledOffsetLinear(gnm::DataFormat dfmt, int height, + int pitch, int x, int y, int z) { + auto 
bitsPerFragment = getBitsPerElement(dfmt); + + auto bitsPerElement = bitsPerFragment; + auto paddedHeight = height; + auto paddedWidth = pitch; + + if (bitsPerFragment == 1) { + bitsPerElement *= 8; + paddedWidth = std::max((paddedWidth + 7) / 8, 1); + } + + uint64_t tiledRowSizeBits = bitsPerElement * paddedWidth; + uint64_t tiledSliceBits = paddedWidth * paddedHeight * bitsPerElement; + return tiledSliceBits * z + tiledRowSizeBits * y + bitsPerElement * x; +} + +constexpr std::uint64_t +getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded, + gnm::DataFormat dfmt, amdgpu::TileMode tileMode, + amdgpu::MacroTileMode macroTileMode, int mipLevel, + int arraySlice, int numFragments, int width, int height, + int depth, int pitch, int x, int y, int z, int fragmentIndex) { + using namespace amdgpu; + + bool isCubemap = texType == gnm::TextureType::Cube; + bool isVolume = texType == gnm::TextureType::Dim3D; + auto m_bitsPerFragment = getBitsPerElement(dfmt); + + auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1; + auto tileSwizzleMask = 0; + auto numFragmentsPerPixel = 1 << numFragments; + auto arrayMode = tileMode.arrayMode(); + + auto tileThickness = 1; + + switch (arrayMode) { + case amdgpu::kArrayMode2dTiledThin: + case amdgpu::kArrayMode3dTiledThin: + case amdgpu::kArrayModeTiledThinPrt: + case amdgpu::kArrayMode2dTiledThinPrt: + case amdgpu::kArrayMode3dTiledThinPrt: + tileThickness = 1; + break; + case amdgpu::kArrayMode1dTiledThick: + case amdgpu::kArrayMode2dTiledThick: + case amdgpu::kArrayMode3dTiledThick: + case amdgpu::kArrayModeTiledThickPrt: + case amdgpu::kArrayMode2dTiledThickPrt: + case amdgpu::kArrayMode3dTiledThickPrt: + tileThickness = 4; + break; + case amdgpu::kArrayMode2dTiledXThick: + case amdgpu::kArrayMode3dTiledXThick: + tileThickness = 8; + break; + default: + break; + } + + auto bitsPerElement = m_bitsPerFragment; + auto paddedWidth = pitch; + auto paddedHeight = height; + + if (m_isBlockCompressed) { + switch (m_bitsPerFragment) { + case 1: + bitsPerElement *= 8; + paddedWidth = std::max((paddedWidth + 7) / 8, 1); + break; + case 4: + case 8: + bitsPerElement *= 16; + paddedWidth = std::max((paddedWidth + 3) / 4, 1); + paddedHeight = std::max((paddedHeight + 3) / 4, 1); + break; + case 16: + std::abort(); + break; + default: + std::abort(); + break; + } + } + + auto bankWidthHW = macroTileMode.bankWidth(); + auto bankHeightHW = macroTileMode.bankHeight(); + auto macroAspectHW = macroTileMode.macroTileAspect(); + auto numBanksHW = macroTileMode.numBanks(); + + auto bankWidth = 1 << bankWidthHW; + auto bankHeight = 1 << bankHeightHW; + unsigned numBanks = 2 << numBanksHW; + auto macroTileAspect = 1 << macroAspectHW; + + uint32_t tileBytes1x = + (tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight + + 7) / + 8; + + auto sampleSplitHw = tileMode.sampleSplit(); + auto tileSplitHw = tileMode.tileSplit(); + uint32_t sampleSplit = 1 << sampleSplitHw; + uint32_t tileSplitC = + (tileMode.microTileMode() == amdgpu::kMicroTileModeDepth) + ? 
(64 << tileSplitHw) + : std::max(256U, tileBytes1x * sampleSplit); + + auto tileSplitBytes = std::min(kDramRowSize, tileSplitC); + + auto numPipes = getPipeCount(tileMode.pipeConfig()); + auto pipeInterleaveBits = std::countr_zero(kPipeInterleaveBytes); + auto pipeInterleaveMask = (1 << pipeInterleaveBits) - 1; + auto pipeBits = std::countr_zero(numPipes); + auto bankBits = std::countr_zero(numBanks); + // auto pipeMask = (numPipes - 1) << pipeInterleaveBits; + auto bankSwizzleMask = tileSwizzleMask; + auto pipeSwizzleMask = 0; + auto macroTileWidth = + (kMicroTileWidth * bankWidth * numPipes) * macroTileAspect; + auto macroTileHeight = + (kMicroTileHeight * bankHeight * numBanks) / macroTileAspect; + + auto microTileMode = tileMode.microTileMode(); + + uint64_t elementIndex = + getElementIndex(x, y, z, bitsPerElement, microTileMode, arrayMode); + + uint32_t xh = x, yh = y; + if (arrayMode == amdgpu::kArrayModeTiledThinPrt || + arrayMode == amdgpu::kArrayModeTiledThickPrt) { + xh %= macroTileWidth; + yh %= macroTileHeight; + } + uint64_t pipe = getPipeIndex(xh, yh, tileMode.pipeConfig()); + uint64_t bank = + getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes); + + uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness * + bitsPerElement * numFragmentsPerPixel + + 7) / + 8; + + uint64_t elementOffset = 0; + if (microTileMode == amdgpu::kMicroTileModeDepth) { + uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel; + elementOffset = pixelOffset + (fragmentIndex * bitsPerElement); + } else { + uint64_t fragmentOffset = + fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8; + elementOffset = fragmentOffset + (elementIndex * bitsPerElement); + } + + uint64_t slicesPerTile = 1; + uint64_t tileSplitSlice = 0; + if (tileBytes > tileSplitBytes && tileThickness == 1) { + slicesPerTile = tileBytes / tileSplitBytes; + tileSplitSlice = elementOffset / (tileSplitBytes * 8); + elementOffset %= (tileSplitBytes * 8); + tileBytes = tileSplitBytes; + } + + uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) * + (macroTileHeight / kMicroTileHeight) * tileBytes / + (numPipes * numBanks); + uint64_t macroTilesPerRow = paddedWidth / macroTileWidth; + uint64_t macroTileRowIndex = y / macroTileHeight; + uint64_t macroTileColumnIndex = x / macroTileWidth; + uint64_t macroTileIndex = + (macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex; + uint64_t macro_tile_offset = macroTileIndex * macroTileBytes; + uint64_t macroTilesPerSlice = + macroTilesPerRow * (paddedHeight / macroTileHeight); + uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes; + + uint32_t slice = z; + uint64_t sliceOffset = + (tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes; + if (arraySlice != 0) { + slice = arraySlice; + } + + uint64_t tileRowIndex = (y / kMicroTileHeight) % bankHeight; + uint64_t tileColumnIndex = ((x / kMicroTileWidth) / numPipes) % bankWidth; + uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex; + uint64_t tileOffset = tileIndex * tileBytes; + + uint64_t bankSwizzle = bankSwizzleMask; + uint64_t pipeSwizzle = pipeSwizzleMask; + + uint64_t pipeSliceRotation = 0; + switch (arrayMode) { + case amdgpu::kArrayMode3dTiledThin: + case amdgpu::kArrayMode3dTiledThick: + case amdgpu::kArrayMode3dTiledXThick: + pipeSliceRotation = + std::max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness); + break; + default: + break; + } + pipeSwizzle += pipeSliceRotation; + pipeSwizzle &= (numPipes - 1); + pipe = pipe ^ pipeSwizzle; + 
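+  // The bank index below gets a matching rotation: 2D-tiled modes rotate the
+  // bank by (numBanks / 2 - 1) per slice, 3D-tiled modes fold the remaining
+  // pipe rotation into the banks, and thin modes rotate further per
+  // tile-split slice when a tile exceeds the split size.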
+ uint32_t sliceRotation = 0; + switch (arrayMode) { + case amdgpu::kArrayMode2dTiledThin: + case amdgpu::kArrayMode2dTiledThick: + case amdgpu::kArrayMode2dTiledXThick: + sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness); + break; + case amdgpu::kArrayMode3dTiledThin: + case amdgpu::kArrayMode3dTiledThick: + case amdgpu::kArrayMode3dTiledXThick: + sliceRotation = std::max(1UL, (numPipes / 2UL) - 1UL) * + (slice / tileThickness) / numPipes; + break; + default: + break; + } + uint64_t tileSplitSliceRotation = 0; + switch (arrayMode) { + case amdgpu::kArrayMode2dTiledThin: + case amdgpu::kArrayMode3dTiledThin: + case amdgpu::kArrayMode2dTiledThinPrt: + case amdgpu::kArrayMode3dTiledThinPrt: + tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice; + break; + default: + break; + } + bank ^= bankSwizzle + sliceRotation; + bank ^= tileSplitSliceRotation; + bank &= (numBanks - 1); + + uint64_t totalOffset = + (sliceOffset + macro_tile_offset + tileOffset) * 8 + elementOffset; + uint64_t bitOffset = totalOffset & 0x7; + totalOffset /= 8; + + uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask; + uint64_t offset = totalOffset >> pipeInterleaveBits; + + uint64_t finalByteOffset = + pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) | + (bank << (pipeInterleaveBits + pipeBits)) | + (offset << (pipeInterleaveBits + pipeBits + bankBits)); + return (finalByteOffset << 3) | bitOffset; +} + +std::uint64_t amdgpu::getTiledOffset(gnm::TextureType texType, + bool isPow2Padded, int numFragments, + gnm::DataFormat dfmt, + amdgpu::TileMode tileMode, + amdgpu::MacroTileMode macroTileMode, + int mipLevel, int arraySlice, int width, + int height, int depth, int pitch, int x, + int y, int z, int fragmentIndex) { + switch (tileMode.arrayMode()) { + case amdgpu::kArrayModeLinearGeneral: + case amdgpu::kArrayModeLinearAligned: + return getTiledOffsetLinear(dfmt, height, pitch, x, y, z); + + case amdgpu::kArrayMode1dTiledThin: + case amdgpu::kArrayMode1dTiledThick: { + return getTiledOffset1D(texType, isPow2Padded, dfmt, tileMode, mipLevel, + arraySlice, numFragments, width, height, depth, + pitch, x, y, z); + } + + case amdgpu::kArrayMode2dTiledThin: + case amdgpu::kArrayMode2dTiledThick: + case amdgpu::kArrayMode2dTiledXThick: + case amdgpu::kArrayMode3dTiledThin: + case amdgpu::kArrayMode3dTiledThick: + case amdgpu::kArrayMode3dTiledXThick: + case amdgpu::kArrayModeTiledThinPrt: + case amdgpu::kArrayModeTiledThickPrt: + case amdgpu::kArrayMode2dTiledThinPrt: + case amdgpu::kArrayMode2dTiledThickPrt: + case amdgpu::kArrayMode3dTiledThinPrt: + case amdgpu::kArrayMode3dTiledThickPrt: + return getTiledOffset2D(texType, isPow2Padded, dfmt, tileMode, + macroTileMode, mipLevel, arraySlice, numFragments, + width, height, depth, pitch, x, y, z, + fragmentIndex); + } + + std::abort(); +} diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp new file mode 100644 index 00000000..63bb1763 --- /dev/null +++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp @@ -0,0 +1,354 @@ +#include "amdgpu/tiler_vulkan.hpp" +#include "Scheduler.hpp" +#include "amdgpu/tiler.hpp" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct TilerDecriptorSetLayout { + VkDescriptorSetLayout layout; + + TilerDecriptorSetLayout() { + std::vector bindings{{ + .binding = 0, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + }}; + + 
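+    // A single uniform-buffer binding is enough: every tiler/detiler compute
+    // shader reads all of its parameters (addresses, extents, tile mode) from
+    // one Config block bound at binding 0.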
+    VkDescriptorSetLayoutCreateInfo layoutInfo{
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+        .bindingCount = static_cast<std::uint32_t>(bindings.size()),
+        .pBindings = bindings.data(),
+    };
+
+    VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
+                                          nullptr, &layout));
+  }
+
+  ~TilerDecriptorSetLayout() {
+    vkDestroyDescriptorSetLayout(vk::context->device, layout,
+                                 vk::context->allocator);
+  }
+};
+
+struct TilerShader {
+  VkShaderEXT shader;
+
+  TilerShader(TilerDecriptorSetLayout &setLayout,
+              std::span<const std::uint32_t> spirv) {
+    VkShaderCreateInfoEXT shaderInfo{
+        .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT,
+        .flags = 0,
+        .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+        .nextStage = 0,
+        .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT,
+        .codeSize = spirv.size_bytes(),
+        .pCode = spirv.data(),
+        .pName = "main",
+        .setLayoutCount = 1,
+        .pSetLayouts = &setLayout.layout,
+        .pushConstantRangeCount = 0,
+        .pPushConstantRanges = nullptr,
+        .pSpecializationInfo = nullptr,
+    };
+
+    VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &shaderInfo,
+                                   nullptr, &shader));
+  }
+
+  ~TilerShader() {
+    vk::DestroyShaderEXT(vk::context->device, shader, vk::context->allocator);
+  }
+};
+
+struct amdgpu::GpuTiler::Impl {
+  TilerDecriptorSetLayout descriptorSetLayout;
+  std::mutex descriptorMtx;
+  VkDescriptorSet descriptorSets[4]{};
+  VkDescriptorPool descriptorPool;
+  std::uint32_t inUseDescriptorSets = 0;
+
+  vk::Buffer configData;
+  TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp};
+  TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp};
+  // FIXME: 2D detiling is not implemented yet; the linear module stands in as
+  // a placeholder (the 2D paths abort before binding it).
+  TilerShader detiler2d{descriptorSetLayout, spirv_detilerLinear_comp};
+  // FIXME: the linear tiler is likewise backed by the 2D tiler module.
+  TilerShader tilerLinear{descriptorSetLayout, spirv_tiler2d_comp};
+  TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp};
+  TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp};
+  VkPipelineLayout pipelineLayout;
+
+  struct Config {
+    uint64_t srcAddress;
+    uint64_t dstAddress;
+    uint32_t dataWidth;
+    uint32_t dataHeight;
+    uint32_t tileMode;
+    uint32_t numFragments;
+    uint32_t bitsPerElement;
+    uint32_t tiledSurfaceSize;
+    uint32_t linearSurfaceSize;
+  };
+
+  Impl() {
+    std::size_t count = 256;
+
+    configData = vk::Buffer::Allocate(
+        vk::getHostVisibleMemory(), sizeof(Config) * count,
+        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+            VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
+
+    VkPipelineLayoutCreateInfo pipelineLayoutInfo{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .setLayoutCount = 1,
+        .pSetLayouts = &descriptorSetLayout.layout,
+    };
+
+    VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &pipelineLayoutInfo,
+                                     nullptr, &pipelineLayout));
+
+    {
+      VkDescriptorPoolSize poolSizes[]{{
+          .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+          // One uniform descriptor per set, not one for the whole pool.
+          .descriptorCount =
+              static_cast<std::uint32_t>(std::size(descriptorSets)),
+      }};
+
+      VkDescriptorPoolCreateInfo info{
+          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+          .maxSets = static_cast<std::uint32_t>(std::size(descriptorSets)),
+          .poolSizeCount = static_cast<std::uint32_t>(std::size(poolSizes)),
+          .pPoolSizes = poolSizes,
+      };
+
+      VK_VERIFY(vkCreateDescriptorPool(vk::context->device, &info,
+                                       vk::context->allocator,
+                                       &descriptorPool));
+    }
+
+    VkDescriptorSetAllocateInfo info{
+        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+        .descriptorPool = descriptorPool,
+        .descriptorSetCount = 1,
+        .pSetLayouts = &descriptorSetLayout.layout,
+    };
+    for (std::size_t i = 0; i < std::size(descriptorSets); ++i) {
+      VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info,
+                                         descriptorSets + i));
+    }
+  }
+
+  ~Impl() {
+    // Shaders and configData clean up via their own destructors; only the
+    // pool and the pipeline layout are destroyed manually.
+    vkDestroyDescriptorPool(vk::context->device, descriptorPool,
+                            vk::context->allocator);
+    vkDestroyPipelineLayout(vk::context->device, pipelineLayout,
+                            vk::context->allocator);
+  }
+
+  std::uint32_t allocateDescriptorSlot() {
+    std::lock_guard lock(descriptorMtx);
+
+    // countr_one counts the run of set bits from the least significant end,
+    // i.e. the index of the first free slot.
+    auto result = std::countr_one(inUseDescriptorSets);
+    rx::dieIf(std::size_t(result) >= std::size(descriptorSets),
+              "out of tiler descriptor sets");
+    inUseDescriptorSets |= (1u << result);
+
+    return result;
+  }
+
+  void releaseDescriptorSlot(std::uint32_t slot) {
+    std::lock_guard lock(descriptorMtx);
+    inUseDescriptorSets &= ~(1u << slot);
+  }
+};
+
+amdgpu::GpuTiler::GpuTiler() { mImpl = std::make_unique<Impl>(); }
+amdgpu::GpuTiler::~GpuTiler() = default;
+
+void amdgpu::GpuTiler::detile(Scheduler &scheduler,
+                              const amdgpu::SurfaceInfo &info,
+                              amdgpu::TileMode tileMode,
+                              std::uint64_t srcTiledAddress,
+                              std::uint64_t dstLinearAddress, int mipLevel,
+                              int baseArray, int arrayCount) {
+  auto commandBuffer = scheduler.getCommandBuffer();
+  auto slot = mImpl->allocateDescriptorSlot();
+
+  // FIXME: assumes sizeof(Config) satisfies the device's
+  // minUniformBufferOffsetAlignment for the buffer offset used below.
+  auto configOffset = slot * sizeof(Impl::Config);
+  auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
+                                                 configOffset);
+
+  auto &subresource = info.getSubresourceInfo(mipLevel);
+  config->srcAddress = srcTiledAddress + subresource.offset +
+                       (subresource.tiledSize * baseArray);
+  config->dstAddress = dstLinearAddress + (subresource.linearSize * baseArray);
+  config->dataWidth = subresource.dataWidth;
+  config->dataHeight = subresource.dataHeight;
+  config->tileMode = tileMode.raw;
+  config->numFragments = info.numFragments;
+  config->bitsPerElement = info.bitsPerElement;
+  uint32_t groupCountZ = subresource.dataDepth;
+
+  // For array textures one dispatch covers every slice: the shader steps by
+  // the per-slice sizes and Z indexes the slice instead of the depth.
+  if (arrayCount > 1) {
+    config->tiledSurfaceSize = subresource.tiledSize;
+    config->linearSurfaceSize = subresource.linearSize;
+    groupCountZ = arrayCount;
+  } else {
+    config->tiledSurfaceSize = 0;
+    config->linearSurfaceSize = 0;
+  }
+
+  VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
+
+  switch (tileMode.arrayMode()) {
+  case amdgpu::kArrayModeLinearGeneral:
+  case amdgpu::kArrayModeLinearAligned:
+    vk::CmdBindShadersEXT(commandBuffer, 1, stages,
+                          &mImpl->detilerLinear.shader);
+    break;
+
+  case amdgpu::kArrayMode1dTiledThin:
+  case amdgpu::kArrayMode1dTiledThick:
+    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
+    break;
+
+  case amdgpu::kArrayMode2dTiledThin:
+  case amdgpu::kArrayModeTiledThinPrt:
+  case amdgpu::kArrayMode2dTiledThinPrt:
+  case amdgpu::kArrayMode2dTiledThick:
+  case amdgpu::kArrayMode2dTiledXThick:
+  case amdgpu::kArrayModeTiledThickPrt:
+  case amdgpu::kArrayMode2dTiledThickPrt:
+  case amdgpu::kArrayMode3dTiledThinPrt:
+  case amdgpu::kArrayMode3dTiledThin:
+  case amdgpu::kArrayMode3dTiledThick:
+  case amdgpu::kArrayMode3dTiledXThick:
+  case amdgpu::kArrayMode3dTiledThickPrt:
+    // FIXME: 2D detiling is unimplemented; abort before binding the
+    // placeholder shader.
+    std::abort();
+    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
+    break;
+  }
+
+  VkDescriptorBufferInfo bufferInfo{
+      .buffer = mImpl->configData.getHandle(),
+      .offset = configOffset,
+      .range = sizeof(Impl::Config),
+  };
+
+  VkWriteDescriptorSet writeDescSet{
+      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+      .dstSet = mImpl->descriptorSets[slot],
+      .dstBinding = 0,
+      .descriptorCount = 1,
+      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+      .pBufferInfo = &bufferInfo,
+  };
+
+  vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
+
+  vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+                          mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr); + + vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight, + groupCountZ); + + scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); }); +} + +void amdgpu::GpuTiler::tile(Scheduler &scheduler, + const amdgpu::SurfaceInfo &info, + amdgpu::TileMode tileMode, + std::uint64_t srcLinearAddress, + std::uint64_t dstTiledAddress, int mipLevel, + int baseArray, int arrayCount) { + auto commandBuffer = scheduler.getCommandBuffer(); + auto slot = mImpl->allocateDescriptorSlot(); + + auto configOffset = slot * sizeof(Impl::Config); + auto config = reinterpret_cast(mImpl->configData.getData() + + configOffset); + + auto &subresource = info.getSubresourceInfo(mipLevel); + config->srcAddress = srcLinearAddress + subresource.offset + + subresource.linearSize * baseArray; + config->dstAddress = dstTiledAddress; + config->dataWidth = subresource.dataWidth; + config->dataHeight = subresource.dataHeight; + config->tileMode = tileMode.raw; + config->numFragments = info.numFragments; + config->bitsPerElement = info.bitsPerElement; + uint32_t groupCountZ = subresource.dataDepth; + + if (arrayCount > 1) { + config->tiledSurfaceSize = subresource.tiledSize; + config->linearSurfaceSize = subresource.linearSize; + groupCountZ = arrayCount; + } else { + config->tiledSurfaceSize = 0; + config->linearSurfaceSize = 0; + } + + VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT}; + + switch (tileMode.arrayMode()) { + case amdgpu::kArrayModeLinearGeneral: + case amdgpu::kArrayModeLinearAligned: + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tilerLinear.shader); + break; + + case amdgpu::kArrayMode1dTiledThin: + case amdgpu::kArrayMode1dTiledThick: + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader); + break; + + case amdgpu::kArrayMode2dTiledThin: + case amdgpu::kArrayModeTiledThinPrt: + case amdgpu::kArrayMode2dTiledThinPrt: + case amdgpu::kArrayMode2dTiledThick: + case amdgpu::kArrayMode2dTiledXThick: + case amdgpu::kArrayModeTiledThickPrt: + case amdgpu::kArrayMode2dTiledThickPrt: + case amdgpu::kArrayMode3dTiledThinPrt: + case amdgpu::kArrayMode3dTiledThin: + case amdgpu::kArrayMode3dTiledThick: + case amdgpu::kArrayMode3dTiledXThick: + case amdgpu::kArrayMode3dTiledThickPrt: + std::abort(); + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader); + break; + } + + VkDescriptorBufferInfo bufferInfo{ + .buffer = mImpl->configData.getHandle(), + .offset = configOffset, + .range = sizeof(Impl::Config), + }; + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = mImpl->descriptorSets[slot], + .dstBinding = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .pBufferInfo = &bufferInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, + mImpl->pipelineLayout, 0, 1, + &mImpl->descriptorSets[slot], 0, nullptr); + + vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight, + groupCountZ); + + scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); }); +} diff --git a/rpcsx-gpu2/lib/gcn-shader/CMakeLists.txt b/rpcsx-gpu2/lib/gcn-shader/CMakeLists.txt new file mode 100644 index 00000000..9aff3961 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/CMakeLists.txt @@ -0,0 +1,48 @@ +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/) + +add_custom_command( + OUTPUT 
${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp + COMMAND $ ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp + DEPENDS spv-gen + WORKING_DIRECTORY $/spirv/unified1 + COMMENT "Generating ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp..." +) + +add_custom_target(shader-spv-dialect-gen DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp) +add_library(shader-spv-dialect INTERFACE) +add_dependencies(shader-spv-dialect shader-spv-dialect-gen) +target_include_directories(shader-spv-dialect INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/include/) + +add_library(gcn-shader STATIC + src/analyze.cpp + src/eval.cpp + src/Evaluator.cpp + src/gcn.cpp + src/GcnConverter.cpp + src/GcnInstruction.cpp + src/glsl.cpp + src/ModuleInfo.cpp + src/opt.cpp + src/SemanticModuleInfo.cpp + src/spv.cpp + src/SpvConverter.cpp + src/SpvTypeInfo.cpp + src/transform.cpp +) + +target_include_directories(gcn-shader PUBLIC include PRIVATE include/shader) + +target_link_libraries(gcn-shader +PUBLIC + shader-spv-dialect + rx + +PRIVATE + glslang::glslang + glslang::SPIRV + SPIRV-Tools + SPIRV-Tools-opt + spirv-cross-c-shared +) + +add_subdirectory(shaders) diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/Access.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/Access.hpp new file mode 100644 index 00000000..df9c01cb --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/Access.hpp @@ -0,0 +1,26 @@ +#pragma once + +namespace shader { +enum class Access { + None = 0, + Read = 1 << 0, + Write = 1 << 1, + ReadWrite = Read | Write +}; + +constexpr Access operator|(Access lhs, Access rhs) { + return static_cast(static_cast(lhs) | static_cast(rhs)); +} +constexpr Access operator&(Access lhs, Access rhs) { + return static_cast(static_cast(lhs) & static_cast(rhs)); +} +constexpr Access operator~(Access rhs) { + return static_cast(~static_cast(rhs)); +} +constexpr Access &operator|=(Access &lhs, Access rhs) { + return ((lhs = lhs | rhs)); +} +constexpr Access &operator&=(Access &lhs, Access rhs) { + return ((lhs = lhs & rhs)); +} +} // namespace shader diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp new file mode 100644 index 00000000..58e8226c --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp @@ -0,0 +1,20 @@ +#pragma once +#include "eval.hpp" +#include + +namespace shader::eval { +class Evaluator { + std::map values; + +public: + virtual ~Evaluator() = default; + + void invalidate(ir::Value node) { values.erase(node); } + void setValue(ir::Value node, Value value) { values[node] = value; } + + Value eval(const ir::Operand &op, ir::Value type = nullptr); + virtual Value eval(ir::Value op); + virtual Value eval(ir::InstructionId instId, + std::span operands); +}; +} // namespace shader::eval diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/GcnConverter.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/GcnConverter.hpp new file mode 100644 index 00000000..a35ff109 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/GcnConverter.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include "gcn.hpp" +#include "rx/MemoryTable.hpp" +#include +#include +#include + +namespace shader::gcn { +enum class PsVGprInput { + IPerspSample, + JPerspSample, + IPerspCenter, + JPerspCenter, + IPerspCentroid, + JPerspCentroid, + IW, + JW, + _1W, + ILinearSample, + JLinearSample, + ILinearCenter, + JLinearCenter, + ILinearCentroid, + JLinearCentroid, + X, + Y, + Z, + W, + FrontFace, + Ancillary, + SampleCoverage, + PosFixed, + + Count +}; 
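+// The order above mirrors how the SPI provides input VGPRs to a GCN pixel
+// shader: barycentric I/J pairs for each interpolation mode, the I/W, J/W and
+// 1/W pull-model terms, screen-space position, then the front-face, ancillary
+// and sample-coverage words. Count doubles as the size of lookup tables
+// indexed by this enum.
+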
+enum class ConfigType { + Imm, + UserSgpr, + ResourceSlot, + MemoryTable, + Gds, + PsInputVGpr, + VsPrimType, + CbCompSwap, + ViewPortOffsetX, + ViewPortOffsetY, + ViewPortOffsetZ, + ViewPortScaleX, + ViewPortScaleY, + ViewPortScaleZ, +}; + +struct ConfigSlot { + ConfigType type; + std::uint64_t data; +}; + +struct Resources { + struct Resource { + std::uint32_t resourceSlot; + }; + + struct Pointer : Resource { + std::uint32_t size; + ir::Value base; + ir::Value offset; + }; + + struct Texture : Resource { + Access access; + ir::Value words[8]; + }; + + struct Buffer : Resource { + Access access; + ir::Value words[4]; + }; + + struct Sampler : Resource { + bool unorm; + ir::Value words[4]; + }; + + spv::Context context; + bool hasUnknown = false; + std::uint32_t slots = 0; + std::vector pointers; + std::vector textures; + std::vector buffers; + std::vector samplers; + + void print(std::ostream &os, ir::NameStorage &ns) const; + void dump(); +}; + +struct ShaderInfo { + std::vector configSlots; + rx::MemoryAreaTable<> memoryMap; + std::vector> requiredSgprs; + Resources resources; + + std::uint32_t create(ConfigType type, std::uint64_t data) { + for (std::size_t slotIndex = 0; auto &slotInfo : configSlots) { + if (slotInfo.type == type && slotInfo.data == data) { + return slotIndex; + } + + slotIndex++; + } + + configSlots.push_back({ + .type = type, + .data = data, + }); + + return configSlots.size() - 1; + } +}; + +struct ConvertedShader { + std::vector spv; + ShaderInfo info; +}; + +std::optional +convertToSpv(Context &context, ir::Region body, + const SemanticModuleInfo &semanticModule, Stage stage, + const Environment &state); + +} // namespace shader::gcn diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/GcnInstruction.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/GcnInstruction.hpp new file mode 100644 index 00000000..4c4fad05 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/GcnInstruction.hpp @@ -0,0 +1,256 @@ +#pragma once + +#include "dialect.hpp" +#include "ir/Kind.hpp" + +#include +#include +#include +#include + +namespace shader { +struct GcnOperand { + enum class Kind : std::uint8_t { + Invalid, + Constant, + Immediate, + VccLo, + VccHi, + M0, + ExecLo, + ExecHi, + Scc, + VccZ, + ExecZ, + LdsDirect, + Vgpr, + Sgpr, + Attr, + Buffer, + Texture128, + Texture256, + Sampler, + Pointer, + }; + + static constexpr auto R = 1 << 0; + static constexpr auto W = 1 << 1; + + union { + std::uint32_t value; + std::uint64_t address = 0; + + struct { + std::uint16_t attrId; + std::uint16_t attrChannel; + }; + + struct { + Kind firstRegisterKind; + union { + struct { + Kind pointerOffsetKind; + std::uint16_t pointeeSize; + }; + bool samplerUnorm; + }; + std::uint32_t firstRegisterIndex; + + union { + std::uint32_t pointerOffsetValue; + std::uint64_t pointerOffsetAddress; + }; + }; + }; + + Kind kind = Kind::Invalid; + std::uint8_t access = 0; + std::uint8_t omod : 4 = 0; + bool abs : 1 = false; + bool clamp : 1 = false; + bool neg : 1 = false; + + constexpr GcnOperand getUnderlyingOperand(int offset = 0) const { + return { + .value = firstRegisterIndex + offset, + .kind = firstRegisterKind, + }; + } + + constexpr GcnOperand getPointerOffsetOperand() const { + return { + .address = pointerOffsetAddress, + .kind = pointerOffsetKind, + }; + } + + static constexpr GcnOperand createImmediateConstant(std::uint64_t address) { + return GcnOperand{ + .address = address, + .kind = Kind::Immediate, + .access = R, + }; + } + + static constexpr GcnOperand createConstant(std::uint32_t 
value) { + return GcnOperand{ + .value = value, + .kind = Kind::Constant, + .access = R, + }; + } + + static constexpr GcnOperand createConstant(bool value) { + return createConstant(std::uint32_t(value ? 1 : 0)); + } + + static constexpr GcnOperand createConstant(float value) { + return createConstant(std::bit_cast(value)); + } + + static constexpr GcnOperand createVgpr(std::uint32_t index) { + return { + .value = index, + .kind = Kind::Vgpr, + }; + } + + static constexpr GcnOperand createSgpr(std::uint32_t index) { + return { + .value = index, + .kind = Kind::Sgpr, + }; + } + + static constexpr GcnOperand createSampler(GcnOperand firstReg, bool unorm) { + return { + .firstRegisterKind = firstReg.kind, + .samplerUnorm = unorm, + .firstRegisterIndex = static_cast(firstReg.value), + .kind = Kind::Sampler, + }; + } + static constexpr GcnOperand createTexture(GcnOperand firstReg, bool is128) { + return { + .firstRegisterKind = firstReg.kind, + .firstRegisterIndex = static_cast(firstReg.value), + .kind = (is128 ? Kind::Texture128 : Kind::Texture256), + }; + } + static constexpr GcnOperand createBuffer(GcnOperand firstReg) { + return { + .firstRegisterKind = firstReg.kind, + .firstRegisterIndex = static_cast(firstReg.value), + .kind = Kind::Buffer, + }; + } + static constexpr GcnOperand + createPointer(GcnOperand firstReg, std::uint16_t size, GcnOperand offset) { + return { + .firstRegisterKind = firstReg.kind, + .pointerOffsetKind = offset.kind, + .pointeeSize = size, + .firstRegisterIndex = static_cast(firstReg.value), + .pointerOffsetAddress = offset.address, + .kind = Kind::Pointer, + }; + } + + static constexpr GcnOperand createAttr(std::uint16_t id, + std::uint16_t channel) { + return { + .attrId = id, + .attrChannel = channel, + .kind = Kind::Attr, + }; + } + + constexpr GcnOperand withRW() const { return withAccess(R | W); } + constexpr GcnOperand withR() const { return withAccess(R); } + constexpr GcnOperand withW() const { return withAccess(W); } + + constexpr GcnOperand withAccess(std::uint8_t access) const { + GcnOperand result = *this; + result.access = access; + return result; + } + + constexpr GcnOperand withNeg(bool value) const { + GcnOperand result = *this; + result.neg = value; + return result; + } + + constexpr GcnOperand withAbs(bool value) const { + GcnOperand result = *this; + result.abs = value; + return result; + } + + constexpr GcnOperand withClamp(bool value) const { + GcnOperand result = *this; + result.clamp = value; + return result; + } + + constexpr GcnOperand withOutputModifier(std::uint8_t value) const { + GcnOperand result = *this; + result.omod = value; + return result; + } + + static constexpr GcnOperand createVccLo() { return {.kind = Kind::VccLo}; } + static constexpr GcnOperand createVccHi() { return {.kind = Kind::VccHi}; } + static constexpr GcnOperand createM0() { return {.kind = Kind::M0}; } + static constexpr GcnOperand createExecLo() { return {.kind = Kind::ExecLo}; } + static constexpr GcnOperand createExecHi() { return {.kind = Kind::ExecHi}; } + static constexpr GcnOperand createVccZ() { return {.kind = Kind::VccZ}; } + static constexpr GcnOperand createExecZ() { return {.kind = Kind::ExecZ}; } + static constexpr GcnOperand createScc() { return {.kind = Kind::Scc}; } + static constexpr GcnOperand createLdsDirect() { + return {.kind = Kind::LdsDirect}; + } + + void print(std::ostream &os) const; + void dump() const; +}; + +struct GcnInstruction { + ir::Kind kind = ir::Kind::Builtin; + unsigned op = ir::builtin::INVALID_INSTRUCTION; + GcnOperand 
operands[16]; + std::size_t operandCount{}; + + std::span getOperands() const { + return {operands, operandCount}; + } + + const GcnOperand &getOperand(std::size_t index) const { + if (index >= operandCount) { + std::abort(); + } + return operands[index]; + } + + void addOperand(GcnOperand op) { + if (operandCount >= std::size(operands)) { + std::abort(); + } + + operands[operandCount++] = op; + } + + template + bool operator==(T testOp) + requires(ir::kOpToKind> != ir::Kind::Count) + { + return ir::kOpToKind> == kind && op == testOp; + } + + void print(std::ostream &os) const; + void dump() const; +}; + +void readGcnInst(GcnInstruction &isaInst, std::uint64_t &address, + const std::function &readMemory); +} // namespace shader \ No newline at end of file diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/ModuleInfo.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/ModuleInfo.hpp new file mode 100644 index 00000000..af013a38 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/ModuleInfo.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "Access.hpp" +#include "ir/Value.hpp" +#include "spv.hpp" +#include +#include + +namespace shader { +struct ModuleInfo { + struct Param { + ir::Value type; + Access access = Access::None; + }; + + struct Function { + std::map variables; + std::vector parameters; + ir::Value returnType; + }; + + std::map functions; +}; + +ModuleInfo::Function &collectFunctionInfo(ModuleInfo &moduleInfo, + ir::Value function); +void collectModuleInfo(ModuleInfo &moduleInfo, const spv::BinaryLayout &layout); +} // namespace shader diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/SemanticInfo.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/SemanticInfo.hpp new file mode 100644 index 00000000..5f6e62e4 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/SemanticInfo.hpp @@ -0,0 +1,46 @@ +#pragma once + +#include "ModuleInfo.hpp" +#include "SpvTypeInfo.hpp" + +namespace shader { +struct SemanticModuleInfo : ModuleInfo { + std::unordered_map semantics; + + ir::Value findSemanticOf(ir::InstructionId sem) const { + auto semIt = semantics.find(sem); + if (semIt == semantics.end()) { + return nullptr; + } + + return semIt->second; + } +}; + +struct SemanticInfo { + struct Param { + spv::TypeInfo type; + Access access = Access::None; + }; + + struct Function { + std::unordered_map registerAccesses; + std::vector parameters; + spv::TypeInfo returnType; + Access bufferAccess = Access::None; + }; + + std::unordered_map semantics; + + const Function *findSemantic(ir::InstructionId sem) const { + if (auto it = semantics.find(sem); it != semantics.end()) { + return &it->second; + } + + return nullptr; + } +}; + +void collectSemanticModuleInfo(SemanticModuleInfo &moduleInfo, + const spv::BinaryLayout &layout); +} // namespace shader diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/SpvConverter.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/SpvConverter.hpp new file mode 100644 index 00000000..0c4ae8c3 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/SpvConverter.hpp @@ -0,0 +1,154 @@ +#pragma once +#include "SpvTypeInfo.hpp" +#include "dialect/spv.hpp" +#include "spv.hpp" + +namespace shader::spv { +struct Import : ir::CloneMap { + ir::Node getOrCloneImpl(ir::Context &context, ir::Node node, + bool isOperand) override; +}; + +struct Context : ir::Context { + BinaryLayout layout; + ir::Location rootLocation; + + ir::NameStorage ns; + ir::Value perVertex; + std::map outputs; + std::map inputs; + + ir::RegionLike localVariables; + ir::RegionLike epilogue; + 
ir::Value entryPoint; + + std::map> globals; + std::map> constants; + + Context(); + + ir::Value createRegionWithLabel(ir::Location loc); + + void setName(ir::spv::IdRef inst, std::string name); + void setConstantName(ir::Value constant); + + ir::Value getOrCreateConstant(ir::Value typeValue, const ir::Operand &value); + + ir::Value getType(ir::spv::Op baseType, int width, bool isSigned); + ir::Value getType(const TypeInfo &info); + + ir::Value imm64(std::uint64_t value) { + return getOrCreateConstant(getTypeUInt64(), value); + } + ir::Value imm32(std::uint32_t value) { + return getOrCreateConstant(getTypeUInt32(), value); + } + + ir::Value simm64(std::int64_t value) { + return getOrCreateConstant(getTypeSInt64(), value); + } + ir::Value simm32(std::int32_t value) { + return getOrCreateConstant(getTypeSInt32(), value); + } + ir::Value fimm64(double value) { + return getOrCreateConstant(getTypeFloat(64), value); + } + ir::Value fimm32(float value) { + return getOrCreateConstant(getTypeFloat(32), value); + } + ir::Value getBool(bool value) { return value ? getTrue() : getFalse(); } + ir::Value getTrue() { + return getOrCreateGlobal(ir::spv::OpConstantTrue, {{getTypeBool()}}); + } + ir::Value getFalse() { + return getOrCreateGlobal(ir::spv::OpConstantFalse, {{getTypeBool()}}); + } + + ir::Value getIndex(std::int32_t index) { return simm32(index); } + + void setTypeName(ir::Value type); + + void addGlobal(ir::Value type) { + globals[type.getInstId()].push_back(type); + setTypeName(type); + } + + ir::Value findGlobal(ir::spv::Op op, + std::span operands = {}) const; + ir::Value createGlobal(ir::spv::Op op, std::span operands); + ir::Value getOrCreateGlobal(ir::spv::Op op, + std::span operands = {}); + + ir::Value getTypeInt(int width, bool sign) { + return getOrCreateGlobal(ir::spv::OpTypeInt, {{width, sign ? 1 : 0}}); + } + ir::Value getTypeFloat(int width) { + return getOrCreateGlobal(ir::spv::OpTypeFloat, {{width}}); + } + ir::Value getTypeVoid() { return getOrCreateGlobal(ir::spv::OpTypeVoid); } + ir::Value getTypeBool() { return getOrCreateGlobal(ir::spv::OpTypeBool); } + ir::Value getTypeSampler() { + return getOrCreateGlobal(ir::spv::OpTypeSampler); + } + ir::Value getTypeArray(ir::Value elementType, ir::Value count) { + return getOrCreateGlobal(ir::spv::OpTypeArray, {{elementType, count}}); + } + ir::Value getTypeVector(ir::Value elementType, int count) { + return getOrCreateGlobal(ir::spv::OpTypeVector, {{elementType, count}}); + } + + ir::Value getTypeStruct(auto... 
elements) { + return getOrCreateGlobal(ir::spv::OpTypeStruct, {{elements...}}); + } + ir::Value getTypeSInt8() { return getTypeInt(8, true); } + ir::Value getTypeUInt8() { return getTypeInt(8, false); } + ir::Value getTypeSInt16() { return getTypeInt(16, true); } + ir::Value getTypeUInt16() { return getTypeInt(16, false); } + ir::Value getTypeSInt32() { return getTypeInt(32, true); } + ir::Value getTypeUInt32() { return getTypeInt(32, false); } + ir::Value getTypeSInt64() { return getTypeInt(64, true); } + ir::Value getTypeUInt64() { return getTypeInt(64, false); } + ir::Value getTypeFloat16() { return getTypeFloat(16); } + ir::Value getTypeFloat32() { return getTypeFloat(32); } + ir::Value getTypeFloat64() { return getTypeFloat(64); } + + ir::Value getTypeFunction(ir::Value returnType, + std::span params) { + std::vector operands; + operands.reserve(1 + params.size()); + operands.push_back(returnType); + for (auto param : params) { + operands.push_back(param); + } + return getOrCreateGlobal(ir::spv::OpTypeFunction, operands); + } + + ir::Value getTypePointer(ir::spv::StorageClass storageClass, + ir::spv::IdRef pointeeType) { + return getOrCreateGlobal(ir::spv::OpTypePointer, + {{storageClass, pointeeType}}); + } + + ir::Value getTypeImage(ir::spv::IdRef sampledType, ir::spv::Dim dim, + std::int32_t depth, bool arrayed, bool multisampled, + std::int32_t sampled, ir::spv::ImageFormat format) { + return getOrCreateGlobal( + ir::spv::OpTypeImage, + {{sampledType, dim, depth, arrayed, multisampled, sampled, format}}); + } + + ir::Value getOperandValue(const ir::Operand &op, ir::Value type = {}); + + void createPerVertex(); + + ir::Value createUniformBuffer(int descriptorSet, int binding, + ir::Value structType); + + ir::Value createRuntimeArrayUniformBuffer(int descriptorSet, int binding, + ir::Value elementType); + + ir::Value createOutput(ir::Location loc, int index); + ir::Value createInput(ir::Location loc, int index); + ir::Value createAttr(ir::Location loc, int attrId, bool perVertex, bool flat); +}; +} // namespace shader::spv diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/SpvTypeInfo.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/SpvTypeInfo.hpp new file mode 100644 index 00000000..8279306b --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/SpvTypeInfo.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "dialect/spv.hpp" + +namespace shader::spv { +struct TypeInfo { + ir::spv::Op baseType = {}; + ir::spv::Op componentType = {}; + int componentWidth = 0; + int componentsCount = 1; + bool isSigned = false; + + int width() const { return componentWidth * componentsCount; } + bool operator==(const TypeInfo &other) const = default; +}; + +TypeInfo getTypeInfo(ir::Value type); +} // namespace shader::spv diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/Vector.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/Vector.hpp new file mode 100644 index 00000000..5cc827f2 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/Vector.hpp @@ -0,0 +1,129 @@ +#pragma once + +#include +#include + +namespace shader { +template struct Vector : std::array { + using std::array::array; + + template + constexpr explicit operator Vector() const { + Vector result; + for (std::size_t i = 0; i < N; ++i) { + result[i] = static_cast((*this)[i]); + } + return result; + } + +#define DEFINE_BINOP(OP) \ + constexpr auto operator OP(const Vector &other) const \ + requires requires(T lhs, T rhs) { lhs OP rhs; } \ + { \ + using ResultElementT = \ + std::remove_cvref_t() OP std::declval())>; \ + Vector 
result; \ + for (std::size_t i = 0; i < N; ++i) { \ + result[i] = (*this)[i] OP other[i]; \ + } \ + return result; \ + } \ + constexpr auto operator OP(const T &other) const \ + requires requires(T lhs, T rhs) { lhs OP rhs; } \ + { \ + using ResultElementT = \ + std::remove_cvref_t() OP std::declval())>; \ + Vector result; \ + for (std::size_t i = 0; i < N; ++i) { \ + result[i] = (*this)[i] OP other; \ + } \ + return result; \ + } + +#define DEFINE_UNOP(OP) \ + constexpr auto operator OP() const \ + requires requires(T rhs) { OP rhs; } \ + { \ + using ResultElementT = \ + std::remove_cvref_t())>; \ + Vector result; \ + for (std::size_t i = 0; i < N; ++i) { \ + result[i] = OP(*this)[i]; \ + } \ + return result; \ + } + + DEFINE_BINOP(+) + DEFINE_BINOP(-) + DEFINE_BINOP(*) + DEFINE_BINOP(/) + DEFINE_BINOP(%) + DEFINE_BINOP(&) + DEFINE_BINOP(|) + DEFINE_BINOP(^) + DEFINE_BINOP(>>) + DEFINE_BINOP(<<) + DEFINE_BINOP(&&) + DEFINE_BINOP(||) + DEFINE_BINOP(<) + DEFINE_BINOP(>) + DEFINE_BINOP(<=) + DEFINE_BINOP(>=) + DEFINE_BINOP(==) + DEFINE_BINOP(!=) + + DEFINE_UNOP(-) + DEFINE_UNOP(~) + DEFINE_UNOP(!) + +#undef DEFINE_BINOP +#undef DEFINE_UNOP +}; + +using float16_t = _Float16; +using float32_t = float; +using float64_t = double; + +using u8vec2 = Vector; +using u8vec3 = Vector; +using u8vec4 = Vector; +using i8vec2 = Vector; +using i8vec3 = Vector; +using i8vec4 = Vector; + +using u16vec2 = Vector; +using u16vec3 = Vector; +using u16vec4 = Vector; +using i16vec2 = Vector; +using i16vec3 = Vector; +using i16vec4 = Vector; + +using u32vec2 = Vector; +using u32vec3 = Vector; +using u32vec4 = Vector; +using i32vec2 = Vector; +using i32vec3 = Vector; +using i32vec4 = Vector; + +using u64vec2 = Vector; +using u64vec3 = Vector; +using u64vec4 = Vector; +using i64vec2 = Vector; +using i64vec3 = Vector; +using i64vec4 = Vector; + +using f32vec2 = Vector; +using f32vec3 = Vector; +using f32vec4 = Vector; +using f64vec2 = Vector; +using f64vec3 = Vector; +using f64vec4 = Vector; + +using f16vec2 = Vector; +using f16vec3 = Vector; +using f16vec4 = Vector; + +using bvec2 = Vector; +using bvec3 = Vector; +using bvec4 = Vector; +} // namespace shader diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/analyze.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/analyze.hpp new file mode 100644 index 00000000..d028de69 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/analyze.hpp @@ -0,0 +1,445 @@ +#pragma once + +#include "ModuleInfo.hpp" +#include "SemanticInfo.hpp" +#include "dialect/memssa.hpp" +#include "graph.hpp" +#include "ir/Instruction.hpp" +#include "ir/Value.hpp" +#include "rx/FunctionRef.hpp" +#include "rx/TypeId.hpp" +#include +#include +#include +#include + +namespace shader { +struct DomTree; +struct PostDomTree; +class CFG { +public: + class Node { + ir::Value mLabel; + ir::Instruction mTerminator; + std::unordered_set mPredecessors; + std::unordered_set mSuccessors; + + public: + using Iterator = std::unordered_set::iterator; + + Node() = default; + Node(ir::Value label) : mLabel(label) {} + + ir::Value getLabel() { return mLabel; } + + void setTerminator(ir::Instruction inst) { mTerminator = inst; } + bool hasTerminator() { return mTerminator != nullptr; } + ir::Instruction getTerminator() { return mTerminator; } + + void addEdge(Node *to) { + to->mPredecessors.insert(this); + mSuccessors.insert(to); + } + + bool hasPredecessor(Node *node) { return mPredecessors.contains(node); } + bool hasSuccessor(Node *node) { return mSuccessors.contains(node); } + auto &getPredecessors() { return 
mPredecessors; } + auto &getSuccessors() { return mSuccessors; } + std::size_t getPredecessorCount() { return mPredecessors.size(); } + std::size_t getSuccessorCount() { return mSuccessors.size(); } + bool hasPredecessors() { return !mPredecessors.empty(); } + bool hasSuccessors() { return !mSuccessors.empty(); } + + template auto range() { + return ir::range(mLabel, mTerminator.getNext()); + } + + template auto rangeWithoutLabel() { + return ir::range(mLabel.getNext(), + mTerminator ? mTerminator.getNext() : nullptr); + } + + template auto rangeWithoutTerminator() { + return ir::range(mLabel, mTerminator); + } + + template + auto rangeWithoutLabelAndTerminator() { + return ir::range(mLabel.getNext(), mTerminator); + } + }; + +private: + std::map mNodes; + std::vector mPreorderNodes; + std::vector mPostorderNodes; + Node *mEntryNode = nullptr; + +public: + bool empty() { return mNodes.empty(); } + void clear() { + mNodes.clear(); + mPreorderNodes.clear(); + mPostorderNodes.clear(); + mEntryNode = nullptr; + } + + void addPreorderNode(Node *node) { mPreorderNodes.push_back(node); } + void addPostorderNode(Node *node) { mPostorderNodes.push_back(node); } + + Node *getEntryNode() { return mEntryNode; } + ir::Value getEntryLabel() { return getEntryNode()->getLabel(); } + void setEntryNode(Node *node) { mEntryNode = node; } + + std::span getPreorderNodes() { return mPreorderNodes; } + std::span getPostorderNodes() { return mPostorderNodes; } + + Node *getOrCreateNode(ir::Value label) { + return &mNodes.emplace(label, label).first->second; + } + + Node *getNode(ir::Value label) { + if (auto it = mNodes.find(label); it != mNodes.end()) { + return &it->second; + } + + return nullptr; + } + + auto &getSuccessors(ir::Value label) { + return getNode(label)->getSuccessors(); + } + + auto &getPredecessors(ir::Value label) { + return getNode(label)->getPredecessors(); + } + + void print(std::ostream &os, ir::NameStorage &ns, bool subgraph = false, + std::string_view nameSuffix = ""); + std::string genTest(); + + CFG buildView(CFG::Node *from, PostDomTree *domTree = nullptr, + const std::unordered_set &stopLabels = {}, + ir::Value continueLabel = nullptr); + + CFG buildView(ir::Value from, PostDomTree *domTree = nullptr, + const std::unordered_set &stopLabels = {}, + ir::Value continueLabel = nullptr) { + return buildView(getNode(from), domTree, stopLabels, continueLabel); + } +}; + +class MemorySSA { +public: + ir::Context context; + ir::Region region; + std::map variableToVar; + std::map> + userDefs; + + ir::memssa::Var getVar(ir::Value variable, std::span path); + ir::memssa::Var getVar(ir::Value pointer); + + ir::memssa::Def getDef(ir::Instruction user, ir::memssa::Var var) { + auto userIt = userDefs.find(user); + if (userIt == userDefs.end()) { + return {}; + } + + if (auto it = userIt->second.find(var); it != userIt->second.end()) { + return it->second; + } + + return {}; + } + + ir::memssa::Def getDef(ir::Instruction user, ir::Value pointer) { + if (auto var = getVar(pointer)) { + return getDef(user, var); + } + + return {}; + } + + ir::Instruction getDefInst(ir::Instruction user, ir::Value pointer) { + if (auto def = getDef(user, pointer)) { + return def.getLinkedInst(); + } + + return {}; + } + + void print(std::ostream &os, ir::Region irRegion, ir::NameStorage &ns); + void print(std::ostream &os, ir::NameStorage &ns); + void dump(); + +private: + ir::memssa::Var getVarImpl(ir::Value variable); +}; + +bool isWithoutSideEffects(ir::InstructionId id); +bool isTerminator(ir::Instruction inst); 
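+// The free functions below classify instructions and build the control-flow
+// analyses. A typical pipeline (sketch; `region` and `exitLabel` are
+// illustrative names supplied by the caller):
+//   CFG cfg = buildCFG(region.getFirst());
+//   auto dom = buildDomTree(cfg);
+//   auto postDom = buildPostDomTree(cfg, exitLabel);
+//   MemorySSA ssa = buildMemorySSA(cfg);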
+bool isBranch(ir::Instruction inst); +ir::Value unwrapPointer(ir::Value pointer); +graph::DomTree buildDomTree(CFG &cfg, ir::Value root = nullptr); +graph::DomTree buildPostDomTree(CFG &cfg, ir::Value root); + +CFG buildCFG(ir::Instruction firstInstruction, + const std::unordered_set &exitLabels = {}, + ir::Value continueLabel = nullptr); +MemorySSA buildMemorySSA(CFG &cfg, ModuleInfo *moduleInfo = nullptr); + +MemorySSA buildMemorySSA(CFG &cfg, const SemanticInfo &instructionSemantic, + std::function getRegisterVarCb); + +bool dominates(ir::Instruction a, ir::Instruction b, bool isPostDom, + graph::DomTree &domTree); + +ir::Value findNearestCommonDominator(ir::Instruction a, ir::Instruction b, + graph::DomTree &domTree); + +class BackEdgeStorage { + std::unordered_map> backEdges; + +public: + BackEdgeStorage() = default; + BackEdgeStorage(CFG &cfg); + + const std::unordered_set *get(ir::Value value) { + if (auto it = backEdges.find(value); it != backEdges.end()) { + return &it->second; + } + return nullptr; + } + + auto &all() { return backEdges; } +}; + +struct AnalysisStorage { + template + requires(sizeof...(T) > 0) + bool invalidate() { + bool invalidated = false; + ((invalidated = invalidate(rx::TypeId::get()) || invalidated), ...); + return invalidated; + } + + bool invalidate(rx::TypeId id) { + if (auto it = mStorage.find(id); it != mStorage.end()) { + return std::exchange(it->second.invalid, true) == false; + } + + return false; + } + void invalidateAll() { + for (auto &entry : mStorage) { + entry.second.invalid = true; + } + } + + template + T &get(ArgsT &&...args) + requires requires { T(std::forward(args)...); } + { + void *result = getImpl( + rx::TypeId::get(), getDeleter(), + [&] { + return std::make_unique(std::forward(args)...).release(); + }, + [&](void *object) { + *reinterpret_cast(object) = T(std::forward(args)...); + }); + + return *static_cast(result); + } + + template + T &get(BuilderFn &&builder) + requires requires { T(std::forward(builder)()); } + { + void *result = getImpl( + rx::TypeId::get(), getDeleter(), + [&] { + return std::make_unique(std::forward(builder)()) + .release(); + }, + [&](void *object) { + *reinterpret_cast(object) = std::forward(builder)(); + }); + + return *static_cast(result); + } + +private: + template static void (*getDeleter())(void *) { + return +[](void *data) { delete static_cast(data); }; + } + + void *getImpl(rx::TypeId typeId, void (*deleter)(void *), + rx::FunctionRef constructor, + rx::FunctionRef placementConstructor) { + auto [it, inserted] = mStorage.emplace(typeId, getNullPointer()); + + if (inserted) { + it->second.object = + std::unique_ptr(constructor(), deleter); + } else if (it->second.invalid) { + placementConstructor(it->second.object.get()); + it->second.invalid = false; + } + + return it->second.object.get(); + } + static constexpr std::unique_ptr getNullPointer() { + return {nullptr, [](void *) {}}; + } + + struct Entry { + std::unique_ptr object; + bool invalid = false; + }; + + std::map mStorage; +}; + +struct PostDomTree : graph::DomTree { + PostDomTree() = default; + PostDomTree(graph::DomTree &&other) + : graph::DomTree::DomTree(std::move(other)) {} + PostDomTree(CFG &cfg, ir::Value root) + : PostDomTree(buildPostDomTree(cfg, root)) {} +}; + +struct DomTree : graph::DomTree { + DomTree() = default; + DomTree(graph::DomTree &&other) + : graph::DomTree::DomTree(std::move(other)) {} + DomTree(CFG &cfg, ir::Value root = nullptr) + : DomTree(buildDomTree(cfg, root)) {} +}; + +template struct Tag : T { + using T::T; 
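+  // Tag wraps an analysis type so AnalysisStorage can keep two instances of
+  // the same underlying type under distinct keys; Construct uses it to cache
+  // the CFG both with and without the loop continue edge.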
+ using T::operator=; + + Tag(T &&other) : T(std::move(other)) {} + Tag(const T &other) : T(other) {} + + Tag &operator=(T &&other) { + T::operator=(std::move(other)); + return *this; + } + Tag &operator=(const T &other) { + T::operator=(other); + return *this; + } +}; + +struct Construct { + Construct *parent; + std::forward_list children; + ir::Value header; + ir::Value merge; + ir::Value loopBody; + ir::Value loopContinue; + AnalysisStorage analysis; + + static std::unique_ptr createRoot(ir::RegionLike region, + ir::Value merge) { + auto result = std::make_unique(); + auto &cfg = + result->analysis.get([&] { return buildCFG(region.getFirst()); }); + result->header = cfg.getEntryLabel(); + result->merge = merge; + return result; + } + + Construct *createChild(ir::Value header, ir::Value merge) { + auto &result = children.emplace_front(); + result.parent = this; + result.header = header; + result.merge = merge; + return &result; + } + + Construct *createChild(ir::Value header, ir::Value merge, + ir::Value loopContinue, ir::Value loopBody) { + auto &result = children.emplace_front(); + result.parent = this; + result.header = header; + result.merge = merge; + result.loopContinue = loopContinue; + result.loopBody = loopBody; + return &result; + } + + Construct createTemporaryChild(ir::Value header, ir::Value merge) { + Construct result; + result.parent = this; + result.header = header; + result.merge = merge; + return result; + } + + CFG &getCfg() { + return analysis.get([this] { + if (parent != nullptr) { + return parent->getCfg().buildView( + header, + &parent->getPostDomTree(), + {header, merge}); + } + + return buildCFG(header); + }); + } + + CFG &getCfgWithoutContinue() { + if (loopContinue == nullptr) { + return getCfg(); + } + + return analysis.get>([this] { + if (parent != nullptr) { + return parent->getCfg().buildView( + header, + &parent->getPostDomTree(), + {header, merge}, loopContinue); + } + + return buildCFG(header, {}, loopContinue); + }); + } + + DomTree &getDomTree() { return analysis.get(getCfg(), header); } + PostDomTree &getPostDomTree() { + return analysis.get(getCfg(), merge); + } + BackEdgeStorage &getBackEdgeStorage() { + return analysis.get(getCfg()); + } + BackEdgeStorage &getBackEdgeWithoutContinueStorage() { + if (loopContinue == nullptr) { + return getBackEdgeStorage(); + } + return analysis.get>( + getCfgWithoutContinue()); + } + auto getBackEdges(ir::Value node) { return getBackEdgeStorage().get(node); } + auto getBackEdgesWithoutContinue(ir::Value node) { + return getBackEdgeWithoutContinueStorage().get(node); + } + auto getBackEdges() { return getBackEdges(header); } + void invalidate(); + void invalidateAll(); + + bool isNull() const { return header == nullptr; } + + void removeLastChild() { children.pop_front(); } + +private: + enum { + kWithoutContinue, + }; +}; +} // namespace shader diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect.hpp new file mode 100644 index 00000000..4f678812 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "dialect/builtin.hpp" // IWYU pragma: export +#include "dialect/ds.hpp" // IWYU pragma: export +#include "dialect/exp.hpp" // IWYU pragma: export +#include "dialect/memssa.hpp" // IWYU pragma: export +#include "dialect/mimg.hpp" // IWYU pragma: export +#include "dialect/mtbuf.hpp" // IWYU pragma: export +#include "dialect/mubuf.hpp" // IWYU pragma: export +#include "dialect/smrd.hpp" // IWYU 
pragma: export +#include "dialect/sop1.hpp" // IWYU pragma: export +#include "dialect/sop2.hpp" // IWYU pragma: export +#include "dialect/sopc.hpp" // IWYU pragma: export +#include "dialect/sopk.hpp" // IWYU pragma: export +#include "dialect/sopp.hpp" // IWYU pragma: export +#include "dialect/vintrp.hpp" // IWYU pragma: export +#include "dialect/vop1.hpp" // IWYU pragma: export +#include "dialect/vop2.hpp" // IWYU pragma: export +#include "dialect/vop3.hpp" // IWYU pragma: export +#include "dialect/vopc.hpp" // IWYU pragma: export + +#include "dialect/spv.hpp" // IWYU pragma: export + +#include "dialect/amdgpu.hpp" // IWYU pragma: export +#include + +namespace shader::ir { +template <> inline constexpr Kind kOpToKind = Kind::Spv; +template <> inline constexpr Kind kOpToKind = Kind::Builtin; +template <> inline constexpr Kind kOpToKind = Kind::AmdGpu; +template <> inline constexpr Kind kOpToKind = Kind::Vop2; +template <> inline constexpr Kind kOpToKind = Kind::Sop2; +template <> inline constexpr Kind kOpToKind = Kind::Sopk; +template <> inline constexpr Kind kOpToKind = Kind::Smrd; +template <> inline constexpr Kind kOpToKind = Kind::Vop3; +template <> inline constexpr Kind kOpToKind = Kind::Mubuf; +template <> inline constexpr Kind kOpToKind = Kind::Mtbuf; +template <> inline constexpr Kind kOpToKind = Kind::Mimg; +template <> inline constexpr Kind kOpToKind = Kind::Ds; +template <> inline constexpr Kind kOpToKind = Kind::Vintrp; +template <> inline constexpr Kind kOpToKind = Kind::Exp; +template <> inline constexpr Kind kOpToKind = Kind::Vop1; +template <> inline constexpr Kind kOpToKind = Kind::Vopc; +template <> inline constexpr Kind kOpToKind = Kind::Sop1; +template <> inline constexpr Kind kOpToKind = Kind::Sopc; +template <> inline constexpr Kind kOpToKind = Kind::Sopp; +template <> inline constexpr Kind kOpToKind = Kind::MemSSA; + +template + requires(kOpToKind> != Kind::Count) +constexpr InstructionId getInstructionId(T op) { + return getInstructionId(kOpToKind>, op); +} + +constexpr bool operator==(ir::Instruction lhs, InstructionId rhs) { + return lhs && lhs.getInstId() == rhs; +} + +template +constexpr bool operator==(L lhs, R rhs) + requires requires { + requires(!std::is_same_v); + { getInstructionId(lhs) == rhs } -> std::convertible_to; + } +{ + return getInstructionId(lhs) == rhs; +} + +template +constexpr bool operator==(L lhs, R rhs) + requires requires { + requires(!std::is_same_v); + { getTypeId(lhs) == rhs } -> std::convertible_to; + } +{ + return getTypeId(lhs) == rhs; +} +} // namespace ir diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/amdgpu.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/amdgpu.hpp new file mode 100644 index 00000000..468be5ca --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/amdgpu.hpp @@ -0,0 +1,57 @@ +#pragma once + +namespace shader::ir::amdgpu { + +enum Op { + EXEC_TEST, + BRANCH, + IMM, + USER_SGPR, + VBUFFER, + SAMPLER, + TBUFFER, + POINTER, + OMOD, + NEG_ABS, + PS_INPUT_VGPR, + PS_COMP_SWAP, + VS_GET_INDEX, + RESOURCE_PHI, + + OpCount, +}; + +inline const char *getInstructionName(unsigned op) { + switch (op) { + case EXEC_TEST: + return "exec_test"; + case BRANCH: + return "branch"; + case IMM: + return "imm"; + case USER_SGPR: + return "user_sgpr"; + case VBUFFER: + return "vbuffer"; + case SAMPLER: + return "sampler"; + case TBUFFER: + return "tbuffer"; + case POINTER: + return "pointer"; + case OMOD: + return "omod"; + case NEG_ABS: + return "neg_abs"; + case PS_INPUT_VGPR: + return 
"ps_input_vgpr"; + case PS_COMP_SWAP: + return "ps_comp_swap"; + case VS_GET_INDEX: + return "vs_get_index"; + case RESOURCE_PHI: + return "resource_phi"; + } + return nullptr; +} +} // namespace shader::ir::amdgpu diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/builtin.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/builtin.hpp new file mode 100644 index 00000000..ac965896 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/builtin.hpp @@ -0,0 +1,193 @@ +#pragma once +#include "../ir/Block.hpp" +#include "../ir/Builder.hpp" +#include "../ir/Value.hpp" + +namespace shader::ir { +template inline constexpr Kind kOpToKind = Kind::Count; +} + +namespace shader::ir::builtin { +enum Op { + INVALID_INSTRUCTION, + BLOCK, + IF_ELSE, + LOOP, +}; + +inline const char *getInstructionName(unsigned id) { + switch (id) { + case INVALID_INSTRUCTION: + return ""; + + case BLOCK: + return "block"; + + case IF_ELSE: + return "ifElse"; + + case LOOP: + return "loop"; + } + return nullptr; +} + +template +struct Builder : BuilderFacade, ImplT> { + /** + * Creates an invalid instruction with the given location. + * + * @param location the location of the instruction + * + * @return the created invalid instruction + */ + Instruction createInvalidInstruction(Location location) { + return this->template create(location, Kind::Builtin, + INVALID_INSTRUCTION); + } + + Instruction createIfElse(Location location, Value cond, Block ifTrue, + Block ifFalse = {}) { + std::vector operands = {{cond, ifTrue}}; + if (ifFalse) { + operands.push_back(ifFalse); + } + return this->template create(location, Kind::Builtin, IF_ELSE, + operands); + } + + Instruction createLoop(Location location, Block body) { + return this->template create(location, Kind::Builtin, IF_ELSE, + {{body}}); + } + + auto createBlock(Location location) { + return this->template create(location); + } + + auto createRegion(Location location) { + return this->getContext().template create(location); + } + + /** + * Creates an instruction with the given location, kind, op, and operands. + * + * @param location the location of the instruction + * @param kind the kind of the instruction + * @param op the opcode of the instruction + * @param operands the operands of the instruction + * + * @return the created instruction + */ + Instruction createInstruction(Location location, Kind kind, unsigned op, + std::span operands = {}) { + return this->template create(location, kind, op, operands); + } + + template + Instruction createInstruction(Location location, OpT &&op, + std::span operands = {}) + requires requires { + this->template create( + location, kOpToKind>, op, operands); + } + { + return this->template create( + location, kOpToKind>, op, operands); + } + + /** + * Creates an Instruction object with the given location, kind, opcode, and + * operands. 
diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/ds.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/ds.hpp
new file mode 100644
index 00000000..0c166c10
--- /dev/null
+++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/ds.hpp
@@ -0,0 +1,294 @@
+#pragma once
+
+namespace shader::ir::ds {
+enum Op {
+  ADD_U32,
+  SUB_U32,
+  RSUB_U32,
+  INC_U32,
+  DEC_U32,
+  MIN_I32,
+  MAX_I32,
+  MIN_U32,
+  MAX_U32,
+  AND_B32,
+  OR_B32,
+  XOR_B32,
+  MSKOR_B32,
+  WRITE_B32,
+  WRITE2_B32,
+  WRITE2ST64_B32,
+  CMPST_B32,
+  CMPST_F32,
+  MIN_F32,
+  MAX_F32,
+  NOP,
+  GWS_SEMA_RELEASE_ALL = 24,
+  GWS_INIT,
+  GWS_SEMA_V,
+  GWS_SEMA_BR,
+  GWS_SEMA_P,
+  GWS_BARRIER,
+  WRITE_B8,
+  WRITE_B16,
+  ADD_RTN_U32,
+  SUB_RTN_U32,
+  RSUB_RTN_U32,
+  INC_RTN_U32,
+  DEC_RTN_U32,
+  MIN_RTN_I32,
+  MAX_RTN_I32,
+  MIN_RTN_U32,
+  MAX_RTN_U32,
+  AND_RTN_B32,
+  OR_RTN_B32,
+  XOR_RTN_B32,
+  MSKOR_RTN_B32,
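+  // Enumerator values track the GCN DS (LDS/GDS) opcode encoding: explicit
+  // initializers such as GWS_SEMA_RELEASE_ALL = 24 skip encodings the ISA
+  // leaves unassigned.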
WRXCHG_RTN_B32, + WRXCHG2_RTN_B32, + WRXCHG2ST64_RTN_B32, + CMPST_RTN_B32, + CMPST_RTN_F32, + MIN_RTN_F32, + MAX_RTN_F32, + WRAP_RTN_B32, + SWIZZLE_B32, + READ_B32, + READ2_B32, + READ2ST64_B32, + READ_I8, + READ_U8, + READ_I16, + READ_U16, + CONSUME, + APPEND, + ORDERED_COUNT, + ADD_U64, + SUB_U64, + RSUB_U64, + INC_U64, + DEC_U64, + MIN_I64, + MAX_I64, + MIN_U64, + MAX_U64, + AND_B64, + OR_B64, + XOR_B64, + MSKOR_B64, + WRITE_B64, + WRITE2_B64, + WRITE2ST64_B64, + CMPST_B64, + CMPST_F64, + MIN_F64, + MAX_F64, + ADD_RTN_U64 = 96, + SUB_RTN_U64, + RSUB_RTN_U64, + INC_RTN_U64, + DEC_RTN_U64, + MIN_RTN_I64, + MAX_RTN_I64, + MIN_RTN_U64, + MAX_RTN_U64, + AND_RTN_B64, + OR_RTN_B64, + XOR_RTN_B64, + MSKOR_RTN_B64, + WRXCHG_RTN_B64, + WRXCHG2_RTN_B64, + WRXCHG2ST64_RTN_B64, + CMPST_RTN_B64, + CMPST_RTN_F64, + MIN_RTN_F64, + MAX_RTN_F64, + READ_B64 = 118, + READ2_B64, + READ2ST64_B64, + CONDXCHG32_RTN_B64 = 126, + ADD_SRC2_U32 = 128, + SUB_SRC2_U32, + RSUB_SRC2_U32, + INC_SRC2_U32, + DEC_SRC2_U32, + MIN_SRC2_I32, + MAX_SRC2_I32, + MIN_SRC2_U32, + MAX_SRC2_U32, + AND_SRC2_B32, + OR_SRC2_B32, + XOR_SRC2_B32, + WRITE_SRC2_B32, + MIN_SRC2_F32 = 146, + MAX_SRC2_F32, + ADD_SRC2_U64 = 192, + SUB_SRC2_U64, + RSUB_SRC2_U64, + INC_SRC2_U64, + DEC_SRC2_U64, + MIN_SRC2_I64, + MAX_SRC2_I64, + MIN_SRC2_U64, + MAX_SRC2_U64, + AND_SRC2_B64, + OR_SRC2_B64, + XOR_SRC2_B64, + WRITE_SRC2_B64, + MIN_SRC2_F64 = 210, + MAX_SRC2_F64, + WRITE_B96 = 222, + WRITE_B128, + CONDXCHG32_RTN_B128 = 253, + READ_B96, + READ_B128, + + OpCount +}; + +inline const char *getInstructionName(unsigned id) { + switch (id) { + case ADD_U32: return "ds_add_u32"; + case SUB_U32: return "ds_sub_u32"; + case RSUB_U32: return "ds_rsub_u32"; + case INC_U32: return "ds_inc_u32"; + case DEC_U32: return "ds_dec_u32"; + case MIN_I32: return "ds_min_i32"; + case MAX_I32: return "ds_max_i32"; + case MIN_U32: return "ds_min_u32"; + case MAX_U32: return "ds_max_u32"; + case AND_B32: return "ds_and_b32"; + case OR_B32: return "ds_or_b32"; + case XOR_B32: return "ds_xor_b32"; + case MSKOR_B32: return "ds_mskor_b32"; + case WRITE_B32: return "ds_write_b32"; + case WRITE2_B32: return "ds_write2_b32"; + case WRITE2ST64_B32: return "ds_write2st64_b32"; + case CMPST_B32: return "ds_cmpst_b32"; + case CMPST_F32: return "ds_cmpst_f32"; + case MIN_F32: return "ds_min_f32"; + case MAX_F32: return "ds_max_f32"; + case NOP: return "ds_nop"; + case GWS_SEMA_RELEASE_ALL: return "ds_gws_sema_release_all"; + case GWS_INIT: return "ds_gws_init"; + case GWS_SEMA_V: return "ds_gws_sema_v"; + case GWS_SEMA_BR: return "ds_gws_sema_br"; + case GWS_SEMA_P: return "ds_gws_sema_p"; + case GWS_BARRIER: return "ds_gws_barrier"; + case WRITE_B8: return "ds_write_b8"; + case WRITE_B16: return "ds_write_b16"; + case ADD_RTN_U32: return "ds_add_rtn_u32"; + case SUB_RTN_U32: return "ds_sub_rtn_u32"; + case RSUB_RTN_U32: return "ds_rsub_rtn_u32"; + case INC_RTN_U32: return "ds_inc_rtn_u32"; + case DEC_RTN_U32: return "ds_dec_rtn_u32"; + case MIN_RTN_I32: return "ds_min_rtn_i32"; + case MAX_RTN_I32: return "ds_max_rtn_i32"; + case MIN_RTN_U32: return "ds_min_rtn_u32"; + case MAX_RTN_U32: return "ds_max_rtn_u32"; + case AND_RTN_B32: return "ds_and_rtn_b32"; + case OR_RTN_B32: return "ds_or_rtn_b32"; + case XOR_RTN_B32: return "ds_xor_rtn_b32"; + case MSKOR_RTN_B32: return "ds_mskor_rtn_b32"; + case WRXCHG_RTN_B32: return "ds_wrxchg_rtn_b32"; + case WRXCHG2_RTN_B32: return "ds_wrxchg2_rtn_b32"; + case WRXCHG2ST64_RTN_B32: return "ds_wrxchg2st64_rtn_b32"; + case CMPST_RTN_B32: return 
"ds_cmpst_rtn_b32"; + case CMPST_RTN_F32: return "ds_cmpst_rtn_f32"; + case MIN_RTN_F32: return "ds_min_rtn_f32"; + case MAX_RTN_F32: return "ds_max_rtn_f32"; + case WRAP_RTN_B32: return "ds_wrap_rtn_b32"; + case SWIZZLE_B32: return "ds_swizzle_b32"; + case READ_B32: return "ds_read_b32"; + case READ2_B32: return "ds_read2_b32"; + case READ2ST64_B32: return "ds_read2st64_b32"; + case READ_I8: return "ds_read_i8"; + case READ_U8: return "ds_read_u8"; + case READ_I16: return "ds_read_i16"; + case READ_U16: return "ds_read_u16"; + case CONSUME: return "ds_consume"; + case APPEND: return "ds_append"; + case ORDERED_COUNT: return "ds_ordered_count"; + case ADD_U64: return "ds_add_u64"; + case SUB_U64: return "ds_sub_u64"; + case RSUB_U64: return "ds_rsub_u64"; + case INC_U64: return "ds_inc_u64"; + case DEC_U64: return "ds_dec_u64"; + case MIN_I64: return "ds_min_i64"; + case MAX_I64: return "ds_max_i64"; + case MIN_U64: return "ds_min_u64"; + case MAX_U64: return "ds_max_u64"; + case AND_B64: return "ds_and_b64"; + case OR_B64: return "ds_or_b64"; + case XOR_B64: return "ds_xor_b64"; + case MSKOR_B64: return "ds_mskor_b64"; + case WRITE_B64: return "ds_write_b64"; + case WRITE2_B64: return "ds_write2_b64"; + case WRITE2ST64_B64: return "ds_write2st64_b64"; + case CMPST_B64: return "ds_cmpst_b64"; + case CMPST_F64: return "ds_cmpst_f64"; + case MIN_F64: return "ds_min_f64"; + case MAX_F64: return "ds_max_f64"; + case ADD_RTN_U64: return "ds_add_rtn_u64"; + case SUB_RTN_U64: return "ds_sub_rtn_u64"; + case RSUB_RTN_U64: return "ds_rsub_rtn_u64"; + case INC_RTN_U64: return "ds_inc_rtn_u64"; + case DEC_RTN_U64: return "ds_dec_rtn_u64"; + case MIN_RTN_I64: return "ds_min_rtn_i64"; + case MAX_RTN_I64: return "ds_max_rtn_i64"; + case MIN_RTN_U64: return "ds_min_rtn_u64"; + case MAX_RTN_U64: return "ds_max_rtn_u64"; + case AND_RTN_B64: return "ds_and_rtn_b64"; + case OR_RTN_B64: return "ds_or_rtn_b64"; + case XOR_RTN_B64: return "ds_xor_rtn_b64"; + case MSKOR_RTN_B64: return "ds_mskor_rtn_b64"; + case WRXCHG_RTN_B64: return "ds_wrxchg_rtn_b64"; + case WRXCHG2_RTN_B64: return "ds_wrxchg2_rtn_b64"; + case WRXCHG2ST64_RTN_B64: return "ds_wrxchg2st64_rtn_b64"; + case CMPST_RTN_B64: return "ds_cmpst_rtn_b64"; + case CMPST_RTN_F64: return "ds_cmpst_rtn_f64"; + case MIN_RTN_F64: return "ds_min_rtn_f64"; + case MAX_RTN_F64: return "ds_max_rtn_f64"; + case READ_B64: return "ds_read_b64"; + case READ2_B64: return "ds_read2_b64"; + case READ2ST64_B64: return "ds_read2st64_b64"; + case CONDXCHG32_RTN_B64: return "ds_condxchg32_rtn_b64"; + case ADD_SRC2_U32: return "ds_add_src2_u32"; + case SUB_SRC2_U32: return "ds_sub_src2_u32"; + case RSUB_SRC2_U32: return "ds_rsub_src2_u32"; + case INC_SRC2_U32: return "ds_inc_src2_u32"; + case DEC_SRC2_U32: return "ds_dec_src2_u32"; + case MIN_SRC2_I32: return "ds_min_src2_i32"; + case MAX_SRC2_I32: return "ds_max_src2_i32"; + case MIN_SRC2_U32: return "ds_min_src2_u32"; + case MAX_SRC2_U32: return "ds_max_src2_u32"; + case AND_SRC2_B32: return "ds_and_src2_b32"; + case OR_SRC2_B32: return "ds_or_src2_b32"; + case XOR_SRC2_B32: return "ds_xor_src2_b32"; + case WRITE_SRC2_B32: return "ds_write_src2_b32"; + case MIN_SRC2_F32: return "ds_min_src2_f32"; + case MAX_SRC2_F32: return "ds_max_src2_f32"; + case ADD_SRC2_U64: return "ds_add_src2_u64"; + case SUB_SRC2_U64: return "ds_sub_src2_u64"; + case RSUB_SRC2_U64: return "ds_rsub_src2_u64"; + case INC_SRC2_U64: return "ds_inc_src2_u64"; + case DEC_SRC2_U64: return "ds_dec_src2_u64"; + case MIN_SRC2_I64: return "ds_min_src2_i64"; + 
+  case MAX_SRC2_I64: return "ds_max_src2_i64";
+  case MIN_SRC2_U64: return "ds_min_src2_u64";
+  case MAX_SRC2_U64: return "ds_max_src2_u64";
+  case AND_SRC2_B64: return "ds_and_src2_b64";
+  case OR_SRC2_B64: return "ds_or_src2_b64";
+  case XOR_SRC2_B64: return "ds_xor_src2_b64";
+  case WRITE_SRC2_B64: return "ds_write_src2_b64";
+  case MIN_SRC2_F64: return "ds_min_src2_f64";
+  case MAX_SRC2_F64: return "ds_max_src2_f64";
+  case WRITE_B96: return "ds_write_b96";
+  case WRITE_B128: return "ds_write_b128";
+  case CONDXCHG32_RTN_B128: return "ds_condxchg32_rtn_b128";
+  case READ_B96: return "ds_read_b96";
+  case READ_B128: return "ds_read_b128";
+  }
+  return nullptr;
+}
+} // namespace shader::ir::ds
diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/exp.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/exp.hpp
new file mode 100644
index 00000000..f704031a
--- /dev/null
+++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/exp.hpp
@@ -0,0 +1,11 @@
+#pragma once
+
+namespace shader::ir::exp {
+enum Op {
+  EXP = 0,
+
+  OpCount
+};
+
+inline const char *getInstructionName(unsigned) { return "exp"; }
+} // namespace shader::ir::exp
diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/memssa.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/memssa.hpp
new file mode 100644
index 00000000..293d9c0a
--- /dev/null
+++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/memssa.hpp
@@ -0,0 +1,423 @@
+#pragma once
+
+#include "../ir/Block.hpp"
+#include "../ir/Builder.hpp"
+#include "../ir/Value.hpp"
+#include "../ir/ValueImpl.hpp"
+
+namespace shader::ir::memssa {
+enum Op {
+  OpVar,
+  OpDef,
+  OpPhi,
+  OpUse,
+  OpBarrier,
+  OpJump,
+  OpExit,
+
+  OpCount,
+};
+
+template <typename BaseT> struct BaseImpl : BaseT {
+  Instruction link;
+
+  using BaseT::BaseT;
+  using BaseT::operator=;
+
+  void print(std::ostream &os, NameStorage &ns) const override {
+    BaseT::print(os, ns);
+
+    if (link) {
+      os << " : ";
+      link.print(os, ns);
+    }
+  }
+};
+
+template <typename ImplT, template <typename> typename BaseT>
+struct BaseWrapper : BaseT<ImplT> {
+  using BaseT<ImplT>::BaseT;
+  using BaseT<ImplT>::operator=;
+
+  Instruction getLinkedInst() const { return this->impl->link; }
+};
+
+struct DefImpl : BaseImpl<ValueImpl> {
+  using BaseImpl::BaseImpl;
+  using BaseImpl::operator=;
+
+  Node clone(Context &context, CloneMap &map) const override;
+};
+struct UseImpl : BaseImpl<ValueImpl> {
+  using BaseImpl::BaseImpl;
+  using BaseImpl::operator=;
+
+  Node clone(Context &context, CloneMap &map) const override;
+};
+struct VarImpl : BaseImpl<ValueImpl> {
+  using BaseImpl::BaseImpl;
+  using BaseImpl::operator=;
+
+  Node clone(Context &context, CloneMap &map) const override;
+};
+struct PhiImpl : DefImpl {
+  using DefImpl::DefImpl;
+  using DefImpl::operator=;
+
+  Node clone(Context &context, CloneMap &map) const override;
+};
+
+using Use = BaseWrapper<UseImpl, ValueWrapper>;
+using Var = BaseWrapper<VarImpl, ValueWrapper>;
+
+template <typename ImplT>
+struct DefWrapper : BaseWrapper<ImplT, ValueWrapper> {
+  using BaseWrapper<ImplT, ValueWrapper>::BaseWrapper;
+  using BaseWrapper<ImplT, ValueWrapper>::operator=;
+
+  void addVariable(Var variable) {
+    this->addOperand(variable);
+
+    std::vector<Var> workList;
+
+    for (auto &comp : variable.getOperands()) {
+      auto compVar = comp.getAsValue().staticCast<Var>();
+      this->addOperand(compVar);
+
+      if (compVar.getOperandCount() > 1) {
+        workList.push_back(compVar);
+      } else if (compVar.getOperandCount() == 1) {
+        this->addOperand(compVar.getOperand(0).getAsValue().staticCast<Var>());
+      }
+    }
+
+    while (!workList.empty()) {
+      auto var = workList.back();
+      workList.pop_back();
+
+      for (auto &comp : var.getOperands()) {
+        auto compVar = comp.getAsValue().staticCast<Var>();
+        this->addOperand(compVar);
+
+        if (compVar.getOperandCount() > 1) {
+          workList.push_back(compVar);
+        } else if (compVar.getOperandCount() == 1) {
+          this->addOperand(
+              compVar.getOperand(0).getAsValue().staticCast<Var>());
+        }
+      }
+    }
+  }
+
+  Var getRootVar() {
+    return this->getOperand(0).getAsValue().template staticCast<Var>();
+  }
+
+  Var getVar(std::size_t index) {
+    return this->getOperand(index).getAsValue().template staticCast<Var>();
+  }
+};
+
+struct ScopeImpl : BaseImpl<BlockImpl> {
+  using BaseImpl::BaseImpl;
+  using BaseImpl::operator=;
+
+  Node clone(Context &context, CloneMap &map) const override;
+};
+
+template <typename ImplT> struct ScopeWrapper;
+
+using Scope = ScopeWrapper<ScopeImpl>;
+using Def = DefWrapper<DefImpl>;
+
+template <typename ImplT> struct BarrierWrapper : DefWrapper<ImplT> {
+  using DefWrapper<ImplT>::DefWrapper;
+  using DefWrapper<ImplT>::operator=;
+};
+
+using Barrier = BarrierWrapper<DefImpl>;
+
+template <typename ImplT>
+struct ScopeWrapper : BaseWrapper<ImplT, BlockWrapper> {
+  using BaseWrapper<ImplT, BlockWrapper>::BaseWrapper;
+  using BaseWrapper<ImplT, BlockWrapper>::operator=;
+
+  Scope getSingleSuccessor() {
+    if (this->empty()) {
+      return {};
+    }
+    auto terminator = this->getLast();
+    if (terminator.getKind() != Kind::MemSSA || terminator.getOp() != OpJump) {
+      return {};
+    }
+    if (terminator.getOperandCount() != 1) {
+      return {};
+    }
+
+    return terminator.getOperand(0).getAsValue().template cast<Scope>();
+  }
+
+  std::vector<Scope> getSuccessors() {
+    if (this->empty()) {
+      return {};
+    }
+    auto terminator = this->getLast();
+    if (terminator.getKind() != Kind::MemSSA || terminator.getOp() != OpJump) {
+      return {};
+    }
+
+    std::vector<Scope> result;
+    result.reserve(terminator.getOperandCount());
+    for (auto &successor : terminator.getOperands()) {
+      if (auto block = successor.getAsValue().template cast<Scope>()) {
+        result.push_back(block);
+      }
+    }
+    return result;
+  }
+
+  auto getPredecessors() {
+    std::set<Scope> predecessors;
+    for (auto &use : this->getUseList()) {
+      if (use.user != OpJump) {
+        continue;
+      }
+
+      if (auto userParent = use.user.getParent().template cast<Scope>()) {
+        predecessors.insert(userParent);
+      }
+    }
+    return predecessors;
+  }
+
+  auto getSinglePredecessor() {
+    Scope predecessor;
+
+    for (auto &use : this->getUseList()) {
+      if (use.user != OpJump) {
+        continue;
+      }
+
+      if (auto userParent = use.user.getParent().template cast<Scope>()) {
+        if (predecessor == nullptr) {
+          predecessor = userParent;
+        } else if (predecessor != userParent) {
+          return Scope(nullptr);
+        }
+      }
+    }
+
+    return predecessor;
+  }
+
+  Def findVarDef(Var var, Instruction point = nullptr) {
+    if (point == nullptr) {
+      point = this->getLast();
+    }
+
+    std::optional<std::set<Var>> compList;
+
+    auto buildMatchList = [&] {
+      std::set<Var> result;
+      std::vector<Var> workList;
+
+      for (auto comp : var.getOperands()) {
+        auto compVar = comp.getAsValue().staticCast<Var>();
+        result.insert(compVar);
+
+        if (compVar.getOperandCount() > 1) {
+          workList.push_back(compVar);
+        } else if (compVar.getOperandCount() == 1) {
+          result.insert(compVar.getOperand(0).getAsValue().staticCast<Var>());
+        }
+      }
+
+      while (!workList.empty()) {
+        auto var = workList.back();
+        workList.pop_back();
+
+        for (auto comp : var.getOperands()) {
+          auto compVar = comp.getAsValue().staticCast<Var>();
+          result.insert(compVar);
+
+          if (compVar.getOperandCount() > 1) {
+            workList.push_back(compVar);
+          } else if (compVar.getOperandCount() == 1) {
+            result.insert(
+                compVar.getOperand(0).getAsValue().staticCast<Var>());
+          }
+        }
+      }
+
+      return result;
+    };
+
+    for (auto child : revRange(point)) {
+      if (child.getKind() != Kind::MemSSA) {
+        continue;
+      }
+
+      if (child.getOp() == OpDef || child.getOp() == OpPhi) {
+        if (child.getOperand(0) == var) {
+          return child.template staticCast<Def>();
+        }
+
+        if (!compList) {
+          compList = buildMatchList();
+        }
+
+        if (compList->empty()) {
+          continue;
+        }
+
+        if (compList->contains(
+                child.getOperand(0).getAsValue().staticCast<Var>())) {
+          return child.template staticCast<Def>();
+        }
+      }
+
+      if (child.getOp() == OpBarrier) {
+        // barrier is definition for everything
+        return child.template staticCast<Def>();
+      }
+    }
+
+    return {};
+  }
+};
+
+template <typename ImplT> struct PhiWrapper : ValueWrapper<ImplT> {
+  using ValueWrapper<ImplT>::ValueWrapper;
+  using ValueWrapper<ImplT>::operator=;
+
+  void addValue(Scope scope, Def def) {
+    this->addOperand(scope);
+    this->addOperand(def);
+  }
+
+  // Set value for specified block or add new node
+  // Returns true if node was added
+  bool setValue(Scope pred, Def def) {
+    for (std::size_t i = 1, end = this->getOperandCount(); i < end; i += 2) {
+      if (pred == this->getOperand(i).getAsValue()) {
+        this->replaceOperand(i + 1, def);
+        return false;
+      }
+    }
+
+    addValue(pred, def);
+    return true;
+  }
+
+  Def getDef(Scope pred) {
+    for (std::size_t i = 1, end = this->getOperandCount(); i < end; i += 2) {
+      if (pred == this->getOperand(i).getAsValue()) {
+        return this->getOperand(i + 1).getAsValue().template staticCast<Def>();
+      }
+    }
+
+    return {};
+  }
+
+  bool empty() { return this->getOperandCount() < 2; }
+
+  Def getUniqDef() {
+    if (empty()) {
+      return {};
+    }
+
+    Def result = this->getOperand(2).getAsValue().template staticCast<Def>();
+
+    for (std::size_t i = 4, end = this->getOperandCount(); i < end; i += 2) {
+      if (this->getOperand(i) != result) {
+        return {};
+      }
+    }
+
+    return result;
+  }
+
+  Var getVar() {
+    return this->getOperand(0).getAsValue().template staticCast<Var>();
+  }
+};
+
+using Phi = PhiWrapper<PhiImpl>;
+
+template <typename ImplT>
+struct Builder : BuilderFacade<Builder<ImplT>, ImplT> {
+  Def createDef(Instruction defInst, Var var) {
+    auto result =
+        this->template create<Def>(defInst.getLocation(), Kind::MemSSA, OpDef);
+    result.impl->link = defInst;
+    result.addOperand(var);
+    return result;
+  }
+
+  Scope createScope(ir::Instruction labelInst) {
+    Scope result = this->template create<Scope>(labelInst.getLocation());
+    result.impl->link = labelInst;
+    return result;
+  }
+
+  Phi createPhi(Var var) {
+    auto result =
+        this->template create<Phi>(var.getLocation(), Kind::MemSSA, OpPhi);
+    result.addOperand(var);
+    return result;
+  }
+
+  Use createUse(ir::Instruction useInst) {
+    Use result =
+        this->template create<Use>(useInst.getLocation(), Kind::MemSSA, OpUse);
+    result.impl->link = useInst;
+    return result;
+  }
+
+  Use createUse(ir::Instruction useInst, Def def) {
+    auto result = createUse(useInst);
+    result.addOperand(def);
+    return result;
+  }
+
+  Var createVar(ir::Instruction varInst) {
+    Var result =
+        this->template create<Var>(varInst.getLocation(), Kind::MemSSA, OpVar);
+    result.impl->link = varInst;
+    return result;
+  }
+
+  Barrier createBarrier(ir::Instruction barrierInst) {
+    Barrier result = this->template create<Barrier>(barrierInst.getLocation(),
+                                                    Kind::MemSSA, OpBarrier);
+    result.impl->link = barrierInst;
+    return result;
+  }
+
+  Instruction createJump(Location loc) {
+    return this->template create<Instruction>(loc, Kind::MemSSA, OpJump);
+  }
+
+  Instruction createExit(Location loc) {
+    return this->template create<Instruction>(loc, Kind::MemSSA, OpExit);
+  }
+};
+
+inline const char *getInstructionName(unsigned op) {
+  switch (op) {
+  case OpVar:
+    return "var";
+  case OpDef:
+    return "def";
+  case OpPhi:
+    return "phi";
+  case OpUse:
+    return "use";
+  case OpBarrier:
+    return "barrier";
+  case OpJump:
+    return "jump";
+  case OpExit:
+    return "exit";
+  }
+  return nullptr;
+}
+} // namespace shader::ir::memssa
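The memory-SSA builder links every SSA node back to the originating IR instruction through `impl->link`, so a construction pass walks the IR once and mirrors it here. A hedged sketch; the `builder`, `store`, `load`, and `addr` values are assumed inputs, not defined in this diff:

    // Models store -> load through one variable: a def, then a use of it.
    template <typename Builder>
    void buildMemSsaSketch(Builder &builder, shader::ir::Instruction store,
                           shader::ir::Instruction load,
                           shader::ir::Instruction addr) {
      auto var = builder.createVar(addr);       // the memory location
      auto def = builder.createDef(store, var); // the store defines it
      builder.createUse(load, def);             // the load reads that def
    }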
diff --git
a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mimg.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mimg.hpp new file mode 100644 index 00000000..811a2656 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mimg.hpp @@ -0,0 +1,199 @@ +#pragma once + +namespace shader::ir::mimg { +enum Op { + LOAD, + LOAD_MIP, + LOAD_PCK, + LOAD_PCK_SGN, + LOAD_MIP_PCK, + LOAD_MIP_PCK_SGN, + STORE = 8, + STORE_MIP, + STORE_PCK, + STORE_MIP_PCK, + GET_RESINFO = 14, + ATOMIC_SWAP, + ATOMIC_CMPSWAP, + ATOMIC_ADD, + ATOMIC_SUB, + ATOMIC_RSUB, + ATOMIC_SMIN, + ATOMIC_UMIN, + ATOMIC_SMAX, + ATOMIC_UMAX, + ATOMIC_AND, + ATOMIC_OR, + ATOMIC_XOR, + ATOMIC_INC, + ATOMIC_DEC, + ATOMIC_FCMPSWAP, + ATOMIC_FMIN, + ATOMIC_FMAX, + SAMPLE, + SAMPLE_CL, + SAMPLE_D, + SAMPLE_D_CL, + SAMPLE_L, + SAMPLE_B, + SAMPLE_B_CL, + SAMPLE_LZ, + SAMPLE_C, + SAMPLE_C_CL, + SAMPLE_C_D, + SAMPLE_C_D_CL, + SAMPLE_C_L, + SAMPLE_C_B, + SAMPLE_C_B_CL, + SAMPLE_C_LZ, + SAMPLE_O, + SAMPLE_CL_O, + SAMPLE_D_O, + SAMPLE_D_CL_O, + SAMPLE_L_O, + SAMPLE_B_O, + SAMPLE_B_CL_O, + SAMPLE_LZ_O, + SAMPLE_C_O, + SAMPLE_C_CL_O, + SAMPLE_C_D_O, + SAMPLE_C_D_CL_O, + SAMPLE_C_L_O, + SAMPLE_C_B_O, + SAMPLE_C_B_CL_O, + SAMPLE_C_LZ_O, + GATHER4, + GATHER4_CL, + GATHER4_L = 68, + GATHER4_B, + GATHER4_B_CL, + GATHER4_LZ, + GATHER4_C, + GATHER4_C_CL, + GATHER4_C_L = 76, + GATHER4_C_B, + GATHER4_C_B_CL, + GATHER4_C_LZ, + GATHER4_O, + GATHER4_CL_O, + GATHER4_L_O = 84, + GATHER4_B_O, + GATHER4_B_CL_O, + GATHER4_LZ_O, + GATHER4_C_O, + GATHER4_C_CL_O, + GATHER4_C_L_O = 92, + GATHER4_C_B_O, + GATHER4_C_B_CL_O, + GATHER4_C_LZ_O, + GET_LOD, + SAMPLE_CD = 104, + SAMPLE_CD_CL, + SAMPLE_C_CD, + SAMPLE_C_CD_CL, + SAMPLE_CD_O, + SAMPLE_CD_CL_O, + SAMPLE_C_CD_O, + SAMPLE_C_CD_CL_O, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case LOAD: return "image_load"; + case LOAD_MIP: return "image_load_mip"; + case LOAD_PCK: return "image_load_pck"; + case LOAD_PCK_SGN: return "image_load_pck_sgn"; + case LOAD_MIP_PCK: return "image_load_mip_pck"; + case LOAD_MIP_PCK_SGN: return "image_load_mip_pck_sgn"; + case STORE: return "image_store"; + case STORE_MIP: return "image_store_mip"; + case STORE_PCK: return "image_store_pck"; + case STORE_MIP_PCK: return "image_store_mip_pck"; + case GET_RESINFO: return "image_get_resinfo"; + case ATOMIC_SWAP: return "image_atomic_swap"; + case ATOMIC_CMPSWAP: return "image_atomic_cmpswap"; + case ATOMIC_ADD: return "image_atomic_add"; + case ATOMIC_SUB: return "image_atomic_sub"; + case ATOMIC_RSUB: return "image_atomic_rsub"; + case ATOMIC_SMIN: return "image_atomic_smin"; + case ATOMIC_UMIN: return "image_atomic_umin"; + case ATOMIC_SMAX: return "image_atomic_smax"; + case ATOMIC_UMAX: return "image_atomic_umax"; + case ATOMIC_AND: return "image_atomic_and"; + case ATOMIC_OR: return "image_atomic_or"; + case ATOMIC_XOR: return "image_atomic_xor"; + case ATOMIC_INC: return "image_atomic_inc"; + case ATOMIC_DEC: return "image_atomic_dec"; + case ATOMIC_FCMPSWAP: return "image_atomic_fcmpswap"; + case ATOMIC_FMIN: return "image_atomic_fmin"; + case ATOMIC_FMAX: return "image_atomic_fmax"; + case SAMPLE: return "image_sample"; + case SAMPLE_CL: return "image_sample_cl"; + case SAMPLE_D: return "image_sample_d"; + case SAMPLE_D_CL: return "image_sample_d_cl"; + case SAMPLE_L: return "image_sample_l"; + case SAMPLE_B: return "image_sample_b"; + case SAMPLE_B_CL: return "image_sample_b_cl"; + case SAMPLE_LZ: return "image_sample_lz"; + case SAMPLE_C: return "image_sample_c"; + case 
SAMPLE_C_CL: return "image_sample_c_cl"; + case SAMPLE_C_D: return "image_sample_c_d"; + case SAMPLE_C_D_CL: return "image_sample_c_d_cl"; + case SAMPLE_C_L: return "image_sample_c_l"; + case SAMPLE_C_B: return "image_sample_c_b"; + case SAMPLE_C_B_CL: return "image_sample_c_b_cl"; + case SAMPLE_C_LZ: return "image_sample_c_lz"; + case SAMPLE_O: return "image_sample_o"; + case SAMPLE_CL_O: return "image_sample_cl_o"; + case SAMPLE_D_O: return "image_sample_d_o"; + case SAMPLE_D_CL_O: return "image_sample_d_cl_o"; + case SAMPLE_L_O: return "image_sample_l_o"; + case SAMPLE_B_O: return "image_sample_b_o"; + case SAMPLE_B_CL_O: return "image_sample_b_cl_o"; + case SAMPLE_LZ_O: return "image_sample_lz_o"; + case SAMPLE_C_O: return "image_sample_c_o"; + case SAMPLE_C_CL_O: return "image_sample_c_cl_o"; + case SAMPLE_C_D_O: return "image_sample_c_d_o"; + case SAMPLE_C_D_CL_O: return "image_sample_c_d_cl_o"; + case SAMPLE_C_L_O: return "image_sample_c_l_o"; + case SAMPLE_C_B_O: return "image_sample_c_b_o"; + case SAMPLE_C_B_CL_O: return "image_sample_c_b_cl_o"; + case SAMPLE_C_LZ_O: return "image_sample_c_lz_o"; + case GATHER4: return "image_gather4"; + case GATHER4_CL: return "image_gather4_cl"; + case GATHER4_L: return "image_gather4_l"; + case GATHER4_B: return "image_gather4_b"; + case GATHER4_B_CL: return "image_gather4_b_cl"; + case GATHER4_LZ: return "image_gather4_lz"; + case GATHER4_C: return "image_gather4_c"; + case GATHER4_C_CL: return "image_gather4_c_cl"; + case GATHER4_C_L: return "image_gather4_c_l"; + case GATHER4_C_B: return "image_gather4_c_b"; + case GATHER4_C_B_CL: return "image_gather4_c_b_cl"; + case GATHER4_C_LZ: return "image_gather4_c_lz"; + case GATHER4_O: return "image_gather4_o"; + case GATHER4_CL_O: return "image_gather4_cl_o"; + case GATHER4_L_O: return "image_gather4_l_o"; + case GATHER4_B_O: return "image_gather4_b_o"; + case GATHER4_B_CL_O: return "image_gather4_b_cl_o"; + case GATHER4_LZ_O: return "image_gather4_lz_o"; + case GATHER4_C_O: return "image_gather4_c_o"; + case GATHER4_C_CL_O: return "image_gather4_c_cl_o"; + case GATHER4_C_L_O: return "image_gather4_c_l_o"; + case GATHER4_C_B_O: return "image_gather4_c_b_o"; + case GATHER4_C_B_CL_O: return "image_gather4_c_b_cl_o"; + case GATHER4_C_LZ_O: return "image_gather4_c_lz_o"; + case GET_LOD: return "image_get_lod"; + case SAMPLE_CD: return "image_sample_cd"; + case SAMPLE_CD_CL: return "image_sample_cd_cl"; + case SAMPLE_C_CD: return "image_sample_c_cd"; + case SAMPLE_C_CD_CL: return "image_sample_c_cd_cl"; + case SAMPLE_CD_O: return "image_sample_cd_o"; + case SAMPLE_CD_CL_O: return "image_sample_cd_cl_o"; + case SAMPLE_C_CD_O: return "image_sample_c_cd_o"; + case SAMPLE_C_CD_CL_O: return "image_sample_c_cd_cl_o"; + } + return nullptr; +} +} // namespace shader::ir::mimg diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mtbuf.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mtbuf.hpp new file mode 100644 index 00000000..34db57c4 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mtbuf.hpp @@ -0,0 +1,37 @@ +#pragma once + +namespace shader::ir::mtbuf { +enum Op { + LOAD_FORMAT_X, + LOAD_FORMAT_XY, + LOAD_FORMAT_XYZ, + LOAD_FORMAT_XYZW, + STORE_FORMAT_X, + STORE_FORMAT_XY, + STORE_FORMAT_XYZ, + STORE_FORMAT_XYZW, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case LOAD_FORMAT_X: + return "tbuffer_load_format_x"; + case LOAD_FORMAT_XY: + return "tbuffer_load_format_xy"; + case LOAD_FORMAT_XYZ: + return "tbuffer_load_format_xyz"; + 
case LOAD_FORMAT_XYZW: + return "tbuffer_load_format_xyzw"; + case STORE_FORMAT_X: + return "tbuffer_store_format_x"; + case STORE_FORMAT_XY: + return "tbuffer_store_format_xy"; + case STORE_FORMAT_XYZ: + return "tbuffer_store_format_xyz"; + case STORE_FORMAT_XYZW: + return "tbuffer_store_format_xyzw"; + } + return nullptr; +} +} // namespace shader::ir::mtbuf diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mubuf.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mubuf.hpp new file mode 100644 index 00000000..aa781352 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/mubuf.hpp @@ -0,0 +1,129 @@ +#pragma once + +namespace shader::ir::mubuf { +enum Op { + LOAD_FORMAT_X, + LOAD_FORMAT_XY, + LOAD_FORMAT_XYZ, + LOAD_FORMAT_XYZW, + STORE_FORMAT_X, + STORE_FORMAT_XY, + STORE_FORMAT_XYZ, + STORE_FORMAT_XYZW, + LOAD_UBYTE, + LOAD_SBYTE, + LOAD_USHORT, + LOAD_SSHORT, + LOAD_DWORD, + LOAD_DWORDX2, + LOAD_DWORDX4, + LOAD_DWORDX3, + STORE_BYTE = 24, + STORE_SHORT = 26, + STORE_DWORD = 28, + STORE_DWORDX2, + STORE_DWORDX4, + STORE_DWORDX3, + ATOMIC_SWAP = 48, + ATOMIC_CMPSWAP, + ATOMIC_ADD, + ATOMIC_SUB, + ATOMIC_RSUB, + ATOMIC_SMIN, + ATOMIC_UMIN, + ATOMIC_SMAX, + ATOMIC_UMAX, + ATOMIC_AND, + ATOMIC_OR, + ATOMIC_XOR, + ATOMIC_INC, + ATOMIC_DEC, + ATOMIC_FCMPSWAP, + ATOMIC_FMIN, + ATOMIC_FMAX, + ATOMIC_SWAP_X2 = 80, + ATOMIC_CMPSWAP_X2, + ATOMIC_ADD_X2, + ATOMIC_SUB_X2, + ATOMIC_RSUB_X2, + ATOMIC_SMIN_X2, + ATOMIC_UMIN_X2, + ATOMIC_SMAX_X2, + ATOMIC_UMAX_X2, + ATOMIC_AND_X2, + ATOMIC_OR_X2, + ATOMIC_XOR_X2, + ATOMIC_INC_X2, + ATOMIC_DEC_X2, + ATOMIC_FCMPSWAP_X2, + ATOMIC_FMIN_X2, + ATOMIC_FMAX_X2, + WBINVL1_SC_VOL = 112, + WBINVL1, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case LOAD_FORMAT_X:return "buffer_load_format_x"; + case LOAD_FORMAT_XY:return "buffer_load_format_xy"; + case LOAD_FORMAT_XYZ:return "buffer_load_format_xyz"; + case LOAD_FORMAT_XYZW:return "buffer_load_format_xyzw"; + case STORE_FORMAT_X:return "buffer_store_format_x"; + case STORE_FORMAT_XY:return "buffer_store_format_xy"; + case STORE_FORMAT_XYZ:return "buffer_store_format_xyz"; + case STORE_FORMAT_XYZW:return "buffer_store_format_xyzw"; + case LOAD_UBYTE:return "buffer_load_ubyte"; + case LOAD_SBYTE:return "buffer_load_sbyte"; + case LOAD_USHORT:return "buffer_load_ushort"; + case LOAD_SSHORT:return "buffer_load_sshort"; + case LOAD_DWORD:return "buffer_load_dword"; + case LOAD_DWORDX2:return "buffer_load_dwordx2"; + case LOAD_DWORDX4:return "buffer_load_dwordx4"; + case LOAD_DWORDX3:return "buffer_load_dwordx3"; + case STORE_BYTE:return "buffer_store_byte"; + case STORE_SHORT:return "buffer_store_short"; + case STORE_DWORD:return "buffer_store_dword"; + case STORE_DWORDX2:return "buffer_store_dwordx2"; + case STORE_DWORDX4:return "buffer_store_dwordx4"; + case STORE_DWORDX3:return "buffer_store_dwordx3"; + case ATOMIC_SWAP:return "buffer_atomic_swap"; + case ATOMIC_CMPSWAP:return "buffer_atomic_cmpswap"; + case ATOMIC_ADD:return "buffer_atomic_add"; + case ATOMIC_SUB:return "buffer_atomic_sub"; + case ATOMIC_RSUB:return "buffer_atomic_rsub"; + case ATOMIC_SMIN:return "buffer_atomic_smin"; + case ATOMIC_UMIN:return "buffer_atomic_umin"; + case ATOMIC_SMAX:return "buffer_atomic_smax"; + case ATOMIC_UMAX:return "buffer_atomic_umax"; + case ATOMIC_AND:return "buffer_atomic_and"; + case ATOMIC_OR:return "buffer_atomic_or"; + case ATOMIC_XOR:return "buffer_atomic_xor"; + case ATOMIC_INC:return "buffer_atomic_inc"; + case ATOMIC_DEC:return 
"buffer_atomic_dec"; + case ATOMIC_FCMPSWAP:return "buffer_atomic_fcmpswap"; + case ATOMIC_FMIN:return "buffer_atomic_fmin"; + case ATOMIC_FMAX:return "buffer_atomic_fmax"; + case ATOMIC_SWAP_X2:return "buffer_atomic_swap_x2"; + case ATOMIC_CMPSWAP_X2:return "buffer_atomic_cmpswap_x2"; + case ATOMIC_ADD_X2:return "buffer_atomic_add_x2"; + case ATOMIC_SUB_X2:return "buffer_atomic_sub_x2"; + case ATOMIC_RSUB_X2:return "buffer_atomic_rsub_x2"; + case ATOMIC_SMIN_X2:return "buffer_atomic_smin_x2"; + case ATOMIC_UMIN_X2:return "buffer_atomic_umin_x2"; + case ATOMIC_SMAX_X2:return "buffer_atomic_smax_x2"; + case ATOMIC_UMAX_X2:return "buffer_atomic_umax_x2"; + case ATOMIC_AND_X2:return "buffer_atomic_and_x2"; + case ATOMIC_OR_X2:return "buffer_atomic_or_x2"; + case ATOMIC_XOR_X2:return "buffer_atomic_xor_x2"; + case ATOMIC_INC_X2:return "buffer_atomic_inc_x2"; + case ATOMIC_DEC_X2:return "buffer_atomic_dec_x2"; + case ATOMIC_FCMPSWAP_X2:return "buffer_atomic_fcmpswap_x2"; + case ATOMIC_FMIN_X2:return "buffer_atomic_fmin_x2"; + case ATOMIC_FMAX_X2:return "buffer_atomic_fmax_x2"; + case WBINVL1_SC_VOL:return "buffer_wbinvl1_sc_vol"; + case WBINVL1:return "buffer_wbinvl1"; + } + return nullptr; +} +} // namespace shader::ir::mubuf diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/smrd.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/smrd.hpp new file mode 100644 index 00000000..acc8e590 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/smrd.hpp @@ -0,0 +1,39 @@ +#pragma once + +namespace shader::ir::smrd { +enum Op { + LOAD_DWORD, + LOAD_DWORDX2, + LOAD_DWORDX4, + LOAD_DWORDX8, + LOAD_DWORDX16, + BUFFER_LOAD_DWORD = 8, + BUFFER_LOAD_DWORDX2, + BUFFER_LOAD_DWORDX4, + BUFFER_LOAD_DWORDX8, + BUFFER_LOAD_DWORDX16, + DCACHE_INV_VOL = 29, + MEMTIME, + DCACHE_INV, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case LOAD_DWORD: return "s_load_dword"; + case LOAD_DWORDX2: return "s_load_dwordx2"; + case LOAD_DWORDX4: return "s_load_dwordx4"; + case LOAD_DWORDX8: return "s_load_dwordx8"; + case LOAD_DWORDX16: return "s_load_dwordx16"; + case BUFFER_LOAD_DWORD: return "s_buffer_load_dword"; + case BUFFER_LOAD_DWORDX2: return "s_buffer_load_dwordx2"; + case BUFFER_LOAD_DWORDX4: return "s_buffer_load_dwordx4"; + case BUFFER_LOAD_DWORDX8: return "s_buffer_load_dwordx8"; + case BUFFER_LOAD_DWORDX16: return "s_buffer_load_dwordx16"; + case DCACHE_INV_VOL: return "s_dcache_inv_vol"; + case MEMTIME: return "s_memtime"; + case DCACHE_INV: return "s_dcache_inv"; + } + return nullptr; +} +} // namespace shader::ir::smrd diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sop1.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sop1.hpp new file mode 100644 index 00000000..32d2102f --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sop1.hpp @@ -0,0 +1,109 @@ +#pragma once + +namespace shader::ir::sop1 { +enum Op { + MOV_B32 = 3, + MOV_B64, + CMOV_B32, + CMOV_B64, + NOT_B32, + NOT_B64, + WQM_B32, + WQM_B64, + BREV_B32, + BREV_B64, + BCNT0_I32_B32, + BCNT0_I32_B64, + BCNT1_I32_B32, + BCNT1_I32_B64, + FF0_I32_B32, + FF0_I32_B64, + FF1_I32_B32, + FF1_I32_B64, + FLBIT_I32_B32, + FLBIT_I32_B64, + FLBIT_I32, + FLBIT_I32_I64, + SEXT_I32_I8, + SEXT_I32_I16, + BITSET0_B32, + BITSET0_B64, + BITSET1_B32, + BITSET1_B64, + GETPC_B64, + SETPC_B64, + SWAPPC_B64, + AND_SAVEEXEC_B64 = 36, + OR_SAVEEXEC_B64, + XOR_SAVEEXEC_B64, + ANDN2_SAVEEXEC_B64, + ORN2_SAVEEXEC_B64, + NAND_SAVEEXEC_B64, + NOR_SAVEEXEC_B64, + XNOR_SAVEEXEC_B64, 
+ QUADMASK_B32, + QUADMASK_B64, + MOVRELS_B32, + MOVRELS_B64, + MOVRELD_B32, + MOVRELD_B64, + CBRANCH_JOIN, + ABS_I32 = 52, + MOV_FED_B32, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case MOV_B32: return "s_mov_b32"; + case MOV_B64: return "s_mov_b64"; + case CMOV_B32: return "s_cmov_b32"; + case CMOV_B64: return "s_cmov_b64"; + case NOT_B32: return "s_not_b32"; + case NOT_B64: return "s_not_b64"; + case WQM_B32: return "s_wqm_b32"; + case WQM_B64: return "s_wqm_b64"; + case BREV_B32: return "s_brev_b32"; + case BREV_B64: return "s_brev_b64"; + case BCNT0_I32_B32: return "s_bcnt0_i32_b32"; + case BCNT0_I32_B64: return "s_bcnt0_i32_b64"; + case BCNT1_I32_B32: return "s_bcnt1_i32_b32"; + case BCNT1_I32_B64: return "s_bcnt1_i32_b64"; + case FF0_I32_B32: return "s_ff0_i32_b32"; + case FF0_I32_B64: return "s_ff0_i32_b64"; + case FF1_I32_B32: return "s_ff1_i32_b32"; + case FF1_I32_B64: return "s_ff1_i32_b64"; + case FLBIT_I32_B32: return "s_flbit_i32_b32"; + case FLBIT_I32_B64: return "s_flbit_i32_b64"; + case FLBIT_I32: return "s_flbit_i32"; + case FLBIT_I32_I64: return "s_flbit_i32_i64"; + case SEXT_I32_I8: return "s_sext_i32_i8"; + case SEXT_I32_I16: return "s_sext_i32_i16"; + case BITSET0_B32: return "s_bitset0_b32"; + case BITSET0_B64: return "s_bitset0_b64"; + case BITSET1_B32: return "s_bitset1_b32"; + case BITSET1_B64: return "s_bitset1_b64"; + case GETPC_B64: return "s_getpc_b64"; + case SETPC_B64: return "s_setpc_b64"; + case SWAPPC_B64: return "s_swappc_b64"; + case AND_SAVEEXEC_B64: return "s_and_saveexec_b64"; + case OR_SAVEEXEC_B64: return "s_or_saveexec_b64"; + case XOR_SAVEEXEC_B64: return "s_xor_saveexec_b64"; + case ANDN2_SAVEEXEC_B64: return "s_andn2_saveexec_b64"; + case ORN2_SAVEEXEC_B64: return "s_orn2_saveexec_b64"; + case NAND_SAVEEXEC_B64: return "s_nand_saveexec_b64"; + case NOR_SAVEEXEC_B64: return "s_nor_saveexec_b64"; + case XNOR_SAVEEXEC_B64: return "s_xnor_saveexec_b64"; + case QUADMASK_B32: return "s_quadmask_b32"; + case QUADMASK_B64: return "s_quadmask_b64"; + case MOVRELS_B32: return "s_movrels_b32"; + case MOVRELS_B64: return "s_movrels_b64"; + case MOVRELD_B32: return "s_movreld_b32"; + case MOVRELD_B64: return "s_movreld_b64"; + case CBRANCH_JOIN: return "s_cbranch_join"; + case ABS_I32: return "s_abs_i32"; + case MOV_FED_B32: return "s_mov_fed_b32"; + } + return nullptr; +} +} diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sop2.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sop2.hpp new file mode 100644 index 00000000..7b6a0870 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sop2.hpp @@ -0,0 +1,171 @@ +#pragma once +#include "../ir.hpp" + +namespace shader::ir::sop2 { +enum Op { + ADD_U32, + SUB_U32, + ADD_I32, + SUB_I32, + ADDC_U32, + SUBB_U32, + MIN_I32, + MIN_U32, + MAX_I32, + MAX_U32, + CSELECT_B32, + CSELECT_B64, + AND_B32 = 14, + AND_B64, + OR_B32, + OR_B64, + XOR_B32, + XOR_B64, + ANDN2_B32, + ANDN2_B64, + ORN2_B32, + ORN2_B64, + NAND_B32, + NAND_B64, + NOR_B32, + NOR_B64, + XNOR_B32, + XNOR_B64, + LSHL_B32, + LSHL_B64, + LSHR_B32, + LSHR_B64, + ASHR_I32, + ASHR_I64, + BFM_B32, + BFM_B64, + MUL_I32, + BFE_U32, + BFE_I32, + BFE_U64, + BFE_I64, + CBRANCH_G_FORK, + ABSDIFF_I32, + LSHL1_ADD_U32, + LSHL2_ADD_U32, + LSHL3_ADD_U32, + LSHL4_ADD_U32, + PACK_LL_B32_B16, + PACK_LH_B32_B16, + PACK_HH_B32_B16, + MUL_HI_U32, + MUL_HI_I32, + + OpCount +}; + +inline const char *getInstructionName(unsigned id) { + switch (id) { + case ADD_U32: + return "s_add_u32"; + case SUB_U32: 
+ return "s_sub_u32"; + case ADD_I32: + return "s_add_i32"; + case SUB_I32: + return "s_sub_i32"; + case ADDC_U32: + return "s_addc_u32"; + case SUBB_U32: + return "s_subb_u32"; + case MIN_I32: + return "s_min_i32"; + case MIN_U32: + return "s_min_u32"; + case MAX_I32: + return "s_max_i32"; + case MAX_U32: + return "s_max_u32"; + case CSELECT_B32: + return "s_cselect_b32"; + case CSELECT_B64: + return "s_cselect_b64"; + case AND_B32: + return "s_and_b32"; + case AND_B64: + return "s_and_b64"; + case OR_B32: + return "s_or_b32"; + case OR_B64: + return "s_or_b64"; + case XOR_B32: + return "s_xor_b32"; + case XOR_B64: + return "s_xor_b64"; + case ANDN2_B32: + return "s_andn2_b32"; + case ANDN2_B64: + return "s_andn2_b64"; + case ORN2_B32: + return "s_orn2_b32"; + case ORN2_B64: + return "s_orn2_b64"; + case NAND_B32: + return "s_nand_b32"; + case NAND_B64: + return "s_nand_b64"; + case NOR_B32: + return "s_nor_b32"; + case NOR_B64: + return "s_nor_b64"; + case XNOR_B32: + return "s_xnor_b32"; + case XNOR_B64: + return "s_xnor_b64"; + case LSHL_B32: + return "s_lshl_b32"; + case LSHL_B64: + return "s_lshl_b64"; + case LSHR_B32: + return "s_lshr_b32"; + case LSHR_B64: + return "s_lshr_b64"; + case ASHR_I32: + return "s_ashr_i32"; + case ASHR_I64: + return "s_ashr_i64"; + case BFM_B32: + return "s_bfm_b32"; + case BFM_B64: + return "s_bfm_b64"; + case MUL_I32: + return "s_mul_i32"; + case BFE_U32: + return "s_bfe_u32"; + case BFE_I32: + return "s_bfe_i32"; + case BFE_U64: + return "s_bfe_u64"; + case BFE_I64: + return "s_bfe_i64"; + case CBRANCH_G_FORK: + return "s_cbranch_g_fork"; + case ABSDIFF_I32: + return "s_absdiff_i32"; + case LSHL1_ADD_U32: + return "s_lshl1_add_u32"; + case LSHL2_ADD_U32: + return "s_lshl2_add_u32"; + case LSHL3_ADD_U32: + return "s_lshl3_add_u32"; + case LSHL4_ADD_U32: + return "s_lshl4_add_u32"; + case PACK_LL_B32_B16: + return "s_pack_ll_b32_b16"; + case PACK_LH_B32_B16: + return "s_pack_lh_b32_b16"; + case PACK_HH_B32_B16: + return "s_pack_hh_b32_b16"; + case MUL_HI_U32: + return "s_mul_hi_u32"; + case MUL_HI_I32: + return "s_mul_hi_i32"; + } + return nullptr; +} +} // namespace shader::ir::sop2 diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopc.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopc.hpp new file mode 100644 index 00000000..860d9e34 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopc.hpp @@ -0,0 +1,67 @@ +#pragma once + +namespace shader::ir::sopc { +enum Op { + CMP_EQ_I32, + CMP_LG_I32, + CMP_GT_I32, + CMP_GE_I32, + CMP_LT_I32, + CMP_LE_I32, + CMP_EQ_U32, + CMP_LG_U32, + CMP_GT_U32, + CMP_GE_U32, + CMP_LT_U32, + CMP_LE_U32, + BITCMP0_B32, + BITCMP1_B32, + BITCMP0_B64, + BITCMP1_B64, + SETVSKIP, + ILLEGALD, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case CMP_EQ_I32: + return "s_cmp_eq_i32"; + case CMP_LG_I32: + return "s_cmp_lg_i32"; + case CMP_GT_I32: + return "s_cmp_gt_i32"; + case CMP_GE_I32: + return "s_cmp_ge_i32"; + case CMP_LT_I32: + return "s_cmp_lt_i32"; + case CMP_LE_I32: + return "s_cmp_le_i32"; + case CMP_EQ_U32: + return "s_cmp_eq_u32"; + case CMP_LG_U32: + return "s_cmp_lg_u32"; + case CMP_GT_U32: + return "s_cmp_gt_u32"; + case CMP_GE_U32: + return "s_cmp_ge_u32"; + case CMP_LT_U32: + return "s_cmp_lt_u32"; + case CMP_LE_U32: + return "s_cmp_le_u32"; + case BITCMP0_B32: + return "bitcmp0_b32"; + case BITCMP1_B32: + return "bitcmp1_b32"; + case BITCMP0_B64: + return "bitcmp0_b64"; + case BITCMP1_B64: + return "bitcmp1_b64"; + case SETVSKIP: + return 
"setvskip"; + case ILLEGALD: + return "illegald"; + } + return nullptr; +} +} // namespace shader::ir::sopc diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopk.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopk.hpp new file mode 100644 index 00000000..e0eae029 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopk.hpp @@ -0,0 +1,73 @@ +#pragma once + +namespace shader::ir::sopk { +enum Op { + MOVK_I32, + CMOVK_I32 = 2, + CMPK_EQ_I32, + CMPK_LG_I32, + CMPK_GT_I32, + CMPK_GE_I32, + CMPK_LT_I32, + CMPK_LE_I32, + CMPK_EQ_U32, + CMPK_LG_U32, + CMPK_GT_U32, + CMPK_GE_U32, + CMPK_LT_U32, + CMPK_LE_U32, + ADDK_I32, + MULK_I32, + CBRANCH_I_FORK, + GETREG_B32, + SETREG_B32, + SETREG_IMM, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case MOVK_I32: + return "s_movk_i32"; + case CMOVK_I32: + return "s_cmovk_i32"; + case CMPK_EQ_I32: + return "s_cmpk_eq_i32"; + case CMPK_LG_I32: + return "s_cmpk_lg_i32"; + case CMPK_GT_I32: + return "s_cmpk_gt_i32"; + case CMPK_GE_I32: + return "s_cmpk_ge_i32"; + case CMPK_LT_I32: + return "s_cmpk_lt_i32"; + case CMPK_LE_I32: + return "s_cmpk_le_i32"; + case CMPK_EQ_U32: + return "s_cmpk_eq_u32"; + case CMPK_LG_U32: + return "s_cmpk_lg_u32"; + case CMPK_GT_U32: + return "s_cmpk_gt_u32"; + case CMPK_GE_U32: + return "s_cmpk_ge_u32"; + case CMPK_LT_U32: + return "s_cmpk_lt_u32"; + case CMPK_LE_U32: + return "s_cmpk_le_u32"; + case ADDK_I32: + return "s_addk_i32"; + case MULK_I32: + return "s_mulk_i32"; + case CBRANCH_I_FORK: + return "s_cbranch_i_fork"; + case GETREG_B32: + return "s_getreg_b32"; + case SETREG_B32: + return "s_setreg_b32"; + case SETREG_IMM: + return "s_setreg_imm"; + } + return nullptr; +} +} // namespace shader::ir::sopk diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopp.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopp.hpp new file mode 100644 index 00000000..6c200e84 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/sopp.hpp @@ -0,0 +1,89 @@ +#pragma once + +namespace shader::ir::sopp { +enum Op { + NOP, + ENDPGM, + BRANCH, + CBRANCH_SCC0 = 4, + CBRANCH_SCC1, + CBRANCH_VCCZ, + CBRANCH_VCCNZ, + CBRANCH_EXECZ, + CBRANCH_EXECNZ, + BARRIER, + WAITCNT = 12, + SETHALT, + SLEEP, + SETPRIO, + SENDMSG, + SENDMSGHALT, + TRAP, + ICACHE_INV, + INCPERFLEVEL, + DECPERFLEVEL, + TTRACEDATA, + CBRANCH_CDBGSYS = 23, + CBRANCH_CDBGUSER = 24, + CBRANCH_CDBGSYS_OR_USER = 25, + CBRANCH_CDBGSYS_AND_USER = 26, + + OpCount +}; + +inline const char *getInstructionName(unsigned id) { + switch (id) { + case NOP: + return "s_nop"; + case ENDPGM: + return "s_endpgm"; + case BRANCH: + return "s_branch"; + case CBRANCH_SCC0: + return "s_cbranch_scc0"; + case CBRANCH_SCC1: + return "s_cbranch_scc1"; + case CBRANCH_VCCZ: + return "s_cbranch_vccz"; + case CBRANCH_VCCNZ: + return "s_cbranch_vccnz"; + case CBRANCH_EXECZ: + return "s_cbranch_execz"; + case CBRANCH_EXECNZ: + return "s_cbranch_execnz"; + case BARRIER: + return "s_barrier"; + case WAITCNT: + return "s_waitcnt"; + case SETHALT: + return "s_sethalt"; + case SLEEP: + return "s_sleep"; + case SETPRIO: + return "s_setprio"; + case SENDMSG: + return "s_sendmsg"; + case SENDMSGHALT: + return "s_sendmsghalt"; + case TRAP: + return "s_trap"; + case ICACHE_INV: + return "s_icache_inv"; + case INCPERFLEVEL: + return "s_incperflevel"; + case DECPERFLEVEL: + return "s_decperflevel"; + case TTRACEDATA: + return "s_ttracedata"; + case CBRANCH_CDBGSYS: + return "s_cbranch_cdbgsys"; + case CBRANCH_CDBGUSER: + 
return "s_cbranch_cdbguser"; + case CBRANCH_CDBGSYS_OR_USER: + return "s_cbranch_cdbgsys_or_user"; + case CBRANCH_CDBGSYS_AND_USER: + return "s_cbranch_cdbgsys_and_user"; + } + return nullptr; +} +} // namespace shader::ir::sopp diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vintrp.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vintrp.hpp new file mode 100644 index 00000000..01160a8d --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vintrp.hpp @@ -0,0 +1,23 @@ +#pragma once + +namespace shader::ir::vintrp { +enum Op { + P1_F32, + P2_F32, + MOV_F32, + + OpCount +}; + +inline const char *getInstructionName(unsigned id) { + switch (id) { + case P1_F32: + return "v_interp_p1_f32"; + case P2_F32: + return "v_interp_p2_f32"; + case MOV_F32: + return "v_interp_mov_f32"; + } + return nullptr; +} +} // namespace shader::ir::vintrp diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop1.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop1.hpp new file mode 100644 index 00000000..5a6b3a2a --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop1.hpp @@ -0,0 +1,259 @@ +#pragma once + +namespace shader::ir::vop1 { +enum Op { + NOP, + MOV_B32, + READFIRSTLANE_B32, + CVT_I32_F64, + CVT_F64_I32, + CVT_F32_I32, + CVT_F32_U32, + CVT_U32_F32, + CVT_I32_F32, + MOV_FED_B32, + CVT_F16_F32, + CVT_F32_F16, + CVT_RPI_I32_F32, + CVT_FLR_I32_F32, + CVT_OFF_F32_I4, + CVT_F32_F64, + CVT_F64_F32, + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + CVT_U32_F64, + CVT_F64_U32, + FRACT_F32 = 32, + TRUNC_F32, + CEIL_F32, + RNDNE_F32, + FLOOR_F32, + EXP_F32, + LOG_CLAMP_F32, + LOG_F32, + RCP_CLAMP_F32, + RCP_LEGACY_F32, + RCP_F32, + RCP_IFLAG_F32, + RSQ_CLAMP_F32, + RSQ_LEGACY_F32, + RSQ_F32, + RCP_F64, + RCP_CLAMP_F64, + RSQ_F64, + RSQ_CLAMP_F64, + SQRT_F32, + SQRT_F64, + SIN_F32, + COS_F32, + NOT_B32, + BFREV_B32, + FFBH_U32, + FFBL_B32, + FFBH_I32, + FREXP_EXP_I32_F64, + FREXP_MANT_F64, + FRACT_F64, + FREXP_EXP_I32_F32, + FREXP_MANT_F32, + CLREXCP, + MOVRELD_B32, + MOVRELS_B32, + MOVRELSD_B32, + CVT_F16_U16 = 80, + CVT_F16_I16, + CVT_U16_F16, + CVT_I16_F16, + RCP_F16, + SQRT_F16, + RSQ_F16, + LOG_F16, + EXP_F16, + FREXP_MANT_F16, + FREXP_EXP_I16_F16, + FLOOR_F16, + CEIL_F16, + TRUNC_F16, + RNDNE_F16, + FRACT_F16, + SIN_F16, + COS_F16, + SAT_PK_U8_I16, + CVT_NORM_I16_F16, + CVT_NORM_U16_F16, + SWAP_B32, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case NOP: + return "v_nop"; + case MOV_B32: + return "v_mov_b32"; + case READFIRSTLANE_B32: + return "v_readfirstlane_b32"; + case CVT_I32_F64: + return "v_cvt_i32_f64"; + case CVT_F64_I32: + return "v_cvt_f64_i32"; + case CVT_F32_I32: + return "v_cvt_f32_i32"; + case CVT_F32_U32: + return "v_cvt_f32_u32"; + case CVT_U32_F32: + return "v_cvt_u32_f32"; + case CVT_I32_F32: + return "v_cvt_i32_f32"; + case MOV_FED_B32: + return "v_mov_fed_b32"; + case CVT_F16_F32: + return "v_cvt_f16_f32"; + case CVT_F32_F16: + return "v_cvt_f32_f16"; + case CVT_RPI_I32_F32: + return "v_cvt_rpi_i32_f32"; + case CVT_FLR_I32_F32: + return "v_cvt_flr_i32_f32"; + case CVT_OFF_F32_I4: + return "v_cvt_off_f32_i4"; + case CVT_F32_F64: + return "v_cvt_f32_f64"; + case CVT_F64_F32: + return "v_cvt_f64_f32"; + case CVT_F32_UBYTE0: + return "v_cvt_f32_ubyte0"; + case CVT_F32_UBYTE1: + return "v_cvt_f32_ubyte1"; + case CVT_F32_UBYTE2: + return "v_cvt_f32_ubyte2"; + case CVT_F32_UBYTE3: + return "v_cvt_f32_ubyte3"; + case CVT_U32_F64: + return "v_cvt_u32_f64"; + case 
CVT_F64_U32: + return "v_cvt_f64_u32"; + case FRACT_F32: + return "v_fract_f32"; + case TRUNC_F32: + return "v_trunc_f32"; + case CEIL_F32: + return "v_ceil_f32"; + case RNDNE_F32: + return "v_rndne_f32"; + case FLOOR_F32: + return "v_floor_f32"; + case EXP_F32: + return "v_exp_f32"; + case LOG_CLAMP_F32: + return "v_log_clamp_f32"; + case LOG_F32: + return "v_log_f32"; + case RCP_CLAMP_F32: + return "v_rcp_clamp_f32"; + case RCP_LEGACY_F32: + return "v_rcp_legacy_f32"; + case RCP_F32: + return "v_rcp_f32"; + case RCP_IFLAG_F32: + return "v_rcp_iflag_f32"; + case RSQ_CLAMP_F32: + return "v_rsq_clamp_f32"; + case RSQ_LEGACY_F32: + return "v_rsq_legacy_f32"; + case RSQ_F32: + return "v_rsq_f32"; + case RCP_F64: + return "v_rcp_f64"; + case RCP_CLAMP_F64: + return "v_rcp_clamp_f64"; + case RSQ_F64: + return "v_rsq_f64"; + case RSQ_CLAMP_F64: + return "v_rsq_clamp_f64"; + case SQRT_F32: + return "v_sqrt_f32"; + case SQRT_F64: + return "v_sqrt_f64"; + case SIN_F32: + return "v_sin_f32"; + case COS_F32: + return "v_cos_f32"; + case NOT_B32: + return "v_not_b32"; + case BFREV_B32: + return "v_bfrev_b32"; + case FFBH_U32: + return "v_ffbh_u32"; + case FFBL_B32: + return "v_ffbl_b32"; + case FFBH_I32: + return "v_ffbh_i32"; + case FREXP_EXP_I32_F64: + return "v_frexp_exp_i32_f64"; + case FREXP_MANT_F64: + return "v_frexp_mant_f64"; + case FRACT_F64: + return "v_fract_f64"; + case FREXP_EXP_I32_F32: + return "v_frexp_exp_i32_f32"; + case FREXP_MANT_F32: + return "v_frexp_mant_f32"; + case CLREXCP: + return "v_clrexcp"; + case MOVRELD_B32: + return "v_movreld_b32"; + case MOVRELS_B32: + return "v_movrels_b32"; + case MOVRELSD_B32: + return "v_movrelsd_b32"; + case CVT_F16_U16: + return "v_cvt_f16_u16"; + case CVT_F16_I16: + return "v_cvt_f16_i16"; + case CVT_U16_F16: + return "v_cvt_u16_f16"; + case CVT_I16_F16: + return "v_cvt_i16_f16"; + case RCP_F16: + return "v_rcp_f16"; + case SQRT_F16: + return "v_sqrt_f16"; + case RSQ_F16: + return "v_rsq_f16"; + case LOG_F16: + return "v_log_f16"; + case EXP_F16: + return "v_exp_f16"; + case FREXP_MANT_F16: + return "v_frexp_mant_f16"; + case FREXP_EXP_I16_F16: + return "v_frexp_exp_i16_f16"; + case FLOOR_F16: + return "v_floor_f16"; + case CEIL_F16: + return "v_ceil_f16"; + case TRUNC_F16: + return "v_trunc_f16"; + case RNDNE_F16: + return "v_rndne_f16"; + case FRACT_F16: + return "v_fract_f16"; + case SIN_F16: + return "v_sin_f16"; + case COS_F16: + return "v_cos_f16"; + case SAT_PK_U8_I16: + return "v_sat_pk_u8_i16"; + case CVT_NORM_I16_F16: + return "v_cvt_norm_i16_f16"; + case CVT_NORM_U16_F16: + return "v_cvt_norm_u16_f16"; + case SWAP_B32: + return "v_swap_b32"; + } + return nullptr; +} +} // namespace shader::ir::vop1 diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop2.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop2.hpp new file mode 100644 index 00000000..d6d1a7ce --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop2.hpp @@ -0,0 +1,164 @@ +#pragma once + +namespace shader::ir::vop2 { +enum Op { + CNDMASK_B32, + READLANE_B32, + WRITELANE_B32, + ADD_F32, + SUB_F32, + SUBREV_F32, + MAC_LEGACY_F32, + MUL_LEGACY_F32, + MUL_F32, + MUL_I32_I24, + MUL_HI_I32_I24, + MUL_U32_U24, + MUL_HI_U32_U24, + MIN_LEGACY_F32, + MAX_LEGACY_F32, + MIN_F32, + MAX_F32, + MIN_I32, + MAX_I32, + MIN_U32, + MAX_U32, + LSHR_B32, + LSHRREV_B32, + ASHR_I32, + ASHRREV_I32, + LSHL_B32, + LSHLREV_B32, + AND_B32, + OR_B32, + XOR_B32, + BFM_B32, + MAC_F32, + MADMK_F32, + MADAK_F32, + BCNT_U32_B32, + MBCNT_LO_U32_B32, + MBCNT_HI_U32_B32, + 
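+  // The integer add/sub group below (ADD_I32 ... SUBBREV_U32) writes a
+  // per-lane carry/borrow mask to VCC; the ADDC/SUBB/SUBBREV forms also
+  // consume it as a third input.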
ADD_I32, + SUB_I32, + SUBREV_I32, + ADDC_U32, + SUBB_U32, + SUBBREV_U32, + LDEXP_F32, + CVT_PKACCUM_U8_F32, + CVT_PKNORM_I16_F32, + CVT_PKNORM_U16_F32, + CVT_PKRTZ_F16_F32, + CVT_PK_U16_U32, + CVT_PK_I16_I32, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case CNDMASK_B32: + return "v_cndmask_b32"; + case READLANE_B32: + return "v_readlane_b32"; + case WRITELANE_B32: + return "v_writelane_b32"; + case ADD_F32: + return "v_add_f32"; + case SUB_F32: + return "v_sub_f32"; + case SUBREV_F32: + return "v_subrev_f32"; + case MAC_LEGACY_F32: + return "v_mac_legacy_f32"; + case MUL_LEGACY_F32: + return "v_mul_legacy_f32"; + case MUL_F32: + return "v_mul_f32"; + case MUL_I32_I24: + return "v_mul_i32_i24"; + case MUL_HI_I32_I24: + return "v_mul_hi_i32_i24"; + case MUL_U32_U24: + return "v_mul_u32_u24"; + case MUL_HI_U32_U24: + return "v_mul_hi_u32_u24"; + case MIN_LEGACY_F32: + return "v_min_legacy_f32"; + case MAX_LEGACY_F32: + return "v_max_legacy_f32"; + case MIN_F32: + return "v_min_f32"; + case MAX_F32: + return "v_max_f32"; + case MIN_I32: + return "v_min_i32"; + case MAX_I32: + return "v_max_i32"; + case MIN_U32: + return "v_min_u32"; + case MAX_U32: + return "v_max_u32"; + case LSHR_B32: + return "v_lshr_b32"; + case LSHRREV_B32: + return "v_lshrrev_b32"; + case ASHR_I32: + return "v_ashr_i32"; + case ASHRREV_I32: + return "v_ashrrev_i32"; + case LSHL_B32: + return "v_lshl_b32"; + case LSHLREV_B32: + return "v_lshlrev_b32"; + case AND_B32: + return "v_and_b32"; + case OR_B32: + return "v_or_b32"; + case XOR_B32: + return "v_xor_b32"; + case BFM_B32: + return "v_bfm_b32"; + case MAC_F32: + return "v_mac_f32"; + case MADMK_F32: + return "v_madmk_f32"; + case MADAK_F32: + return "v_madak_f32"; + case BCNT_U32_B32: + return "v_bcnt_u32_b32"; + case MBCNT_LO_U32_B32: + return "v_mbcnt_lo_u32_b32"; + case MBCNT_HI_U32_B32: + return "v_mbcnt_hi_u32_b32"; + case ADD_I32: + return "v_add_i32"; + case SUB_I32: + return "v_sub_i32"; + case SUBREV_I32: + return "v_subrev_i32"; + case ADDC_U32: + return "v_addc_u32"; + case SUBB_U32: + return "v_subb_u32"; + case SUBBREV_U32: + return "v_subbrev_u32"; + case LDEXP_F32: + return "v_ldexp_f32"; + case CVT_PKACCUM_U8_F32: + return "v_cvt_pkaccum_u8_f32"; + case CVT_PKNORM_I16_F32: + return "v_cvt_pknorm_i16_f32"; + case CVT_PKNORM_U16_F32: + return "v_cvt_pknorm_u16_f32"; + case CVT_PKRTZ_F16_F32: + return "v_cvt_pkrtz_f16_f32"; + case CVT_PK_U16_U32: + return "v_cvt_pk_u16_u32"; + case CVT_PK_I16_I32: + return "v_cvt_pk_i16_i32"; + } + return nullptr; +} + +} // namespace shader::ir::vop2 diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop3.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop3.hpp new file mode 100644 index 00000000..1deafafc --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vop3.hpp @@ -0,0 +1,1249 @@ +#pragma once + +namespace shader::ir::vop3 { +enum Op { + CMP_F_F32, + CMP_LT_F32, + CMP_EQ_F32, + CMP_LE_F32, + CMP_GT_F32, + CMP_LG_F32, + CMP_GE_F32, + CMP_O_F32, + CMP_U_F32, + CMP_NGE_F32, + CMP_NLG_F32, + CMP_NGT_F32, + CMP_NLE_F32, + CMP_NEQ_F32, + CMP_NLT_F32, + CMP_TRU_F32, + CMPX_F_F32, + CMPX_LT_F32, + CMPX_EQ_F32, + CMPX_LE_F32, + CMPX_GT_F32, + CMPX_LG_F32, + CMPX_GE_F32, + CMPX_O_F32, + CMPX_U_F32, + CMPX_NGE_F32, + CMPX_NLG_F32, + CMPX_NGT_F32, + CMPX_NLE_F32, + CMPX_NEQ_F32, + CMPX_NLT_F32, + CMPX_TRU_F32, + CMP_F_F64, + CMP_LT_F64, + CMP_EQ_F64, + CMP_LE_F64, + CMP_GT_F64, + CMP_LG_F64, + CMP_GE_F64, + CMP_O_F64, + CMP_U_F64, + CMP_NGE_F64, + 
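+  // As in the f32 rows above, the CMPX_* f64 forms later in this block also
+  // write the per-lane comparison result to EXEC.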
CMP_NLG_F64, + CMP_NGT_F64, + CMP_NLE_F64, + CMP_NEQ_F64, + CMP_NLT_F64, + CMP_TRU_F64, + CMPX_F_F64, + CMPX_LT_F64, + CMPX_EQ_F64, + CMPX_LE_F64, + CMPX_GT_F64, + CMPX_LG_F64, + CMPX_GE_F64, + CMPX_O_F64, + CMPX_U_F64, + CMPX_NGE_F64, + CMPX_NLG_F64, + CMPX_NGT_F64, + CMPX_NLE_F64, + CMPX_NEQ_F64, + CMPX_NLT_F64, + CMPX_TRU_F64, + CMPS_F_F32, + CMPS_LT_F32, + CMPS_EQ_F32, + CMPS_LE_F32, + CMPS_GT_F32, + CMPS_LG_F32, + CMPS_GE_F32, + CMPS_O_F32, + CMPS_U_F32, + CMPS_NGE_F32, + CMPS_NLG_F32, + CMPS_NGT_F32, + CMPS_NLE_F32, + CMPS_NEQ_F32, + CMPS_NLT_F32, + CMPS_TRU_F32, + CMPSX_F_F32, + CMPSX_LT_F32, + CMPSX_EQ_F32, + CMPSX_LE_F32, + CMPSX_GT_F32, + CMPSX_LG_F32, + CMPSX_GE_F32, + CMPSX_O_F32, + CMPSX_U_F32, + CMPSX_NGE_F32, + CMPSX_NLG_F32, + CMPSX_NGT_F32, + CMPSX_NLE_F32, + CMPSX_NEQ_F32, + CMPSX_NLT_F32, + CMPSX_TRU_F32, + CMPS_F_F64, + CMPS_LT_F64, + CMPS_EQ_F64, + CMPS_LE_F64, + CMPS_GT_F64, + CMPS_LG_F64, + CMPS_GE_F64, + CMPS_O_F64, + CMPS_U_F64, + CMPS_NGE_F64, + CMPS_NLG_F64, + CMPS_NGT_F64, + CMPS_NLE_F64, + CMPS_NEQ_F64, + CMPS_NLT_F64, + CMPS_TRU_F64, + CMPSX_F_F64, + CMPSX_LT_F64, + CMPSX_EQ_F64, + CMPSX_LE_F64, + CMPSX_GT_F64, + CMPSX_LG_F64, + CMPSX_GE_F64, + CMPSX_O_F64, + CMPSX_U_F64, + CMPSX_NGE_F64, + CMPSX_NLG_F64, + CMPSX_NGT_F64, + CMPSX_NLE_F64, + CMPSX_NEQ_F64, + CMPSX_NLT_F64, + CMPSX_TRU_F64, + CMP_F_I32, + CMP_LT_I32, + CMP_EQ_I32, + CMP_LE_I32, + CMP_GT_I32, + CMP_NE_I32, + CMP_GE_I32, + CMP_T_I32, + CMP_CLASS_F32, + CMP_LT_I16, + CMP_EQ_I16, + CMP_LE_I16, + CMP_GT_I16, + CMP_NE_I16, + CMP_GE_I16, + CMP_CLASS_F16, + CMPX_F_I32, + CMPX_LT_I32, + CMPX_EQ_I32, + CMPX_LE_I32, + CMPX_GT_I32, + CMPX_NE_I32, + CMPX_GE_I32, + CMPX_T_I32, + CMPX_CLASS_F32, + CMPX_LT_I16, + CMPX_EQ_I16, + CMPX_LE_I16, + CMPX_GT_I16, + CMPX_NE_I16, + CMPX_GE_I16, + CMPX_CLASS_F16, + CMP_F_I64, + CMP_LT_I64, + CMP_EQ_I64, + CMP_LE_I64, + CMP_GT_I64, + CMP_NE_I64, + CMP_GE_I64, + CMP_T_I64, + CMP_CLASS_F64, + CMP_LT_U16, + CMP_EQ_U16, + CMP_LE_U16, + CMP_GT_U16, + CMP_NE_U16, + CMP_GE_U16, + CMPX_F_I64 = 176, + CMPX_LT_I64, + CMPX_EQ_I64, + CMPX_LE_I64, + CMPX_GT_I64, + CMPX_NE_I64, + CMPX_GE_I64, + CMPX_T_I64, + CMPX_CLASS_F64, + CMPX_LT_U16, + CMPX_EQ_U16, + CMPX_LE_U16, + CMPX_GT_U16, + CMPX_NE_U16, + CMPX_GE_U16, + CMP_F_U32 = 192, + CMP_LT_U32, + CMP_EQ_U32, + CMP_LE_U32, + CMP_GT_U32, + CMP_NE_U32, + CMP_GE_U32, + CMP_T_U32, + CMP_F_F16, + CMP_LT_F16, + CMP_EQ_F16, + CMP_LE_F16, + CMP_GT_F16, + CMP_LG_F16, + CMP_GE_F16, + CMP_O_F16, + CMPX_F_U32, + CMPX_LT_U32, + CMPX_EQ_U32, + CMPX_LE_U32, + CMPX_GT_U32, + CMPX_NE_U32, + CMPX_GE_U32, + CMPX_T_U32, + CMPX_F_F16, + CMPX_LT_F16, + CMPX_EQ_F16, + CMPX_LE_F16, + CMPX_GT_F16, + CMPX_LG_F16, + CMPX_GE_F16, + CMPX_O_F16, + CMP_F_U64, + CMP_LT_U64, + CMP_EQ_U64, + CMP_LE_U64, + CMP_GT_U64, + CMP_NE_U64, + CMP_GE_U64, + CMP_T_U64, + CMP_U_F16, + CMP_NGE_F16, + CMP_NLG_F16, + CMP_NGT_F16, + CMP_NLE_F16, + CMP_NEQ_F16, + CMP_NLT_F16, + CMP_TRU_F16, + CMPX_F_U64, + CMPX_LT_U64, + CMPX_EQ_U64, + CMPX_LE_U64, + CMPX_GT_U64, + CMPX_NE_U64, + CMPX_GE_U64, + CMPX_T_U64, + CNDMASK_B32 = 256, + READLANE_B32, + WRITELANE_B32, + ADD_F32, + SUB_F32, + SUBREV_F32, + MAC_LEGACY_F32, + MUL_LEGACY_F32, + MUL_F32, + MUL_I32_I24, + MUL_HI_I32_I24, + MUL_U32_U24, + MUL_HI_U32_U24, + MIN_LEGACY_F32, + MAX_LEGACY_F32, + MIN_F32, + MAX_F32, + MIN_I32, + MAX_I32, + MIN_U32, + MAX_U32, + LSHR_B32, + LSHRREV_B32, + ASHR_I32, + ASHRREV_I32, + LSHL_B32, + LSHLREV_B32, + AND_B32, + OR_B32, + XOR_B32, + BFM_B32, + MAC_F32, + MADMK_F32, + MADAK_F32, + BCNT_U32_B32, + 
MBCNT_LO_U32_B32, + MBCNT_HI_U32_B32, + ADD_I32, + SUB_I32, + SUBREV_I32, + ADDC_U32, + SUBB_U32, + SUBBREV_U32, + LDEXP_F32, + CVT_PKACCUM_U8_F32, + CVT_PKNORM_I16_F32, + CVT_PKNORM_U16_F32, + CVT_PKRTZ_F16_F32, + CVT_PK_U16_U32, + CVT_PK_I16_I32, + MAD_LEGACY_F32 = 320, + MAD_F32, + MAD_I32_I24, + MAD_U32_U24, + CUBEID_F32, + CUBESC_F32, + CUBETC_F32, + CUBEMA_F32, + BFE_U32, + BFE_I32, + BFI_B32, + FMA_F32, + FMA_F64, + LERP_U8, + ALIGNBIT_B32, + ALIGNBYTE_B32, + MULLIT_F32, + MIN3_F32, + MIN3_I32, + MIN3_U32, + MAX3_F32, + MAX3_I32, + MAX3_U32, + MED3_F32, + MED3_I32, + MED3_U32, + SAD_U8, + SAD_HI_U8, + SAD_U16, + SAD_U32, + CVT_PK_U8_F32, + DIV_FIXUP_F32, + DIV_FIXUP_F64, + LSHL_B64, + LSHR_B64, + ASHR_I64, + ADD_F64, + MUL_F64, + MIN_F64, + MAX_F64, + LDEXP_F64, + MUL_LO_U32, + MUL_HI_U32, + MUL_LO_I32, + MUL_HI_I32, + DIV_SCALE_F32, + DIV_SCALE_F64, + DIV_FMAS_F32, + DIV_FMAS_F64, + MSAD_U8, + QSAD_U8, + MQSAD_U8, + TRIG_PREOP_F64, + MQSAD_U32_U8, + MAD_U64_U32, + MAD_I64_I32, + NOP = 384, + MOV_B32, + READFIRSTLANE_B32, + CVT_I32_F64, + CVT_F64_I32, + CVT_F32_I32, + CVT_F32_U32, + CVT_U32_F32, + CVT_I32_F32, + MOV_FED_B32, + CVT_F16_F32, + CVT_F32_F16, + CVT_RPI_I32_F32, + CVT_FLR_I32_F32, + CVT_OFF_F32_I4, + CVT_F32_F64, + CVT_F64_F32, + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + CVT_U32_F64, + CVT_F64_U32, + FRACT_F32 = 416, + TRUNC_F32, + CEIL_F32, + RNDNE_F32, + FLOOR_F32, + EXP_F32, + LOG_CLAMP_F32, + LOG_F32, + RCP_CLAMP_F32, + RCP_LEGACY_F32, + RCP_F32, + RCP_IFLAG_F32, + RSQ_CLAMP_F32, + RSQ_LEGACY_F32, + RSQ_F32, + RCP_F64, + RCP_CLAMP_F64, + RSQ_F64, + RSQ_CLAMP_F64, + SQRT_F32, + SQRT_F64, + SIN_F32, + COS_F32, + NOT_B32, + BFREV_B32, + FFBH_U32, + FFBL_B32, + FFBH_I32, + FREXP_EXP_I32_F64, + FREXP_MANT_F64, + FRACT_F64, + FREXP_EXP_I32_F32, + FREXP_MANT_F32, + CLREXCP, + MOVRELD_B32, + MOVRELS_B32, + MOVRELSD_B32, + + OpCount +}; +inline const char *getInstructionName(unsigned id) { + switch (id) { + case CMP_F_F32: + return "v_cmp_f_f32"; + case CMP_LT_F32: + return "v_cmp_lt_f32"; + case CMP_EQ_F32: + return "v_cmp_eq_f32"; + case CMP_LE_F32: + return "v_cmp_le_f32"; + case CMP_GT_F32: + return "v_cmp_gt_f32"; + case CMP_LG_F32: + return "v_cmp_lg_f32"; + case CMP_GE_F32: + return "v_cmp_ge_f32"; + case CMP_O_F32: + return "v_cmp_o_f32"; + case CMP_U_F32: + return "v_cmp_u_f32"; + case CMP_NGE_F32: + return "v_cmp_nge_f32"; + case CMP_NLG_F32: + return "v_cmp_nlg_f32"; + case CMP_NGT_F32: + return "v_cmp_ngt_f32"; + case CMP_NLE_F32: + return "v_cmp_nle_f32"; + case CMP_NEQ_F32: + return "v_cmp_neq_f32"; + case CMP_NLT_F32: + return "v_cmp_nlt_f32"; + case CMP_TRU_F32: + return "v_cmp_tru_f32"; + case CMPX_F_F32: + return "v_cmpx_f_f32"; + case CMPX_LT_F32: + return "v_cmpx_lt_f32"; + case CMPX_EQ_F32: + return "v_cmpx_eq_f32"; + case CMPX_LE_F32: + return "v_cmpx_le_f32"; + case CMPX_GT_F32: + return "v_cmpx_gt_f32"; + case CMPX_LG_F32: + return "v_cmpx_lg_f32"; + case CMPX_GE_F32: + return "v_cmpx_ge_f32"; + case CMPX_O_F32: + return "v_cmpx_o_f32"; + case CMPX_U_F32: + return "v_cmpx_u_f32"; + case CMPX_NGE_F32: + return "v_cmpx_nge_f32"; + case CMPX_NLG_F32: + return "v_cmpx_nlg_f32"; + case CMPX_NGT_F32: + return "v_cmpx_ngt_f32"; + case CMPX_NLE_F32: + return "v_cmpx_nle_f32"; + case CMPX_NEQ_F32: + return "v_cmpx_neq_f32"; + case CMPX_NLT_F32: + return "v_cmpx_nlt_f32"; + case CMPX_TRU_F32: + return "v_cmpx_tru_f32"; + case CMP_F_F64: + return "v_cmp_f_f64"; + case CMP_LT_F64: + return "v_cmp_lt_f64"; + case CMP_EQ_F64: + return 
"v_cmp_eq_f64"; + case CMP_LE_F64: + return "v_cmp_le_f64"; + case CMP_GT_F64: + return "v_cmp_gt_f64"; + case CMP_LG_F64: + return "v_cmp_lg_f64"; + case CMP_GE_F64: + return "v_cmp_ge_f64"; + case CMP_O_F64: + return "v_cmp_o_f64"; + case CMP_U_F64: + return "v_cmp_u_f64"; + case CMP_NGE_F64: + return "v_cmp_nge_f64"; + case CMP_NLG_F64: + return "v_cmp_nlg_f64"; + case CMP_NGT_F64: + return "v_cmp_ngt_f64"; + case CMP_NLE_F64: + return "v_cmp_nle_f64"; + case CMP_NEQ_F64: + return "v_cmp_neq_f64"; + case CMP_NLT_F64: + return "v_cmp_nlt_f64"; + case CMP_TRU_F64: + return "v_cmp_tru_f64"; + case CMPX_F_F64: + return "v_cmpx_f_f64"; + case CMPX_LT_F64: + return "v_cmpx_lt_f64"; + case CMPX_EQ_F64: + return "v_cmpx_eq_f64"; + case CMPX_LE_F64: + return "v_cmpx_le_f64"; + case CMPX_GT_F64: + return "v_cmpx_gt_f64"; + case CMPX_LG_F64: + return "v_cmpx_lg_f64"; + case CMPX_GE_F64: + return "v_cmpx_ge_f64"; + case CMPX_O_F64: + return "v_cmpx_o_f64"; + case CMPX_U_F64: + return "v_cmpx_u_f64"; + case CMPX_NGE_F64: + return "v_cmpx_nge_f64"; + case CMPX_NLG_F64: + return "v_cmpx_nlg_f64"; + case CMPX_NGT_F64: + return "v_cmpx_ngt_f64"; + case CMPX_NLE_F64: + return "v_cmpx_nle_f64"; + case CMPX_NEQ_F64: + return "v_cmpx_neq_f64"; + case CMPX_NLT_F64: + return "v_cmpx_nlt_f64"; + case CMPX_TRU_F64: + return "v_cmpx_tru_f64"; + case CMPS_F_F32: + return "v_cmps_f_f32"; + case CMPS_LT_F32: + return "v_cmps_lt_f32"; + case CMPS_EQ_F32: + return "v_cmps_eq_f32"; + case CMPS_LE_F32: + return "v_cmps_le_f32"; + case CMPS_GT_F32: + return "v_cmps_gt_f32"; + case CMPS_LG_F32: + return "v_cmps_lg_f32"; + case CMPS_GE_F32: + return "v_cmps_ge_f32"; + case CMPS_O_F32: + return "v_cmps_o_f32"; + case CMPS_U_F32: + return "v_cmps_u_f32"; + case CMPS_NGE_F32: + return "v_cmps_nge_f32"; + case CMPS_NLG_F32: + return "v_cmps_nlg_f32"; + case CMPS_NGT_F32: + return "v_cmps_ngt_f32"; + case CMPS_NLE_F32: + return "v_cmps_nle_f32"; + case CMPS_NEQ_F32: + return "v_cmps_neq_f32"; + case CMPS_NLT_F32: + return "v_cmps_nlt_f32"; + case CMPS_TRU_F32: + return "v_cmps_tru_f32"; + case CMPSX_F_F32: + return "v_cmpsx_f_f32"; + case CMPSX_LT_F32: + return "v_cmpsx_lt_f32"; + case CMPSX_EQ_F32: + return "v_cmpsx_eq_f32"; + case CMPSX_LE_F32: + return "v_cmpsx_le_f32"; + case CMPSX_GT_F32: + return "v_cmpsx_gt_f32"; + case CMPSX_LG_F32: + return "v_cmpsx_lg_f32"; + case CMPSX_GE_F32: + return "v_cmpsx_ge_f32"; + case CMPSX_O_F32: + return "v_cmpsx_o_f32"; + case CMPSX_U_F32: + return "v_cmpsx_u_f32"; + case CMPSX_NGE_F32: + return "v_cmpsx_nge_f32"; + case CMPSX_NLG_F32: + return "v_cmpsx_nlg_f32"; + case CMPSX_NGT_F32: + return "v_cmpsx_ngt_f32"; + case CMPSX_NLE_F32: + return "v_cmpsx_nle_f32"; + case CMPSX_NEQ_F32: + return "v_cmpsx_neq_f32"; + case CMPSX_NLT_F32: + return "v_cmpsx_nlt_f32"; + case CMPSX_TRU_F32: + return "v_cmpsx_tru_f32"; + case CMPS_F_F64: + return "v_cmps_f_f64"; + case CMPS_LT_F64: + return "v_cmps_lt_f64"; + case CMPS_EQ_F64: + return "v_cmps_eq_f64"; + case CMPS_LE_F64: + return "v_cmps_le_f64"; + case CMPS_GT_F64: + return "v_cmps_gt_f64"; + case CMPS_LG_F64: + return "v_cmps_lg_f64"; + case CMPS_GE_F64: + return "v_cmps_ge_f64"; + case CMPS_O_F64: + return "v_cmps_o_f64"; + case CMPS_U_F64: + return "v_cmps_u_f64"; + case CMPS_NGE_F64: + return "v_cmps_nge_f64"; + case CMPS_NLG_F64: + return "v_cmps_nlg_f64"; + case CMPS_NGT_F64: + return "v_cmps_ngt_f64"; + case CMPS_NLE_F64: + return "v_cmps_nle_f64"; + case CMPS_NEQ_F64: + return "v_cmps_neq_f64"; + case CMPS_NLT_F64: + return 
"v_cmps_nlt_f64"; + case CMPS_TRU_F64: + return "v_cmps_tru_f64"; + case CMPSX_F_F64: + return "v_cmpsx_f_f64"; + case CMPSX_LT_F64: + return "v_cmpsx_lt_f64"; + case CMPSX_EQ_F64: + return "v_cmpsx_eq_f64"; + case CMPSX_LE_F64: + return "v_cmpsx_le_f64"; + case CMPSX_GT_F64: + return "v_cmpsx_gt_f64"; + case CMPSX_LG_F64: + return "v_cmpsx_lg_f64"; + case CMPSX_GE_F64: + return "v_cmpsx_ge_f64"; + case CMPSX_O_F64: + return "v_cmpsx_o_f64"; + case CMPSX_U_F64: + return "v_cmpsx_u_f64"; + case CMPSX_NGE_F64: + return "v_cmpsx_nge_f64"; + case CMPSX_NLG_F64: + return "v_cmpsx_nlg_f64"; + case CMPSX_NGT_F64: + return "v_cmpsx_ngt_f64"; + case CMPSX_NLE_F64: + return "v_cmpsx_nle_f64"; + case CMPSX_NEQ_F64: + return "v_cmpsx_neq_f64"; + case CMPSX_NLT_F64: + return "v_cmpsx_nlt_f64"; + case CMPSX_TRU_F64: + return "v_cmpsx_tru_f64"; + case CMP_F_I32: + return "v_cmp_f_i32"; + case CMP_LT_I32: + return "v_cmp_lt_i32"; + case CMP_EQ_I32: + return "v_cmp_eq_i32"; + case CMP_LE_I32: + return "v_cmp_le_i32"; + case CMP_GT_I32: + return "v_cmp_gt_i32"; + case CMP_NE_I32: + return "v_cmp_ne_i32"; + case CMP_GE_I32: + return "v_cmp_ge_i32"; + case CMP_T_I32: + return "v_cmp_t_i32"; + case CMP_CLASS_F32: + return "v_cmp_class_f32"; + case CMP_LT_I16: + return "v_cmp_lt_i16"; + case CMP_EQ_I16: + return "v_cmp_eq_i16"; + case CMP_LE_I16: + return "v_cmp_le_i16"; + case CMP_GT_I16: + return "v_cmp_gt_i16"; + case CMP_NE_I16: + return "v_cmp_ne_i16"; + case CMP_GE_I16: + return "v_cmp_ge_i16"; + case CMP_CLASS_F16: + return "v_cmp_class_f16"; + case CMPX_F_I32: + return "v_cmpx_f_i32"; + case CMPX_LT_I32: + return "v_cmpx_lt_i32"; + case CMPX_EQ_I32: + return "v_cmpx_eq_i32"; + case CMPX_LE_I32: + return "v_cmpx_le_i32"; + case CMPX_GT_I32: + return "v_cmpx_gt_i32"; + case CMPX_NE_I32: + return "v_cmpx_ne_i32"; + case CMPX_GE_I32: + return "v_cmpx_ge_i32"; + case CMPX_T_I32: + return "v_cmpx_t_i32"; + case CMPX_CLASS_F32: + return "v_cmpx_class_f32"; + case CMPX_LT_I16: + return "v_cmpx_lt_i16"; + case CMPX_EQ_I16: + return "v_cmpx_eq_i16"; + case CMPX_LE_I16: + return "v_cmpx_le_i16"; + case CMPX_GT_I16: + return "v_cmpx_gt_i16"; + case CMPX_NE_I16: + return "v_cmpx_ne_i16"; + case CMPX_GE_I16: + return "v_cmpx_ge_i16"; + case CMPX_CLASS_F16: + return "v_cmpx_class_f16"; + case CMP_F_I64: + return "v_cmp_f_i64"; + case CMP_LT_I64: + return "v_cmp_lt_i64"; + case CMP_EQ_I64: + return "v_cmp_eq_i64"; + case CMP_LE_I64: + return "v_cmp_le_i64"; + case CMP_GT_I64: + return "v_cmp_gt_i64"; + case CMP_NE_I64: + return "v_cmp_ne_i64"; + case CMP_GE_I64: + return "v_cmp_ge_i64"; + case CMP_T_I64: + return "v_cmp_t_i64"; + case CMP_CLASS_F64: + return "v_cmp_class_f64"; + case CMP_LT_U16: + return "v_cmp_lt_u16"; + case CMP_EQ_U16: + return "v_cmp_eq_u16"; + case CMP_LE_U16: + return "v_cmp_le_u16"; + case CMP_GT_U16: + return "v_cmp_gt_u16"; + case CMP_NE_U16: + return "v_cmp_ne_u16"; + case CMP_GE_U16: + return "v_cmp_ge_u16"; + case CMPX_F_I64: + return "v_cmpx_f_i64"; + case CMPX_LT_I64: + return "v_cmpx_lt_i64"; + case CMPX_EQ_I64: + return "v_cmpx_eq_i64"; + case CMPX_LE_I64: + return "v_cmpx_le_i64"; + case CMPX_GT_I64: + return "v_cmpx_gt_i64"; + case CMPX_NE_I64: + return "v_cmpx_ne_i64"; + case CMPX_GE_I64: + return "v_cmpx_ge_i64"; + case CMPX_T_I64: + return "v_cmpx_t_i64"; + case CMPX_CLASS_F64: + return "v_cmpx_class_f64"; + case CMPX_LT_U16: + return "v_cmpx_lt_u16"; + case CMPX_EQ_U16: + return "v_cmpx_eq_u16"; + case CMPX_LE_U16: + return "v_cmpx_le_u16"; + case CMPX_GT_U16: + return 
"v_cmpx_gt_u16"; + case CMPX_NE_U16: + return "v_cmpx_ne_u16"; + case CMPX_GE_U16: + return "v_cmpx_ge_u16"; + case CMP_F_U32: + return "v_cmp_f_u32"; + case CMP_LT_U32: + return "v_cmp_lt_u32"; + case CMP_EQ_U32: + return "v_cmp_eq_u32"; + case CMP_LE_U32: + return "v_cmp_le_u32"; + case CMP_GT_U32: + return "v_cmp_gt_u32"; + case CMP_NE_U32: + return "v_cmp_ne_u32"; + case CMP_GE_U32: + return "v_cmp_ge_u32"; + case CMP_T_U32: + return "v_cmp_t_u32"; + case CMP_F_F16: + return "v_cmp_f_f16"; + case CMP_LT_F16: + return "v_cmp_lt_f16"; + case CMP_EQ_F16: + return "v_cmp_eq_f16"; + case CMP_LE_F16: + return "v_cmp_le_f16"; + case CMP_GT_F16: + return "v_cmp_gt_f16"; + case CMP_LG_F16: + return "v_cmp_lg_f16"; + case CMP_GE_F16: + return "v_cmp_ge_f16"; + case CMP_O_F16: + return "v_cmp_o_f16"; + case CMPX_F_U32: + return "v_cmpx_f_u32"; + case CMPX_LT_U32: + return "v_cmpx_lt_u32"; + case CMPX_EQ_U32: + return "v_cmpx_eq_u32"; + case CMPX_LE_U32: + return "v_cmpx_le_u32"; + case CMPX_GT_U32: + return "v_cmpx_gt_u32"; + case CMPX_NE_U32: + return "v_cmpx_ne_u32"; + case CMPX_GE_U32: + return "v_cmpx_ge_u32"; + case CMPX_T_U32: + return "v_cmpx_t_u32"; + case CMPX_F_F16: + return "v_cmpx_f_f16"; + case CMPX_LT_F16: + return "v_cmpx_lt_f16"; + case CMPX_EQ_F16: + return "v_cmpx_eq_f16"; + case CMPX_LE_F16: + return "v_cmpx_le_f16"; + case CMPX_GT_F16: + return "v_cmpx_gt_f16"; + case CMPX_LG_F16: + return "v_cmpx_lg_f16"; + case CMPX_GE_F16: + return "v_cmpx_ge_f16"; + case CMPX_O_F16: + return "v_cmpx_o_f16"; + case CMP_F_U64: + return "v_cmp_f_u64"; + case CMP_LT_U64: + return "v_cmp_lt_u64"; + case CMP_EQ_U64: + return "v_cmp_eq_u64"; + case CMP_LE_U64: + return "v_cmp_le_u64"; + case CMP_GT_U64: + return "v_cmp_gt_u64"; + case CMP_NE_U64: + return "v_cmp_ne_u64"; + case CMP_GE_U64: + return "v_cmp_ge_u64"; + case CMP_T_U64: + return "v_cmp_t_u64"; + case CMP_U_F16: + return "v_cmp_u_f16"; + case CMP_NGE_F16: + return "v_cmp_nge_f16"; + case CMP_NLG_F16: + return "v_cmp_nlg_f16"; + case CMP_NGT_F16: + return "v_cmp_ngt_f16"; + case CMP_NLE_F16: + return "v_cmp_nle_f16"; + case CMP_NEQ_F16: + return "v_cmp_neq_f16"; + case CMP_NLT_F16: + return "v_cmp_nlt_f16"; + case CMP_TRU_F16: + return "v_cmp_tru_f16"; + case CMPX_F_U64: + return "v_cmpx_f_u64"; + case CMPX_LT_U64: + return "v_cmpx_lt_u64"; + case CMPX_EQ_U64: + return "v_cmpx_eq_u64"; + case CMPX_LE_U64: + return "v_cmpx_le_u64"; + case CMPX_GT_U64: + return "v_cmpx_gt_u64"; + case CMPX_NE_U64: + return "v_cmpx_ne_u64"; + case CMPX_GE_U64: + return "v_cmpx_ge_u64"; + case CMPX_T_U64: + return "v_cmpx_t_u64"; + case CNDMASK_B32: + return "v_cndmask_b32"; + case READLANE_B32: + return "v_readlane_b32"; + case WRITELANE_B32: + return "v_writelane_b32"; + case ADD_F32: + return "v_add_f32"; + case SUB_F32: + return "v_sub_f32"; + case SUBREV_F32: + return "v_subrev_f32"; + case MAC_LEGACY_F32: + return "v_mac_legacy_f32"; + case MUL_LEGACY_F32: + return "v_mul_legacy_f32"; + case MUL_F32: + return "v_mul_f32"; + case MUL_I32_I24: + return "v_mul_i32_i24"; + case MUL_HI_I32_I24: + return "v_mul_hi_i32_i24"; + case MUL_U32_U24: + return "v_mul_u32_u24"; + case MUL_HI_U32_U24: + return "v_mul_hi_u32_u24"; + case MIN_LEGACY_F32: + return "v_min_legacy_f32"; + case MAX_LEGACY_F32: + return "v_max_legacy_f32"; + case MIN_F32: + return "v_min_f32"; + case MAX_F32: + return "v_max_f32"; + case MIN_I32: + return "v_min_i32"; + case MAX_I32: + return "v_max_i32"; + case MIN_U32: + return "v_min_u32"; + case MAX_U32: + return "v_max_u32"; + case 
LSHR_B32: + return "v_lshr_b32"; + case LSHRREV_B32: + return "v_lshrrev_b32"; + case ASHR_I32: + return "v_ashr_i32"; + case ASHRREV_I32: + return "v_ashrrev_i32"; + case LSHL_B32: + return "v_lshl_b32"; + case LSHLREV_B32: + return "v_lshlrev_b32"; + case AND_B32: + return "v_and_b32"; + case OR_B32: + return "v_or_b32"; + case XOR_B32: + return "v_xor_b32"; + case BFM_B32: + return "v_bfm_b32"; + case MAC_F32: + return "v_mac_f32"; + case MADMK_F32: + return "v_madmk_f32"; + case MADAK_F32: + return "v_madak_f32"; + case BCNT_U32_B32: + return "v_bcnt_u32_b32"; + case MBCNT_LO_U32_B32: + return "v_mbcnt_lo_u32_b32"; + case MBCNT_HI_U32_B32: + return "v_mbcnt_hi_u32_b32"; + case ADD_I32: + return "v_add_i32"; + case SUB_I32: + return "v_sub_i32"; + case SUBREV_I32: + return "v_subrev_i32"; + case ADDC_U32: + return "v_addc_u32"; + case SUBB_U32: + return "v_subb_u32"; + case SUBBREV_U32: + return "v_subbrev_u32"; + case LDEXP_F32: + return "v_ldexp_f32"; + case CVT_PKACCUM_U8_F32: + return "v_cvt_pkaccum_u8_f32"; + case CVT_PKNORM_I16_F32: + return "v_cvt_pknorm_i16_f32"; + case CVT_PKNORM_U16_F32: + return "v_cvt_pknorm_u16_f32"; + case CVT_PKRTZ_F16_F32: + return "v_cvt_pkrtz_f16_f32"; + case CVT_PK_U16_U32: + return "v_cvt_pk_u16_u32"; + case CVT_PK_I16_I32: + return "v_cvt_pk_i16_i32"; + case MAD_LEGACY_F32: + return "v_mad_legacy_f32"; + case MAD_F32: + return "v_mad_f32"; + case MAD_I32_I24: + return "v_mad_i32_i24"; + case MAD_U32_U24: + return "v_mad_u32_u24"; + case CUBEID_F32: + return "v_cubeid_f32"; + case CUBESC_F32: + return "v_cubesc_f32"; + case CUBETC_F32: + return "v_cubetc_f32"; + case CUBEMA_F32: + return "v_cubema_f32"; + case BFE_U32: + return "v_bfe_u32"; + case BFE_I32: + return "v_bfe_i32"; + case BFI_B32: + return "v_bfi_b32"; + case FMA_F32: + return "v_fma_f32"; + case FMA_F64: + return "v_fma_f64"; + case LERP_U8: + return "v_lerp_u8"; + case ALIGNBIT_B32: + return "v_alignbit_b32"; + case ALIGNBYTE_B32: + return "v_alignbyte_b32"; + case MULLIT_F32: + return "v_mullit_f32"; + case MIN3_F32: + return "v_min3_f32"; + case MIN3_I32: + return "v_min3_i32"; + case MIN3_U32: + return "v_min3_u32"; + case MAX3_F32: + return "v_max3_f32"; + case MAX3_I32: + return "v_max3_i32"; + case MAX3_U32: + return "v_max3_u32"; + case MED3_F32: + return "v_med3_f32"; + case MED3_I32: + return "v_med3_i32"; + case MED3_U32: + return "v_med3_u32"; + case SAD_U8: + return "v_sad_u8"; + case SAD_HI_U8: + return "v_sad_hi_u8"; + case SAD_U16: + return "v_sad_u16"; + case SAD_U32: + return "v_sad_u32"; + case CVT_PK_U8_F32: + return "v_cvt_pk_u8_f32"; + case DIV_FIXUP_F32: + return "v_div_fixup_f32"; + case DIV_FIXUP_F64: + return "v_div_fixup_f64"; + case LSHL_B64: + return "v_lshl_b64"; + case LSHR_B64: + return "v_lshr_b64"; + case ASHR_I64: + return "v_ashr_i64"; + case ADD_F64: + return "v_add_f64"; + case MUL_F64: + return "v_mul_f64"; + case MIN_F64: + return "v_min_f64"; + case MAX_F64: + return "v_max_f64"; + case LDEXP_F64: + return "v_ldexp_f64"; + case MUL_LO_U32: + return "v_mul_lo_u32"; + case MUL_HI_U32: + return "v_mul_hi_u32"; + case MUL_LO_I32: + return "v_mul_lo_i32"; + case MUL_HI_I32: + return "v_mul_hi_i32"; + case DIV_SCALE_F32: + return "v_div_scale_f32"; + case DIV_SCALE_F64: + return "v_div_scale_f64"; + case DIV_FMAS_F32: + return "v_div_fmas_f32"; + case DIV_FMAS_F64: + return "v_div_fmas_f64"; + case MSAD_U8: + return "v_msad_u8"; + case QSAD_U8: + return "v_qsad_u8"; + case MQSAD_U8: + return "v_mqsad_u8"; + case TRIG_PREOP_F64: + return 
"v_trig_preop_f64"; + case MQSAD_U32_U8: + return "v_mqsad_u32_u8"; + case MAD_U64_U32: + return "v_mad_u64_u32"; + case MAD_I64_I32: + return "v_mad_i64_i32"; + case NOP: + return "v_nop"; + case MOV_B32: + return "v_mov_b32"; + case READFIRSTLANE_B32: + return "v_readfirstlane_b32"; + case CVT_I32_F64: + return "v_cvt_i32_f64"; + case CVT_F64_I32: + return "v_cvt_f64_i32"; + case CVT_F32_I32: + return "v_cvt_f32_i32"; + case CVT_F32_U32: + return "v_cvt_f32_u32"; + case CVT_U32_F32: + return "v_cvt_u32_f32"; + case CVT_I32_F32: + return "v_cvt_i32_f32"; + case MOV_FED_B32: + return "v_mov_fed_b32"; + case CVT_F16_F32: + return "v_cvt_f16_f32"; + case CVT_F32_F16: + return "v_cvt_f32_f16"; + case CVT_RPI_I32_F32: + return "v_cvt_rpi_i32_f32"; + case CVT_FLR_I32_F32: + return "v_cvt_flr_i32_f32"; + case CVT_OFF_F32_I4: + return "v_cvt_off_f32_i4"; + case CVT_F32_F64: + return "v_cvt_f32_f64"; + case CVT_F64_F32: + return "v_cvt_f64_f32"; + case CVT_F32_UBYTE0: + return "v_cvt_f32_ubyte0"; + case CVT_F32_UBYTE1: + return "v_cvt_f32_ubyte1"; + case CVT_F32_UBYTE2: + return "v_cvt_f32_ubyte2"; + case CVT_F32_UBYTE3: + return "v_cvt_f32_ubyte3"; + case CVT_U32_F64: + return "v_cvt_u32_f64"; + case CVT_F64_U32: + return "v_cvt_f64_u32"; + case FRACT_F32: + return "v_fract_f32"; + case TRUNC_F32: + return "v_trunc_f32"; + case CEIL_F32: + return "v_ceil_f32"; + case RNDNE_F32: + return "v_rndne_f32"; + case FLOOR_F32: + return "v_floor_f32"; + case EXP_F32: + return "v_exp_f32"; + case LOG_CLAMP_F32: + return "v_log_clamp_f32"; + case LOG_F32: + return "v_log_f32"; + case RCP_CLAMP_F32: + return "v_rcp_clamp_f32"; + case RCP_LEGACY_F32: + return "v_rcp_legacy_f32"; + case RCP_F32: + return "v_rcp_f32"; + case RCP_IFLAG_F32: + return "v_rcp_iflag_f32"; + case RSQ_CLAMP_F32: + return "v_rsq_clamp_f32"; + case RSQ_LEGACY_F32: + return "v_rsq_legacy_f32"; + case RSQ_F32: + return "v_rsq_f32"; + case RCP_F64: + return "v_rcp_f64"; + case RCP_CLAMP_F64: + return "v_rcp_clamp_f64"; + case RSQ_F64: + return "v_rsq_f64"; + case RSQ_CLAMP_F64: + return "v_rsq_clamp_f64"; + case SQRT_F32: + return "v_sqrt_f32"; + case SQRT_F64: + return "v_sqrt_f64"; + case SIN_F32: + return "v_sin_f32"; + case COS_F32: + return "v_cos_f32"; + case NOT_B32: + return "v_not_b32"; + case BFREV_B32: + return "v_bfrev_b32"; + case FFBH_U32: + return "v_ffbh_u32"; + case FFBL_B32: + return "v_ffbl_b32"; + case FFBH_I32: + return "v_ffbh_i32"; + case FREXP_EXP_I32_F64: + return "v_frexp_exp_i32_f64"; + case FREXP_MANT_F64: + return "v_frexp_mant_f64"; + case FRACT_F64: + return "v_fract_f64"; + case FREXP_EXP_I32_F32: + return "v_frexp_exp_i32_f32"; + case FREXP_MANT_F32: + return "v_frexp_mant_f32"; + case CLREXCP: + return "v_clrexcp"; + case MOVRELD_B32: + return "v_movreld_b32"; + case MOVRELS_B32: + return "v_movrels_b32"; + case MOVRELSD_B32: + return "v_movrelsd_b32"; + } + return nullptr; +} +} diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vopc.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vopc.hpp new file mode 100644 index 00000000..49cab95d --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/dialect/vopc.hpp @@ -0,0 +1,522 @@ +#pragma once + +namespace shader::ir::vopc { +enum Op { + CMP_F_F32, + CMP_LT_F32, + CMP_EQ_F32, + CMP_LE_F32, + CMP_GT_F32, + CMP_LG_F32, + CMP_GE_F32, + CMP_O_F32, + CMP_U_F32, + CMP_NGE_F32, + CMP_NLG_F32, + CMP_NGT_F32, + CMP_NLE_F32, + CMP_NEQ_F32, + CMP_NLT_F32, + CMP_TRU_F32, + CMPX_F_F32, + CMPX_LT_F32, + CMPX_EQ_F32, + CMPX_LE_F32, + CMPX_GT_F32, + 
CMPX_LG_F32, + CMPX_GE_F32, + CMPX_O_F32, + CMPX_U_F32, + CMPX_NGE_F32, + CMPX_NLG_F32, + CMPX_NGT_F32, + CMPX_NLE_F32, + CMPX_NEQ_F32, + CMPX_NLT_F32, + CMPX_TRU_F32, + CMP_F_F64, + CMP_LT_F64, + CMP_EQ_F64, + CMP_LE_F64, + CMP_GT_F64, + CMP_LG_F64, + CMP_GE_F64, + CMP_O_F64, + CMP_U_F64, + CMP_NGE_F64, + CMP_NLG_F64, + CMP_NGT_F64, + CMP_NLE_F64, + CMP_NEQ_F64, + CMP_NLT_F64, + CMP_TRU_F64, + CMPX_F_F64, + CMPX_LT_F64, + CMPX_EQ_F64, + CMPX_LE_F64, + CMPX_GT_F64, + CMPX_LG_F64, + CMPX_GE_F64, + CMPX_O_F64, + CMPX_U_F64, + CMPX_NGE_F64, + CMPX_NLG_F64, + CMPX_NGT_F64, + CMPX_NLE_F64, + CMPX_NEQ_F64, + CMPX_NLT_F64, + CMPX_TRU_F64, + CMPS_F_F32, + CMPS_LT_F32, + CMPS_EQ_F32, + CMPS_LE_F32, + CMPS_GT_F32, + CMPS_LG_F32, + CMPS_GE_F32, + CMPS_O_F32, + CMPS_U_F32, + CMPS_NGE_F32, + CMPS_NLG_F32, + CMPS_NGT_F32, + CMPS_NLE_F32, + CMPS_NEQ_F32, + CMPS_NLT_F32, + CMPS_TRU_F32, + CMPSX_F_F32, + CMPSX_LT_F32, + CMPSX_EQ_F32, + CMPSX_LE_F32, + CMPSX_GT_F32, + CMPSX_LG_F32, + CMPSX_GE_F32, + CMPSX_O_F32, + CMPSX_U_F32, + CMPSX_NGE_F32, + CMPSX_NLG_F32, + CMPSX_NGT_F32, + CMPSX_NLE_F32, + CMPSX_NEQ_F32, + CMPSX_NLT_F32, + CMPSX_TRU_F32, + CMPS_F_F64, + CMPS_LT_F64, + CMPS_EQ_F64, + CMPS_LE_F64, + CMPS_GT_F64, + CMPS_LG_F64, + CMPS_GE_F64, + CMPS_O_F64, + CMPS_U_F64, + CMPS_NGE_F64, + CMPS_NLG_F64, + CMPS_NGT_F64, + CMPS_NLE_F64, + CMPS_NEQ_F64, + CMPS_NLT_F64, + CMPS_TRU_F64, + CMPSX_F_F64, + CMPSX_LT_F64, + CMPSX_EQ_F64, + CMPSX_LE_F64, + CMPSX_GT_F64, + CMPSX_LG_F64, + CMPSX_GE_F64, + CMPSX_O_F64, + CMPSX_U_F64, + CMPSX_NGE_F64, + CMPSX_NLG_F64, + CMPSX_NGT_F64, + CMPSX_NLE_F64, + CMPSX_NEQ_F64, + CMPSX_NLT_F64, + CMPSX_TRU_F64, + CMP_F_I32, + CMP_LT_I32, + CMP_EQ_I32, + CMP_LE_I32, + CMP_GT_I32, + CMP_NE_I32, + CMP_GE_I32, + CMP_T_I32, + CMP_CLASS_F32, + CMP_LT_I16, + CMP_EQ_I16, + CMP_LE_I16, + CMP_GT_I16, + CMP_NE_I16, + CMP_GE_I16, + CMP_CLASS_F16, + CMPX_F_I32, + CMPX_LT_I32, + CMPX_EQ_I32, + CMPX_LE_I32, + CMPX_GT_I32, + CMPX_NE_I32, + CMPX_GE_I32, + CMPX_T_I32, + CMPX_CLASS_F32, + CMPX_LT_I16, + CMPX_EQ_I16, + CMPX_LE_I16, + CMPX_GT_I16, + CMPX_NE_I16, + CMPX_GE_I16, + CMPX_CLASS_F16, + CMP_F_I64, + CMP_LT_I64, + CMP_EQ_I64, + CMP_LE_I64, + CMP_GT_I64, + CMP_NE_I64, + CMP_GE_I64, + CMP_T_I64, + CMP_CLASS_F64, + CMP_LT_U16, + CMP_EQ_U16, + CMP_LE_U16, + CMP_GT_U16, + CMP_NE_U16, + CMP_GE_U16, + CMPX_F_I64 = 176, + CMPX_LT_I64, + CMPX_EQ_I64, + CMPX_LE_I64, + CMPX_GT_I64, + CMPX_NE_I64, + CMPX_GE_I64, + CMPX_T_I64, + CMPX_CLASS_F64, + CMPX_LT_U16, + CMPX_EQ_U16, + CMPX_LE_U16, + CMPX_GT_U16, + CMPX_NE_U16, + CMPX_GE_U16, + CMP_F_U32 = 192, + CMP_LT_U32, + CMP_EQ_U32, + CMP_LE_U32, + CMP_GT_U32, + CMP_NE_U32, + CMP_GE_U32, + CMP_T_U32, + CMP_F_F16, + CMP_LT_F16, + CMP_EQ_F16, + CMP_LE_F16, + CMP_GT_F16, + CMP_LG_F16, + CMP_GE_F16, + CMP_O_F16, + CMPX_F_U32, + CMPX_LT_U32, + CMPX_EQ_U32, + CMPX_LE_U32, + CMPX_GT_U32, + CMPX_NE_U32, + CMPX_GE_U32, + CMPX_T_U32, + CMPX_F_F16, + CMPX_LT_F16, + CMPX_EQ_F16, + CMPX_LE_F16, + CMPX_GT_F16, + CMPX_LG_F16, + CMPX_GE_F16, + CMPX_O_F16, + CMP_F_U64, + CMP_LT_U64, + CMP_EQ_U64, + CMP_LE_U64, + CMP_GT_U64, + CMP_NE_U64, + CMP_GE_U64, + CMP_T_U64, + CMP_U_F16, + CMP_NGE_F16, + CMP_NLG_F16, + CMP_NGT_F16, + CMP_NLE_F16, + CMP_NEQ_F16, + CMP_NLT_F16, + CMP_TRU_F16, + CMPX_F_U64, + CMPX_LT_U64, + CMPX_EQ_U64, + CMPX_LE_U64, + CMPX_GT_U64, + CMPX_NE_U64, + CMPX_GE_U64, + CMPX_T_U64, + CMPX_U_F16, + CMPX_NGE_F16, + CMPX_NLG_F16, + CMPX_NGT_F16, + CMPX_NLE_F16, + CMPX_NEQ_F16, + CMPX_NLT_F16, + CMPX_TRU_F16, + + OpCount +}; + +inline const char 
*getInstructionName(unsigned id) { + switch (id) { + case CMP_F_F32: return "v_cmp_f_f32"; + case CMP_LT_F32: return "v_cmp_lt_f32"; + case CMP_EQ_F32: return "v_cmp_eq_f32"; + case CMP_LE_F32: return "v_cmp_le_f32"; + case CMP_GT_F32: return "v_cmp_gt_f32"; + case CMP_LG_F32: return "v_cmp_lg_f32"; + case CMP_GE_F32: return "v_cmp_ge_f32"; + case CMP_O_F32: return "v_cmp_o_f32"; + case CMP_U_F32: return "v_cmp_u_f32"; + case CMP_NGE_F32: return "v_cmp_nge_f32"; + case CMP_NLG_F32: return "v_cmp_nlg_f32"; + case CMP_NGT_F32: return "v_cmp_ngt_f32"; + case CMP_NLE_F32: return "v_cmp_nle_f32"; + case CMP_NEQ_F32: return "v_cmp_neq_f32"; + case CMP_NLT_F32: return "v_cmp_nlt_f32"; + case CMP_TRU_F32: return "v_cmp_tru_f32"; + case CMPX_F_F32: return "v_cmpx_f_f32"; + case CMPX_LT_F32: return "v_cmpx_lt_f32"; + case CMPX_EQ_F32: return "v_cmpx_eq_f32"; + case CMPX_LE_F32: return "v_cmpx_le_f32"; + case CMPX_GT_F32: return "v_cmpx_gt_f32"; + case CMPX_LG_F32: return "v_cmpx_lg_f32"; + case CMPX_GE_F32: return "v_cmpx_ge_f32"; + case CMPX_O_F32: return "v_cmpx_o_f32"; + case CMPX_U_F32: return "v_cmpx_u_f32"; + case CMPX_NGE_F32: return "v_cmpx_nge_f32"; + case CMPX_NLG_F32: return "v_cmpx_nlg_f32"; + case CMPX_NGT_F32: return "v_cmpx_ngt_f32"; + case CMPX_NLE_F32: return "v_cmpx_nle_f32"; + case CMPX_NEQ_F32: return "v_cmpx_neq_f32"; + case CMPX_NLT_F32: return "v_cmpx_nlt_f32"; + case CMPX_TRU_F32: return "v_cmpx_tru_f32"; + case CMP_F_F64: return "v_cmp_f_f64"; + case CMP_LT_F64: return "v_cmp_lt_f64"; + case CMP_EQ_F64: return "v_cmp_eq_f64"; + case CMP_LE_F64: return "v_cmp_le_f64"; + case CMP_GT_F64: return "v_cmp_gt_f64"; + case CMP_LG_F64: return "v_cmp_lg_f64"; + case CMP_GE_F64: return "v_cmp_ge_f64"; + case CMP_O_F64: return "v_cmp_o_f64"; + case CMP_U_F64: return "v_cmp_u_f64"; + case CMP_NGE_F64: return "v_cmp_nge_f64"; + case CMP_NLG_F64: return "v_cmp_nlg_f64"; + case CMP_NGT_F64: return "v_cmp_ngt_f64"; + case CMP_NLE_F64: return "v_cmp_nle_f64"; + case CMP_NEQ_F64: return "v_cmp_neq_f64"; + case CMP_NLT_F64: return "v_cmp_nlt_f64"; + case CMP_TRU_F64: return "v_cmp_tru_f64"; + case CMPX_F_F64: return "v_cmpx_f_f64"; + case CMPX_LT_F64: return "v_cmpx_lt_f64"; + case CMPX_EQ_F64: return "v_cmpx_eq_f64"; + case CMPX_LE_F64: return "v_cmpx_le_f64"; + case CMPX_GT_F64: return "v_cmpx_gt_f64"; + case CMPX_LG_F64: return "v_cmpx_lg_f64"; + case CMPX_GE_F64: return "v_cmpx_ge_f64"; + case CMPX_O_F64: return "v_cmpx_o_f64"; + case CMPX_U_F64: return "v_cmpx_u_f64"; + case CMPX_NGE_F64: return "v_cmpx_nge_f64"; + case CMPX_NLG_F64: return "v_cmpx_nlg_f64"; + case CMPX_NGT_F64: return "v_cmpx_ngt_f64"; + case CMPX_NLE_F64: return "v_cmpx_nle_f64"; + case CMPX_NEQ_F64: return "v_cmpx_neq_f64"; + case CMPX_NLT_F64: return "v_cmpx_nlt_f64"; + case CMPX_TRU_F64: return "v_cmpx_tru_f64"; + case CMPS_F_F32: return "v_cmps_f_f32"; + case CMPS_LT_F32: return "v_cmps_lt_f32"; + case CMPS_EQ_F32: return "v_cmps_eq_f32"; + case CMPS_LE_F32: return "v_cmps_le_f32"; + case CMPS_GT_F32: return "v_cmps_gt_f32"; + case CMPS_LG_F32: return "v_cmps_lg_f32"; + case CMPS_GE_F32: return "v_cmps_ge_f32"; + case CMPS_O_F32: return "v_cmps_o_f32"; + case CMPS_U_F32: return "v_cmps_u_f32"; + case CMPS_NGE_F32: return "v_cmps_nge_f32"; + case CMPS_NLG_F32: return "v_cmps_nlg_f32"; + case CMPS_NGT_F32: return "v_cmps_ngt_f32"; + case CMPS_NLE_F32: return "v_cmps_nle_f32"; + case CMPS_NEQ_F32: return "v_cmps_neq_f32"; + case CMPS_NLT_F32: return "v_cmps_nlt_f32"; + case CMPS_TRU_F32: return "v_cmps_tru_f32"; + case 
CMPSX_F_F32: return "v_cmpsx_f_f32"; + case CMPSX_LT_F32: return "v_cmpsx_lt_f32"; + case CMPSX_EQ_F32: return "v_cmpsx_eq_f32"; + case CMPSX_LE_F32: return "v_cmpsx_le_f32"; + case CMPSX_GT_F32: return "v_cmpsx_gt_f32"; + case CMPSX_LG_F32: return "v_cmpsx_lg_f32"; + case CMPSX_GE_F32: return "v_cmpsx_ge_f32"; + case CMPSX_O_F32: return "v_cmpsx_o_f32"; + case CMPSX_U_F32: return "v_cmpsx_u_f32"; + case CMPSX_NGE_F32: return "v_cmpsx_nge_f32"; + case CMPSX_NLG_F32: return "v_cmpsx_nlg_f32"; + case CMPSX_NGT_F32: return "v_cmpsx_ngt_f32"; + case CMPSX_NLE_F32: return "v_cmpsx_nle_f32"; + case CMPSX_NEQ_F32: return "v_cmpsx_neq_f32"; + case CMPSX_NLT_F32: return "v_cmpsx_nlt_f32"; + case CMPSX_TRU_F32: return "v_cmpsx_tru_f32"; + case CMPS_F_F64: return "v_cmps_f_f64"; + case CMPS_LT_F64: return "v_cmps_lt_f64"; + case CMPS_EQ_F64: return "v_cmps_eq_f64"; + case CMPS_LE_F64: return "v_cmps_le_f64"; + case CMPS_GT_F64: return "v_cmps_gt_f64"; + case CMPS_LG_F64: return "v_cmps_lg_f64"; + case CMPS_GE_F64: return "v_cmps_ge_f64"; + case CMPS_O_F64: return "v_cmps_o_f64"; + case CMPS_U_F64: return "v_cmps_u_f64"; + case CMPS_NGE_F64: return "v_cmps_nge_f64"; + case CMPS_NLG_F64: return "v_cmps_nlg_f64"; + case CMPS_NGT_F64: return "v_cmps_ngt_f64"; + case CMPS_NLE_F64: return "v_cmps_nle_f64"; + case CMPS_NEQ_F64: return "v_cmps_neq_f64"; + case CMPS_NLT_F64: return "v_cmps_nlt_f64"; + case CMPS_TRU_F64: return "v_cmps_tru_f64"; + case CMPSX_F_F64: return "v_cmpsx_f_f64"; + case CMPSX_LT_F64: return "v_cmpsx_lt_f64"; + case CMPSX_EQ_F64: return "v_cmpsx_eq_f64"; + case CMPSX_LE_F64: return "v_cmpsx_le_f64"; + case CMPSX_GT_F64: return "v_cmpsx_gt_f64"; + case CMPSX_LG_F64: return "v_cmpsx_lg_f64"; + case CMPSX_GE_F64: return "v_cmpsx_ge_f64"; + case CMPSX_O_F64: return "v_cmpsx_o_f64"; + case CMPSX_U_F64: return "v_cmpsx_u_f64"; + case CMPSX_NGE_F64: return "v_cmpsx_nge_f64"; + case CMPSX_NLG_F64: return "v_cmpsx_nlg_f64"; + case CMPSX_NGT_F64: return "v_cmpsx_ngt_f64"; + case CMPSX_NLE_F64: return "v_cmpsx_nle_f64"; + case CMPSX_NEQ_F64: return "v_cmpsx_neq_f64"; + case CMPSX_NLT_F64: return "v_cmpsx_nlt_f64"; + case CMPSX_TRU_F64: return "v_cmpsx_tru_f64"; + case CMP_F_I32: return "v_cmp_f_i32"; + case CMP_LT_I32: return "v_cmp_lt_i32"; + case CMP_EQ_I32: return "v_cmp_eq_i32"; + case CMP_LE_I32: return "v_cmp_le_i32"; + case CMP_GT_I32: return "v_cmp_gt_i32"; + case CMP_NE_I32: return "v_cmp_ne_i32"; + case CMP_GE_I32: return "v_cmp_ge_i32"; + case CMP_T_I32: return "v_cmp_t_i32"; + case CMP_CLASS_F32: return "v_cmp_class_f32"; + case CMP_LT_I16: return "v_cmp_lt_i16"; + case CMP_EQ_I16: return "v_cmp_eq_i16"; + case CMP_LE_I16: return "v_cmp_le_i16"; + case CMP_GT_I16: return "v_cmp_gt_i16"; + case CMP_NE_I16: return "v_cmp_ne_i16"; + case CMP_GE_I16: return "v_cmp_ge_i16"; + case CMP_CLASS_F16: return "v_cmp_class_f16"; + case CMPX_F_I32: return "v_cmpx_f_i32"; + case CMPX_LT_I32: return "v_cmpx_lt_i32"; + case CMPX_EQ_I32: return "v_cmpx_eq_i32"; + case CMPX_LE_I32: return "v_cmpx_le_i32"; + case CMPX_GT_I32: return "v_cmpx_gt_i32"; + case CMPX_NE_I32: return "v_cmpx_ne_i32"; + case CMPX_GE_I32: return "v_cmpx_ge_i32"; + case CMPX_T_I32: return "v_cmpx_t_i32"; + case CMPX_CLASS_F32: return "v_cmpx_class_f32"; + case CMPX_LT_I16: return "v_cmpx_lt_i16"; + case CMPX_EQ_I16: return "v_cmpx_eq_i16"; + case CMPX_LE_I16: return "v_cmpx_le_i16"; + case CMPX_GT_I16: return "v_cmpx_gt_i16"; + case CMPX_NE_I16: return "v_cmpx_ne_i16"; + case CMPX_GE_I16: return "v_cmpx_ge_i16"; + case 
CMPX_CLASS_F16: return "v_cmpx_class_f16"; + case CMP_F_I64: return "v_cmp_f_i64"; + case CMP_LT_I64: return "v_cmp_lt_i64"; + case CMP_EQ_I64: return "v_cmp_eq_i64"; + case CMP_LE_I64: return "v_cmp_le_i64"; + case CMP_GT_I64: return "v_cmp_gt_i64"; + case CMP_NE_I64: return "v_cmp_ne_i64"; + case CMP_GE_I64: return "v_cmp_ge_i64"; + case CMP_T_I64: return "v_cmp_t_i64"; + case CMP_CLASS_F64: return "v_cmp_class_f64"; + case CMP_LT_U16: return "v_cmp_lt_u16"; + case CMP_EQ_U16: return "v_cmp_eq_u16"; + case CMP_LE_U16: return "v_cmp_le_u16"; + case CMP_GT_U16: return "v_cmp_gt_u16"; + case CMP_NE_U16: return "v_cmp_ne_u16"; + case CMP_GE_U16: return "v_cmp_ge_u16"; + case CMPX_F_I64: return "v_cmpx_f_i64"; + case CMPX_LT_I64: return "v_cmpx_lt_i64"; + case CMPX_EQ_I64: return "v_cmpx_eq_i64"; + case CMPX_LE_I64: return "v_cmpx_le_i64"; + case CMPX_GT_I64: return "v_cmpx_gt_i64"; + case CMPX_NE_I64: return "v_cmpx_ne_i64"; + case CMPX_GE_I64: return "v_cmpx_ge_i64"; + case CMPX_T_I64: return "v_cmpx_t_i64"; + case CMPX_CLASS_F64: return "v_cmpx_class_f64"; + case CMPX_LT_U16: return "v_cmpx_lt_u16"; + case CMPX_EQ_U16: return "v_cmpx_eq_u16"; + case CMPX_LE_U16: return "v_cmpx_le_u16"; + case CMPX_GT_U16: return "v_cmpx_gt_u16"; + case CMPX_NE_U16: return "v_cmpx_ne_u16"; + case CMPX_GE_U16: return "v_cmpx_ge_u16"; + case CMP_F_U32: return "v_cmp_f_u32"; + case CMP_LT_U32: return "v_cmp_lt_u32"; + case CMP_EQ_U32: return "v_cmp_eq_u32"; + case CMP_LE_U32: return "v_cmp_le_u32"; + case CMP_GT_U32: return "v_cmp_gt_u32"; + case CMP_NE_U32: return "v_cmp_ne_u32"; + case CMP_GE_U32: return "v_cmp_ge_u32"; + case CMP_T_U32: return "v_cmp_t_u32"; + case CMP_F_F16: return "v_cmp_f_f16"; + case CMP_LT_F16: return "v_cmp_lt_f16"; + case CMP_EQ_F16: return "v_cmp_eq_f16"; + case CMP_LE_F16: return "v_cmp_le_f16"; + case CMP_GT_F16: return "v_cmp_gt_f16"; + case CMP_LG_F16: return "v_cmp_lg_f16"; + case CMP_GE_F16: return "v_cmp_ge_f16"; + case CMP_O_F16: return "v_cmp_o_f16"; + case CMPX_F_U32: return "v_cmpx_f_u32"; + case CMPX_LT_U32: return "v_cmpx_lt_u32"; + case CMPX_EQ_U32: return "v_cmpx_eq_u32"; + case CMPX_LE_U32: return "v_cmpx_le_u32"; + case CMPX_GT_U32: return "v_cmpx_gt_u32"; + case CMPX_NE_U32: return "v_cmpx_ne_u32"; + case CMPX_GE_U32: return "v_cmpx_ge_u32"; + case CMPX_T_U32: return "v_cmpx_t_u32"; + case CMPX_F_F16: return "v_cmpx_f_f16"; + case CMPX_LT_F16: return "v_cmpx_lt_f16"; + case CMPX_EQ_F16: return "v_cmpx_eq_f16"; + case CMPX_LE_F16: return "v_cmpx_le_f16"; + case CMPX_GT_F16: return "v_cmpx_gt_f16"; + case CMPX_LG_F16: return "v_cmpx_lg_f16"; + case CMPX_GE_F16: return "v_cmpx_ge_f16"; + case CMPX_O_F16: return "v_cmpx_o_f16"; + case CMP_F_U64: return "v_cmp_f_u64"; + case CMP_LT_U64: return "v_cmp_lt_u64"; + case CMP_EQ_U64: return "v_cmp_eq_u64"; + case CMP_LE_U64: return "v_cmp_le_u64"; + case CMP_GT_U64: return "v_cmp_gt_u64"; + case CMP_NE_U64: return "v_cmp_ne_u64"; + case CMP_GE_U64: return "v_cmp_ge_u64"; + case CMP_T_U64: return "v_cmp_t_u64"; + case CMP_U_F16: return "v_cmp_u_f16"; + case CMP_NGE_F16: return "v_cmp_nge_f16"; + case CMP_NLG_F16: return "v_cmp_nlg_f16"; + case CMP_NGT_F16: return "v_cmp_ngt_f16"; + case CMP_NLE_F16: return "v_cmp_nle_f16"; + case CMP_NEQ_F16: return "v_cmp_neq_f16"; + case CMP_NLT_F16: return "v_cmp_nlt_f16"; + case CMP_TRU_F16: return "v_cmp_tru_f16"; + case CMPX_F_U64: return "v_cmpx_f_u64"; + case CMPX_LT_U64: return "v_cmpx_lt_u64"; + case CMPX_EQ_U64: return "v_cmpx_eq_u64"; + case CMPX_LE_U64: return "v_cmpx_le_u64"; + 
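+ // Every VOPC compare writes a per-lane result mask to VCC; the v_cmpx_* + // variants additionally copy that mask into EXEC.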
case CMPX_GT_U64: return "v_cmpx_gt_u64"; + case CMPX_NE_U64: return "v_cmpx_ne_u64"; + case CMPX_GE_U64: return "v_cmpx_ge_u64"; + case CMPX_T_U64: return "v_cmpx_t_u64"; + case CMPX_U_F16: return "v_cmpx_u_f16"; + case CMPX_NGE_F16: return "v_cmpx_nge_f16"; + case CMPX_NLG_F16: return "v_cmpx_nlg_f16"; + case CMPX_NGT_F16: return "v_cmpx_ngt_f16"; + case CMPX_NLE_F16: return "v_cmpx_nle_f16"; + case CMPX_NEQ_F16: return "v_cmpx_neq_f16"; + case CMPX_NLT_F16: return "v_cmpx_nlt_f16"; + case CMPX_TRU_F16: return "v_cmpx_tru_f16"; + } + return nullptr; +} +} // namespace shader::ir::vopc diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/eval.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/eval.hpp new file mode 100644 index 00000000..948731dc --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/eval.hpp @@ -0,0 +1,92 @@ +#pragma once + +#include "Vector.hpp" +#include "ir/Value.hpp" +#include <optional> +#include <span> +#include <variant> + +namespace shader::eval { +struct Value { + using Storage = std::variant< + std::nullptr_t, std::int8_t, std::int16_t, std::int32_t, std::int64_t, + std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t, float16_t, + float32_t, float64_t, u8vec2, u8vec3, u8vec4, i8vec2, i8vec3, i8vec4, + u16vec2, u16vec3, u16vec4, i16vec2, i16vec3, i16vec4, u32vec2, u32vec3, + u32vec4, i32vec2, i32vec3, i32vec4, u64vec2, u64vec3, u64vec4, i64vec2, + i64vec3, i64vec4, f32vec2, f32vec3, f32vec4, f64vec2, f64vec3, f64vec4, + f16vec2, f16vec3, f16vec4, bool, bvec2, bvec3, bvec4, std::array>; + static constexpr auto StorageSize = std::variant_size_v<Storage>; + Storage storage; + + explicit operator bool() const { return !empty(); } + bool empty() const { return storage.index() == 0; } + + Value() : storage(nullptr) {} + + template <typename T> + Value(T &&value) + requires requires { Storage(std::forward<T>(value)); } + : storage(std::forward<T>(value)) {} + + static Value compositeConstruct(ir::Value type, + std::span<const Value> constituents); + Value compositeExtract(const Value &index) const; + // Value compositeInsert(const Value &object, std::size_t index) const; + + Value isNan() const; + Value isInf() const; + Value isFinite() const; + Value makeUnsigned() const; + Value makeSigned() const; + Value all() const; + Value any() const; + Value select(const Value &trueValue, const Value &falseValue) const; + Value iConvert(ir::Value type, bool isSigned) const; + Value sConvert(ir::Value type) const { return iConvert(type, true); } + Value uConvert(ir::Value type) const { return iConvert(type, false); } + Value fConvert(ir::Value type) const; + Value bitcast(ir::Value type) const; + std::optional<std::uint64_t> zExtScalar() const; + std::optional<std::int64_t> sExtScalar() const; + + template <typename T> + requires requires { std::get<T>(storage); } + T get() const { + return std::get<T>(storage); + } + + template <typename T> + requires requires { std::get<T>(storage); } + std::optional<T> as() const { + if (auto result = std::get_if<T>(&storage)) { + return *result; + } + + return std::nullopt; + } + + Value operator+(const Value &rhs) const; + Value operator-(const Value &rhs) const; + Value operator*(const Value &rhs) const; + Value operator/(const Value &rhs) const; + Value operator%(const Value &rhs) const; + Value operator&(const Value &rhs) const; + Value operator|(const Value &rhs) const; + Value operator^(const Value &rhs) const; + Value operator>>(const Value &rhs) const; + Value operator<<(const Value &rhs) const; + Value operator&&(const Value &rhs) const; + Value operator||(const Value &rhs) const; + Value operator<(const Value &rhs) const; + Value operator>(const Value &rhs) const; + Value operator<=(const
Value &rhs) const; + Value operator>=(const Value &rhs) const; + Value operator==(const Value &rhs) const; + Value operator!=(const Value &rhs) const; + + Value operator-() const; + Value operator~() const; + Value operator!() const; +}; +} // namespace shader::eval diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/gcn.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/gcn.hpp new file mode 100644 index 00000000..5dc6821b --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/gcn.hpp @@ -0,0 +1,125 @@ +#pragma once + +#include "SemanticInfo.hpp" +#include "SpvConverter.hpp" +#include "analyze.hpp" +#include "rx/MemoryTable.hpp" +#include "spv.hpp" + +#include +#include + +namespace shader::gcn { +using Builder = ir::Builder; + +enum class Stage { + Ps, + VsVs, + VsEs, + VsLs, + Cs, + Gs, + GsVs, + Hs, + DsVs, + DsEs, + + Invalid, +}; + +struct Import : spv::Import { + ir::Node getOrCloneImpl(ir::Context &context, ir::Node node, + bool isOperand) override; +}; + +struct SemanticModuleInfo : shader::SemanticModuleInfo { + std::map registerVariables; +}; + +void canonicalizeSemantic(ir::Context &context, + const spv::BinaryLayout &semantic); +void collectSemanticModuleInfo(SemanticModuleInfo &moduleInfo, + const spv::BinaryLayout &layout); +SemanticInfo collectSemanticInfo(const SemanticModuleInfo &moduleInfo); + +struct InstructionRegion : ir::RegionLikeImpl { + ir::RegionLike base; + ir::Instruction *firstInstruction; + + void insertAfter(ir::Instruction point, ir::Instruction node) { + if (!*firstInstruction) { + *firstInstruction = node; + } + + base.insertAfter(point, node); + } +}; + +enum RegId { + Sgpr, + Vgpr, + M0, + Scc, + Vcc, + Exec, + VccZ, + ExecZ, + LdsDirect, + SgprCount, + VgprCount, + ThreadId, + MemoryTable, + Gds, +}; + +struct Context : spv::Context { + ir::Region body; + rx::MemoryAreaTable<> memoryMap; + std::uint32_t requiredUserSgprs = 0; + std::map<RegId, ir::Value> registerVariables; + std::map instructions; + AnalysisStorage analysis; + + std::pair getOrCreateLabel(ir::Location loc, ir::Region body, + std::uint64_t address); + Builder createBuilder(InstructionRegion &region, ir::Region bodyRegion, + std::uint64_t address); + + ir::Value createCast(ir::Location loc, Builder &builder, ir::Value targetType, + ir::Value value); + + void setRegisterVariable(RegId id, ir::Value value) { + registerVariables[id] = value; + } + + ir::Value getOrCreateRegisterVariable(RegId id); + + ir::Value getRegisterRef(ir::Location loc, Builder &builder, RegId id, + const ir::Operand &index, ir::Value lane = nullptr); + + ir::Value readReg(ir::Location loc, Builder &builder, ir::Value typeValue, + RegId id, const ir::Operand &index, + ir::Value lane = nullptr); + + void writeReg(ir::Location loc, Builder &builder, RegId id, + const ir::Operand &index, ir::Value value, + ir::Value lane = nullptr); + + ir::Value createRegisterAccess(Builder &builder, ir::Location loc, + ir::Value reg, const ir::Operand &index, + ir::Value lane = nullptr); +}; + +struct Environment { + std::uint8_t vgprCount; + std::uint8_t sgprCount; + std::span<const std::uint32_t> userSgprs; + bool supportsBarycentric = true; + bool supportsInt8 = false; + bool supportsInt64Atomics = false; +}; + +ir::Region deserialize(Context &context, const Environment &environment, + const SemanticInfo &semanticInfo, std::uint64_t base, + std::function readMemory); +} // namespace shader::gcn diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/glsl.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/glsl.hpp new file mode 100644 index 00000000..b166aed5 --- /dev/null
+++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/glsl.hpp @@ -0,0 +1,31 @@ +#pragma once +#include "ir/Location.hpp" +#include "spv.hpp" +#include <filesystem> + +namespace shader::glsl { +enum class Stage { + Library, + Vertex, + TessControl, + TessEvaluation, + Geometry, + Fragment, + Compute, + RayGen, + Intersect, + AnyHit, + ClosestHit, + Miss, + Callable, + Task, + Mesh, +}; + +std::optional parseFile(ir::Context &context, Stage stage, + const std::filesystem::path &path); +std::optional parseSource(ir::Context &context, Stage stage, + std::string_view source, + ir::Location loc = nullptr); +std::string decompile(std::span<const std::uint32_t> spv); +} // namespace shader::glsl diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/graph.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/graph.hpp new file mode 100644 index 00000000..b505d983 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/graph.hpp @@ -0,0 +1,320 @@ +#pragma once + +#include <map> +#include <vector> + +namespace graph { +// Dominator tree over basic-block handles; dominance queries compare the +// DFS in/out numbers assigned by updateDFSNumbers(). +template <typename BasicBlockPtrT> class DomTree { +public: + struct Node { + BasicBlockPtrT block = nullptr; + Node *immDom = nullptr; + unsigned dfsNumIn = ~0; + unsigned dfsNumOut = ~0; + unsigned level = 0; + std::vector<Node *> children; + + bool isLeaf() const { return children.empty(); } + + bool dominatedBy(const Node *other) const { + return this->dfsNumIn >= other->dfsNumIn && + this->dfsNumOut <= other->dfsNumOut; + } + }; + +private: + std::map<BasicBlockPtrT, Node> bbToNodes; + Node *rootNode = nullptr; + +public: + Node *getNode(BasicBlockPtrT bb) { + auto it = bbToNodes.find(bb); + if (it != bbToNodes.end()) { + return &it->second; + } + + return nullptr; + } + + Node *createChild(BasicBlockPtrT bb, Node *parent) { + auto &child = bbToNodes[bb]; + child.block = bb; + child.immDom = parent; + child.level = parent->level + 1; + parent->children.push_back(&child); + return &child; + } + + Node *createRoot(BasicBlockPtrT bb) { + auto &root = bbToNodes[bb]; + rootNode = &root; + root.block = bb; + return rootNode; + } + + Node *getRootNode() { return rootNode; } + + void updateDFSNumbers() { + std::vector<std::pair<Node *, typename std::vector<Node *>::iterator>> + workStack; + + auto root = getRootNode(); + if (!root) + return; + + workStack.push_back({root, root->children.begin()}); + + unsigned dfsNum = 0; + root->dfsNumIn = dfsNum++; + + while (!workStack.empty()) { + auto node = workStack.back().first; + const auto childIt = workStack.back().second; + + if (childIt == node->children.end()) { + node->dfsNumOut = dfsNum++; + workStack.pop_back(); + } else { + auto child = *childIt; + ++workStack.back().second; + + workStack.push_back({child, child->children.begin()}); + child->dfsNumIn = dfsNum++; + } + } + } + + bool dominates(Node *a, Node *b) { + if (a == b || b->immDom == a) { + return true; + } + + if (a->immDom == b || a->level >= b->level) { + return false; + } + + return b->dominatedBy(a); + } + + bool dominates(BasicBlockPtrT a, BasicBlockPtrT b) { + return dominates(getNode(a), getNode(b)); + } + + BasicBlockPtrT getImmediateDominator(BasicBlockPtrT a) { + auto immDom = getNode(a)->immDom; + if (immDom) { + return immDom->block; + } + return {}; + } + + bool isImmediateDominator(BasicBlockPtrT block, BasicBlockPtrT immDomBlock) { + if (immDomBlock == nullptr) { + return false; + } + + return getImmediateDominator(immDomBlock) == block; + } + + BasicBlockPtrT findNearestCommonDominator(BasicBlockPtrT a, + BasicBlockPtrT b) { + auto aNode = getNode(a); + auto bNode = getNode(b); + + if (aNode == rootNode || bNode == rootNode) { + return rootNode->block; + } + + while (aNode != bNode) { + if (aNode->level < bNode->level) {
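+ // Make aNode the deeper of the two so every iteration climbs from the + // deeper side; the walk terminates at the nearest common dominator.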
+ std::swap(aNode, bNode); + } + + aNode = aNode->immDom; + } + + return aNode->block; + } +}; + +template <typename BasicBlockPtrT> class DomTreeBuilder { + using DomTreeNode = typename DomTree<BasicBlockPtrT>::Node; + + struct NodeInfo { + unsigned dfsNum = 0; + unsigned parent = 0; + unsigned semi = 0; + BasicBlockPtrT label = nullptr; + BasicBlockPtrT immDom = nullptr; + std::vector<BasicBlockPtrT> revChildren; + }; + + std::vector<BasicBlockPtrT> indexToNode = {nullptr}; + std::map<BasicBlockPtrT, NodeInfo> nodeToInfo; + + template <typename WalkFn> + void runDFS(BasicBlockPtrT root, const WalkFn &walk) { + std::vector<BasicBlockPtrT> workList; + workList.reserve(10); + workList.push_back(root); + unsigned index = 0; + + while (!workList.empty()) { + auto bb = workList.back(); + workList.pop_back(); + + auto &bbInfo = nodeToInfo[bb]; + + if (bbInfo.dfsNum != 0) { + continue; + } + + bbInfo.dfsNum = bbInfo.semi = ++index; + bbInfo.label = bb; + indexToNode.push_back(bb); + + walk(bb, [&](BasicBlockPtrT successor) { + auto it = nodeToInfo.find(successor); + if (it != nodeToInfo.end() && it->second.dfsNum != 0) { + if (successor != bb) { + it->second.revChildren.push_back(bb); + } + + return; + } + + auto &succInfo = nodeToInfo[successor]; + workList.push_back(successor); + succInfo.parent = index; + succInfo.revChildren.push_back(bb); + }); + } + } + + // Semi-NCA: initialize each node's idom to its DFS parent, compute + // semidominators with path compression (eval), then lift each idom + // upward until it dominates the node's semidominator. + void runSemiNCA() { + const unsigned nextDFS = indexToNode.size(); + + for (unsigned i = 1; i < nextDFS; ++i) { + const BasicBlockPtrT node = indexToNode[i]; + auto &nodeInfo = nodeToInfo[node]; + nodeInfo.immDom = indexToNode[nodeInfo.parent]; + } + + std::vector<NodeInfo *> evalStack; + evalStack.reserve(10); + + for (unsigned i = nextDFS - 1; i >= 2; --i) { + BasicBlockPtrT node = indexToNode[i]; + auto &nodeInfo = nodeToInfo[node]; + + nodeInfo.semi = nodeInfo.parent; + for (const auto &child : nodeInfo.revChildren) { + if (!nodeToInfo.contains(child)) { + continue; + } + + unsigned childSemi = nodeToInfo[eval(child, i + 1, evalStack)].semi; + if (childSemi < nodeInfo.semi) { + nodeInfo.semi = childSemi; + } + } + } + + for (unsigned i = 2; i < nextDFS; ++i) { + const BasicBlockPtrT node = indexToNode[i]; + auto &nodeInfo = nodeToInfo[node]; + const unsigned sDomNum = nodeToInfo[indexToNode[nodeInfo.semi]].dfsNum; + BasicBlockPtrT immDom = nodeInfo.immDom; + + while (nodeToInfo[immDom].dfsNum > sDomNum) { + immDom = nodeToInfo[immDom].immDom; + } + + nodeInfo.immDom = immDom; + } + } + + BasicBlockPtrT eval(BasicBlockPtrT block, unsigned LastLinked, + std::vector<NodeInfo *> &stack) { + NodeInfo *blockInfo = &nodeToInfo[block]; + if (blockInfo->parent < LastLinked) + return blockInfo->label; + + do { + stack.push_back(blockInfo); + blockInfo = &nodeToInfo[indexToNode[blockInfo->parent]]; + } while (blockInfo->parent >= LastLinked); + + const NodeInfo *pInfo = blockInfo; + const NodeInfo *pLabelInfo = &nodeToInfo[pInfo->label]; + do { + blockInfo = stack.back(); + stack.pop_back(); + + blockInfo->parent = pInfo->parent; + const NodeInfo *labelInfo = &nodeToInfo[blockInfo->label]; + if (pLabelInfo->semi < labelInfo->semi) { + blockInfo->label = pInfo->label; + } else { + pLabelInfo = labelInfo; + } + + pInfo = blockInfo; + } while (!stack.empty()); + return blockInfo->label; + } + + DomTreeNode *getNodeForBlock(BasicBlockPtrT BB, DomTree<BasicBlockPtrT> &DT) { + if (auto Node = DT.getNode(BB)) + return Node; + + BasicBlockPtrT IDom = getIDom(BB); + auto IDomNode = getNodeForBlock(IDom, DT); + + return DT.createChild(BB, IDomNode); + } + + BasicBlockPtrT getIDom(BasicBlockPtrT BB) const { + auto InfoIt = nodeToInfo.find(BB); + if (InfoIt == nodeToInfo.end()) + return nullptr; + + return
InfoIt->second.immDom; + } + +public: + template <typename WalkFn> + DomTree<BasicBlockPtrT> build(BasicBlockPtrT root, + const WalkFn &walkSuccessors) { + runDFS(root, walkSuccessors); + runSemiNCA(); + + DomTree<BasicBlockPtrT> domTree; + domTree.createRoot(root); + + nodeToInfo[indexToNode[1]].immDom = root; + + for (size_t i = 1, e = indexToNode.size(); i != e; ++i) { + BasicBlockPtrT node = indexToNode[i]; + + if (domTree.getNode(node)) + continue; + + BasicBlockPtrT immDom = getIDom(node); + + auto immDomNode = getNodeForBlock(immDom, domTree); + domTree.createChild(node, immDomNode); + } + + domTree.updateDFSNumbers(); + return domTree; + } +}; + +template <typename BasicBlockPtrT> +DomTree<BasicBlockPtrT> buildDomTree(BasicBlockPtrT root, auto &&walkSuccessors) + requires requires(void (*cb)(BasicBlockPtrT)) { walkSuccessors(root, cb); } +{ + return DomTreeBuilder<BasicBlockPtrT>().build(root, walkSuccessors); +} +} // namespace graph diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/ir.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/ir.hpp new file mode 100644 index 00000000..7bcf63d5 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/ir.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "ir/Context.hpp" // IWYU pragma: export +#include "ir/Instruction.hpp" // IWYU pragma: export +#include "ir/Location.hpp" // IWYU pragma: export +#include "ir/Node.hpp" // IWYU pragma: export +#include "ir/Operand.hpp" // IWYU pragma: export +#include "ir/PointerWrapper.hpp" // IWYU pragma: export +#include "ir/PrintableWrapper.hpp" // IWYU pragma: export +#include "ir/Value.hpp" // IWYU pragma: export +#include "ir/Builder.hpp" // IWYU pragma: export +#include "ir/Region.hpp" // IWYU pragma: export +#include "ir/OperandPrint.hpp" // IWYU pragma: export +#include "ir/Impl.hpp" // IWYU pragma: export diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/ir/Block.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/ir/Block.hpp new file mode 100644 index 00000000..5b1198f1 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/ir/Block.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include "RegionLike.hpp" +#include "RegionLikeImpl.hpp" +#include "ValueImpl.hpp" + +namespace shader::ir { +template +struct BlockWrapper : RegionLikeWrapper { + using RegionLikeWrapper::RegionLikeWrapper; + using RegionLikeWrapper::operator=; +}; + +struct BlockImpl; + +struct Block : BlockWrapper { + using BlockWrapper::BlockWrapper; + using BlockWrapper::operator=; +}; + +struct BlockImpl : ValueImpl, RegionLikeImpl { + BlockImpl(Location loc); + Node clone(Context &context, CloneMap &map) const override; + + void print(std::ostream &os, NameStorage &ns) const override { + os << '%' << ns.getNameOf(const_cast<BlockImpl *>(this)); + os << " = "; + + if (!getOperands().empty()) { + os << '['; + for (bool first = true; auto &operand : getOperands()) { + if (first) { + first = false; + } else { + os << ", "; + } + + operand.print(os, ns); + } + os << "] "; + } + + os << "{\n"; + for (auto child : children()) { + os << " "; + child.print(os, ns); + os << "\n"; + } + os << "}"; + } +}; +} // namespace shader::ir diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/ir/Builder.hpp b/rpcsx-gpu2/lib/gcn-shader/include/shader/ir/Builder.hpp new file mode 100644 index 00000000..4b7a4e85 --- /dev/null +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/ir/Builder.hpp @@ -0,0 +1,84 @@ +#pragma once +#include "Context.hpp" +#include "Node.hpp" +#include "RegionLikeImpl.hpp" + +namespace shader::ir { +template <typename ImplT> struct BuilderFacade { + ImplT &instance() { + return *static_cast<ImplT *>(static_cast<void *>(this)); + } + Context &getContext() { return instance().getContext(); } + 
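+ // CRTP facade: instance() recovers the concrete builder type (ImplT), + // which supplies the IR context above and the storage node that newly + // created instructions are attached to.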
Node getInsertionStorage() { return instance().getInsertionStorage(); } + template <typename T, typename... ArgsT> + requires requires { + typename T::underlying_type; + requires std::is_constructible_v; + requires std::is_base_of_v; + } + T create(ArgsT &&...args) { + return instance().template create<T>(std::forward<ArgsT>(args)...); + } +}; + +template