diff --git a/app/src/main/cpp/skyline/gpu/descriptor_allocator.cpp b/app/src/main/cpp/skyline/gpu/descriptor_allocator.cpp
index 152a15d1..470a30af 100644
--- a/app/src/main/cpp/skyline/gpu/descriptor_allocator.cpp
+++ b/app/src/main/cpp/skyline/gpu/descriptor_allocator.cpp
@@ -6,72 +6,120 @@
 #include "descriptor_allocator.h"
 
 namespace skyline::gpu {
-    DescriptorAllocator::DescriptorPool::DescriptorPool(const vk::raii::Device &device, const vk::DescriptorPoolCreateInfo &createInfo) : vk::raii::DescriptorPool(device, createInfo), freeSetCount(createInfo.maxSets) {}
+    DescriptorAllocator::DescriptorSetSlot::DescriptorSetSlot(vk::DescriptorSet descriptorSet) : descriptorSet{descriptorSet} {}
+
+    DescriptorAllocator::DescriptorSetSlot::DescriptorSetSlot(DescriptorAllocator::DescriptorSetSlot &&other) : descriptorSet{other.descriptorSet} {
+        other.descriptorSet = nullptr;
+    }
+
+    DescriptorAllocator::DescriptorPool::DescriptorPool(const vk::raii::Device &device, const vk::DescriptorPoolCreateInfo &createInfo) : vk::raii::DescriptorPool{device, createInfo}, freeSetCount{createInfo.maxSets} {}
 
     void DescriptorAllocator::AllocateDescriptorPool() {
         namespace maxwell3d = soc::gm20b::engine::maxwell3d::type; // We use Maxwell3D as the reference for base descriptor counts
-        using DescriptorSizes = std::array<vk::DescriptorPoolSize, 2>;
+        using DescriptorSizes = std::array<vk::DescriptorPoolSize, 5>;
         constexpr DescriptorSizes BaseDescriptorSizes{
             vk::DescriptorPoolSize{
                 .descriptorCount = maxwell3d::PipelineStageConstantBufferCount,
                 .type = vk::DescriptorType::eUniformBuffer,
             },
             vk::DescriptorPoolSize{
-                .descriptorCount = maxwell3d::PipelineStageCount * 20,
+                .descriptorCount = maxwell3d::PipelineStageCount * 5,
+                .type = vk::DescriptorType::eStorageBuffer,
+            },
+            vk::DescriptorPoolSize{
+                .descriptorCount = maxwell3d::PipelineStageCount * 5,
                 .type = vk::DescriptorType::eCombinedImageSampler,
             },
-        };
+            vk::DescriptorPoolSize{
+                .descriptorCount = maxwell3d::PipelineStageCount,
+                .type = vk::DescriptorType::eStorageImage,
+            },
+            vk::DescriptorPoolSize{
+                .descriptorCount = maxwell3d::RenderTargetCount,
+                .type = vk::DescriptorType::eInputAttachment,
+            },
+        }; //!< A best-approximation of the ratio of descriptors of each type that may be utilized; the total amount will grow in these ratios
 
         DescriptorSizes descriptorSizes{BaseDescriptorSizes};
         for (auto &descriptorSize : descriptorSizes)
             descriptorSize.descriptorCount *= descriptorMultiplier;
 
         pool = std::make_shared<DescriptorPool>(gpu.vkDevice, vk::DescriptorPoolCreateInfo{
-            .flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet,
             .maxSets = descriptorSetCount,
             .pPoolSizes = descriptorSizes.data(),
             .poolSizeCount = descriptorSizes.size(),
         });
     }
 
-    DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pPool, vk::DescriptorSet set) : pool(std::move(pPool)), DescriptorSet(set) {
+    vk::ResultValue<vk::DescriptorSet> DescriptorAllocator::AllocateVkDescriptorSet(vk::DescriptorSetLayout layout) {
+        vk::DescriptorSetAllocateInfo allocateInfo{
+            .descriptorPool = **pool,
+            .pSetLayouts = &layout,
+            .descriptorSetCount = 1,
+        };
+        vk::DescriptorSet descriptorSet{};
+
+        auto result{(*gpu.vkDevice).allocateDescriptorSets(&allocateInfo, &descriptorSet, *gpu.vkDevice.getDispatcher())};
+        return vk::createResultValue(result, descriptorSet, __builtin_FUNCTION(), {
+            vk::Result::eSuccess,
+            vk::Result::eErrorOutOfPoolMemory,
+            vk::Result::eErrorFragmentedPool
+        });
+    }
+
+    DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pPool, DescriptorSetSlot *slot) : pool{std::move(pPool)}, slot{slot} {
         pool->freeSetCount--;
     }
 
     DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(DescriptorAllocator::ActiveDescriptorSet &&other) noexcept {
         pool = std::move(other.pool);
-        static_cast<vk::DescriptorSet &>(*this) = std::exchange(static_cast<vk::DescriptorSet &>(other), vk::DescriptorSet{});
+        slot = std::exchange(other.slot, nullptr);
     }
 
     DescriptorAllocator::ActiveDescriptorSet::~ActiveDescriptorSet() {
-        if (static_cast<vk::DescriptorSet &>(*this)) {
-            std::scoped_lock lock(*pool);
-            pool->getDevice().freeDescriptorSets(**pool, 1, this, *pool->getDispatcher());
+        if (slot) {
+            slot->active.clear(std::memory_order_release);
             pool->freeSetCount++;
         }
     }
 
-    DescriptorAllocator::DescriptorAllocator(GPU &gpu) : gpu(gpu) {
+    DescriptorAllocator::DescriptorAllocator(GPU &gpu) : gpu{gpu} {
         AllocateDescriptorPool();
     }
 
     DescriptorAllocator::ActiveDescriptorSet DescriptorAllocator::AllocateSet(vk::DescriptorSetLayout layout) {
-        std::scoped_lock allocatorLock(mutex);
+        std::scoped_lock allocatorLock{mutex};
+
+        auto it{pool->layoutSlots.find(layout)};
+        vk::Result lastResult{};
+        if (it != pool->layoutSlots.end()) {
+            auto &slots{it->second};
+            for (auto &slot : slots)
+                if (!slot.active.test_and_set(std::memory_order_acq_rel))
+                    return ActiveDescriptorSet{pool, &slot};
+
+            // If we couldn't find an available slot, we need to allocate a new descriptor set
+            auto set{AllocateVkDescriptorSet(layout)};
+            if (set.result == vk::Result::eSuccess) {
+                auto &slot{slots.emplace_back(set.value)};
+                return ActiveDescriptorSet{pool, &slot};
+            } else {
+                lastResult = set.result;
+            }
+        } else {
+            // If there are no slots for this layout yet, we need to allocate the first one
+            auto set{AllocateVkDescriptorSet(layout)};
+            if (set.result == vk::Result::eSuccess) {
+                auto &layoutSlots{pool->layoutSlots.try_emplace(layout).first->second};
+                return ActiveDescriptorSet{pool, &layoutSlots.emplace_back(set.value)};
+            } else {
+                lastResult = set.result;
+            }
+        }
 
         while (true) {
-            std::scoped_lock poolLock(*pool);
-
-            vk::DescriptorSetAllocateInfo allocateInfo{
-                .descriptorPool = **pool,
-                .pSetLayouts = &layout,
-                .descriptorSetCount = 1,
-            };
-            vk::DescriptorSet set{};
-
-            auto result{(*gpu.vkDevice).allocateDescriptorSets(&allocateInfo, &set, *gpu.vkDevice.getDispatcher())};
-            if (result == vk::Result::eSuccess) {
-                return ActiveDescriptorSet(pool, set);
-            } else if (result == vk::Result::eErrorOutOfPoolMemory) {
+            // We attempt to grow or replace the pool based on the last result
+            if (lastResult == vk::Result::eErrorOutOfPoolMemory) {
                 if (pool->freeSetCount == 0)
                     // The amount of maximum descriptor sets is insufficient
                     descriptorSetCount += DescriptorSetCountIncrement;
@@ -79,12 +127,17 @@ namespace skyline::gpu {
                     // The amount of maximum descriptors is insufficient
                     descriptorMultiplier++;
                 AllocateDescriptorPool();
-                continue; // Attempt to allocate again with the new pool
-            } else if (result == vk::Result::eErrorFragmentedPool) {
+            } else if (lastResult == vk::Result::eErrorFragmentedPool) {
                 AllocateDescriptorPool(); // If the pool is fragmented, we reallocate without increasing the size
-                continue;
+            }
+
+            // Attempt to allocate a new descriptor set from the replacement pool
+            auto set{AllocateVkDescriptorSet(layout)};
+            if (set.result == vk::Result::eSuccess) {
+                auto &layoutSlots{pool->layoutSlots.try_emplace(layout).first->second};
+                return ActiveDescriptorSet{pool, &layoutSlots.emplace_back(set.value)};
             } else {
-                vk::throwResultException(result, __builtin_FUNCTION());
+                lastResult = set.result;
             }
         }
     }
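A point worth spelling out about the retry loop above: AllocateDescriptorPool() replaces `pool` with a brand-new pool rather than resetting the old one, so previously returned descriptor sets stay valid for as long as any ActiveDescriptorSet holds a shared_ptr to the old pool. A minimal annotated sketch of that policy, reusing the member names from the code above:

// Sketch of the growth policy (annotated restatement, not additional code)
if (lastResult == vk::Result::eErrorOutOfPoolMemory) {
    if (pool->freeSetCount == 0)
        descriptorSetCount += DescriptorSetCountIncrement; // Every set was in use: raise maxSets
    else
        descriptorMultiplier++; // Sets were free but descriptors ran out: scale all pool size ratios
    AllocateDescriptorPool(); // Replaces `pool`; outstanding ActiveDescriptorSets keep the old pool alive
} else if (lastResult == vk::Result::eErrorFragmentedPool) {
    AllocateDescriptorPool(); // Same size: a fresh, unfragmented pool suffices
}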
diff --git a/app/src/main/cpp/skyline/gpu/descriptor_allocator.h b/app/src/main/cpp/skyline/gpu/descriptor_allocator.h
index e7b574e2..2afe2848 100644
--- a/app/src/main/cpp/skyline/gpu/descriptor_allocator.h
+++ b/app/src/main/cpp/skyline/gpu/descriptor_allocator.h
@@ -3,7 +3,8 @@
 
 #pragma once
 
-#include "fence_cycle.h"
+#include <vulkan/vulkan_raii.hpp>
+#include <common.h>
 
 namespace skyline::gpu {
     /**
@@ -18,11 +19,24 @@ namespace skyline::gpu {
         u32 descriptorSetCount{DescriptorSetCountIncrement}; //!< The maximum amount of descriptor sets in the pool
         u32 descriptorMultiplier{1}; //!< A multiplier for the maximum amount of descriptors in the pool
 
+        /**
+         * @brief A slot representing a single descriptor set dynamically allocated from the pool
+         */
+        struct DescriptorSetSlot {
+            std::atomic_flag active{true}; //!< If the descriptor set is currently being utilized
+            vk::DescriptorSet descriptorSet; //!< The descriptor set allocated from the pool
+
+            DescriptorSetSlot(vk::DescriptorSet descriptorSet);
+
+            DescriptorSetSlot(DescriptorSetSlot &&other);
+        };
+
         /**
          * @brief A lockable VkDescriptorPool for maintaining external synchronization requirements
          */
-        struct DescriptorPool : public std::mutex, public vk::raii::DescriptorPool {
-            u64 freeSetCount{}; //!< The amount of sets free to allocate from this pool
+        struct DescriptorPool : public vk::raii::DescriptorPool {
+            std::atomic<u64> freeSetCount{}; //!< The amount of sets free to allocate from this pool
+            std::unordered_map<vk::DescriptorSetLayout, std::list<DescriptorSetSlot>> layoutSlots; //!< A map of descriptor set slots keyed by the layout of the sets
 
             DescriptorPool(vk::raii::Device const &device, vk::DescriptorPoolCreateInfo const &createInfo);
         };
@@ -35,35 +49,47 @@ namespace skyline::gpu {
          */
        void AllocateDescriptorPool();
 
+        /**
+         * @brief Allocates a descriptor set with the specified layout from the pool
+         * @return A result code that's either `eSuccess`, `eErrorOutOfPoolMemory` or `eErrorFragmentedPool`
+         */
+        vk::ResultValue<vk::DescriptorSet> AllocateVkDescriptorSet(vk::DescriptorSetLayout layout);
+
       public:
        /**
         * @brief A RAII-bound descriptor set that automatically frees resources back into the pool on destruction while respecting external synchronization requirements
         */
-        struct ActiveDescriptorSet : public vk::DescriptorSet {
+        struct ActiveDescriptorSet {
          private:
-            friend DescriptorAllocator;
            std::shared_ptr<DescriptorPool> pool;
+            DescriptorSetSlot *slot;
 
-            /**
-             * @note The supplied pool **must** be locked prior to calling this
-             */
-            ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pool, vk::DescriptorSet set);
+            friend class DescriptorAllocator;
+
+            ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pool, DescriptorSetSlot *slot);
 
          public:
            ActiveDescriptorSet(ActiveDescriptorSet &&other) noexcept;
 
-            /* Delete the move constructor to prevent early freeing of the descriptor set */
+            /* Delete the copy constructor/assignment to prevent early freeing of the descriptor set */
            ActiveDescriptorSet(const ActiveDescriptorSet &) = delete;
            ActiveDescriptorSet &operator=(const ActiveDescriptorSet &) = delete;
 
            ~ActiveDescriptorSet();
+
+            vk::DescriptorSet &operator*() const {
+                return slot->descriptorSet;
+            }
        };
 
        DescriptorAllocator(GPU &gpu);
 
        /**
-         * @note It is UB to allocate a set with a descriptor type that isn't in the pool as defined in AllocateDescriptorPool
+         * @brief Allocates a descriptor set from the pool with the supplied layout
+         * @note The layout object must be reused for equivalent layouts to avoid unnecessary descriptor set creation
+         * @note It is UB to allocate a set with a descriptor type that isn't in the pool as defined in AllocateDescriptorPool()
+         * @note The returned ActiveDescriptorSet **must** stay alive until the descriptor set can be freed; it must not be destroyed right after being bound but only after any associated commands have completed execution
         */
        ActiveDescriptorSet AllocateSet(vk::DescriptorSetLayout layout);
    };
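The slot recycling declared in this header reduces to a small lock-free protocol: `test_and_set` with acquire-release ordering claims a free slot, and `clear` with release ordering in `~ActiveDescriptorSet` publishes it back for reuse. A self-contained sketch of that protocol follows; the names here are illustrative, not from the codebase:

#include <atomic>

struct Slot {
    std::atomic_flag active; // set = in use, clear = free for reuse
};

// Claiming succeeds for exactly one caller: test_and_set() atomically flips the
// flag and reports its previous state, so only the thread that saw "clear" wins
bool TryClaim(Slot &slot) {
    return !slot.active.test_and_set(std::memory_order_acq_rel);
}

// Releasing publishes all writes made while the slot was held before marking it free
void Release(Slot &slot) {
    slot.active.clear(std::memory_order_release);
}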
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
index b00ba7a4..90bc9044 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@@ -1893,7 +1893,7 @@ namespace skyline::gpu::interconnect {
 
       public:
        void SetPrimitiveTopology(maxwell3d::PrimitiveTopology topology) {
-            auto[vkTopology, shaderTopology, isQuad] = [topology]() -> std::tuple<vk::PrimitiveTopology, ShaderCompiler::InputTopology, bool> {
+            auto[vkTopology, shaderTopology, isQuad]{[topology]() -> std::tuple<vk::PrimitiveTopology, ShaderCompiler::InputTopology, bool> {
                using MaxwellTopology = maxwell3d::PrimitiveTopology;
                using VkTopology = vk::PrimitiveTopology;
                using ShaderTopology = ShaderCompiler::InputTopology;
@@ -1922,7 +1922,7 @@ namespace skyline::gpu::interconnect {
                    default:
                        throw exception("Unimplemented Maxwell3D Primitive Topology: {}", maxwell3d::ToString(topology));
                }
-            }();
+            }()};
 
            inputAssemblyState.topology = vkTopology;
            needsQuadConversion = isQuad;
@@ -2844,7 +2844,7 @@ namespace skyline::gpu::interconnect {
                }
            } else if (needsQuadConversion) {
                // Convert the guest-supplied quad list to an indexed triangle list
-                auto[bufferView, indexType, indexCount] = GetNonIndexedQuadConversionBuffer(count);
+                auto[bufferView, indexType, indexCount]{GetNonIndexedQuadConversionBuffer(count)};
                executor.AttachBuffer(bufferView);
 
                count = indexCount;
@@ -2948,28 +2948,23 @@ namespace skyline::gpu::interconnect {
                .depthStencilAttachment = depthRenderTargetView,
            }, programState.descriptorSetBindings)};
 
-            // Draw Persistent Storage
+            // Descriptor Set Binding + Update Setup
            struct DrawStorage {
                ShaderProgramState::DescriptorSetWrites descriptorSetWrites;
-                std::optional<DescriptorAllocator::ActiveDescriptorSet> descriptorSet;
+                DescriptorAllocator::ActiveDescriptorSet descriptorSet;
 
-                DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites) : descriptorSetWrites(std::move(descriptorSetWrites)) {}
-
-                DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites, DescriptorAllocator::ActiveDescriptorSet &&descriptorSet) : descriptorSetWrites(std::move(descriptorSetWrites)), descriptorSet(std::move(descriptorSet)) {}
+                DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites, DescriptorAllocator::ActiveDescriptorSet &&descriptorSet) : descriptorSetWrites{std::move(descriptorSetWrites)}, descriptorSet{std::move(descriptorSet)} {}
            };
 
            std::shared_ptr<DrawStorage> drawStorage{};
            if (!programState.descriptorSetWrites->empty()) {
-                if (gpu.traits.supportsPushDescriptors)
-                    drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites));
-                else {
-                    drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites), gpu.descriptor.AllocateSet(compiledPipeline.descriptorSetLayout));
-                }
+                drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites), gpu.descriptor.AllocateSet(compiledPipeline.descriptorSetLayout));
+                // We can't update the descriptor set here as the bindings might be retroactively updated by future draws
+                executor.AttachDependency(drawStorage);
            }
 
            // Submit Draw
-            executor.AddSubpass([=, drawStorage = std::move(drawStorage), &vkDevice = gpu.vkDevice, pipelineLayout = compiledPipeline.pipelineLayout, pipeline = compiledPipeline.pipeline, supportsPushDescriptors = gpu.traits.supportsPushDescriptors](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &, vk::RenderPass renderPass, u32 subpassIndex) mutable {
-
+            executor.AddSubpass([=, drawStorage = std::move(drawStorage), pipelineLayout = compiledPipeline.pipelineLayout, pipeline = compiledPipeline.pipeline](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &, vk::RenderPass renderPass, u32 subpassIndex) mutable {
                auto &vertexBufferHandles{boundVertexBuffers->handles};
                for (u32 bindingIndex{}; bindingIndex != vertexBufferHandles.size(); bindingIndex++) {
                    // We need to bind all non-null vertex buffers while skipping any null ones
@@ -2984,16 +2979,12 @@ namespace skyline::gpu::interconnect {
                }
 
                if (drawStorage) {
-                    if (supportsPushDescriptors) {
-                        commandBuffer.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, *drawStorage->descriptorSetWrites);
-                    } else {
-                        for (auto &descriptorSetWrite : *drawStorage->descriptorSetWrites)
-                            descriptorSetWrite.dstSet = *drawStorage->descriptorSet;
-                        vkDevice.updateDescriptorSets(*drawStorage->descriptorSetWrites, nullptr);
-                        commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, *drawStorage->descriptorSet, nullptr);
-                    }
+                    vk::DescriptorSet descriptorSet{*drawStorage->descriptorSet};
+                    for (auto &descriptorSetWrite : *drawStorage->descriptorSetWrites)
+                        descriptorSetWrite.dstSet = descriptorSet;
+                    gpu.vkDevice.updateDescriptorSets(*drawStorage->descriptorSetWrites, nullptr);
 
-                    cycle->AttachObject(drawStorage);
+                    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, descriptorSet, nullptr);
                }
 
                commandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline);
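Taken together, the interconnect changes reduce to the following caller-side pattern; this is an illustrative sketch assuming `gpu`, `layout`, `descriptorSetWrites`, `pipelineLayout` and `commandBuffer` from the surrounding code, not a verbatim excerpt:

// Sketch: allocate once per draw, patch the buffered writes at recording time,
// then bind; the ActiveDescriptorSet must outlive execution of these commands
auto activeSet{gpu.descriptor.AllocateSet(layout)}; // Claims a slot or grows the pool

for (auto &write : descriptorSetWrites)
    write.dstSet = *activeSet; // operator*() yields the underlying vk::DescriptorSet

gpu.vkDevice.updateDescriptorSets(descriptorSetWrites, nullptr);
commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, *activeSet, nullptr);

// Destroying `activeSet` merely clears the slot's flag so a later draw can reuse the
// set; that is why it's attached to the executor as a dependency rather than freed here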