mirror of
https://github.com/skyline-emu/skyline.git
synced 2025-01-01 11:05:28 +03:00
Rework Descriptor Set Allocation/Updates
A substantial amount of time was spent on the creation and destruction of `VkDescriptorSet` objects, and this cost scales with the number of draws with bindings that a title performs. This led to poor performance on such titles, as the frametime was dragged down by these tasks while they repeatedly created descriptor sets with the same layouts. This commit fixes it by pooling descriptor sets per-layout in a dynamically resizable pool and keeping them around rather than destroying them after usage, so that in the vast majority of cases a new descriptor set does not need to be created at all. This significantly improves performance, as time is no longer spent on redundant destruction/recreation or on push descriptor updates, which took a substantial amount of time themselves. Additionally, `BaseDescriptorSizes` was not kept up to date with all of the descriptor types; this led to no crashes on Adreno/Mali, as the sizes were purely used for size calculations on either driver, but it has been corrected to avoid any future issues.
This commit is contained in:
parent
e1a4325137
commit
8fc3cc7a16
@ -6,72 +6,120 @@
|
||||
#include "descriptor_allocator.h"
|
||||
|
||||
namespace skyline::gpu {
|
||||
DescriptorAllocator::DescriptorPool::DescriptorPool(const vk::raii::Device &device, const vk::DescriptorPoolCreateInfo &createInfo) : vk::raii::DescriptorPool(device, createInfo), freeSetCount(createInfo.maxSets) {}
|
||||
/**
 * @brief Wraps an already allocated descriptor set handle in a slot
 * @note Only the handle is stored here, `active` keeps whatever its in-class initializer gives it
 */
DescriptorAllocator::DescriptorSetSlot::DescriptorSetSlot(vk::DescriptorSet descriptorSet)
    : descriptorSet{descriptorSet} {}
|
||||
|
||||
/**
 * @brief Move-constructs a slot by taking over the descriptor set handle of another slot
 * @note The handle in `other` is reset to null; the `active` flag of `other` is not read or transferred
 */
DescriptorAllocator::DescriptorSetSlot::DescriptorSetSlot(DescriptorAllocator::DescriptorSetSlot &&other)
    : descriptorSet{std::exchange(other.descriptorSet, nullptr)} {}
|
||||
|
||||
/**
 * @brief Creates the underlying Vulkan descriptor pool and seeds the free set counter with `createInfo.maxSets`
 */
DescriptorAllocator::DescriptorPool::DescriptorPool(const vk::raii::Device &device, const vk::DescriptorPoolCreateInfo &createInfo)
    : vk::raii::DescriptorPool{device, createInfo},
      freeSetCount{createInfo.maxSets} {}
|
||||
|
||||
void DescriptorAllocator::AllocateDescriptorPool() {
|
||||
namespace maxwell3d = soc::gm20b::engine::maxwell3d::type; // We use Maxwell3D as reference for base descriptor counts
|
||||
using DescriptorSizes = std::array<vk::DescriptorPoolSize, 2>;
|
||||
using DescriptorSizes = std::array<vk::DescriptorPoolSize, 5>;
|
||||
constexpr DescriptorSizes BaseDescriptorSizes{
|
||||
vk::DescriptorPoolSize{
|
||||
.descriptorCount = maxwell3d::PipelineStageConstantBufferCount,
|
||||
.type = vk::DescriptorType::eUniformBuffer,
|
||||
},
|
||||
vk::DescriptorPoolSize{
|
||||
.descriptorCount = maxwell3d::PipelineStageCount * 20,
|
||||
.descriptorCount = maxwell3d::PipelineStageCount * 5,
|
||||
.type = vk::DescriptorType::eStorageBuffer,
|
||||
},
|
||||
vk::DescriptorPoolSize{
|
||||
.descriptorCount = maxwell3d::PipelineStageCount * 5,
|
||||
.type = vk::DescriptorType::eCombinedImageSampler,
|
||||
},
|
||||
};
|
||||
vk::DescriptorPoolSize{
|
||||
.descriptorCount = maxwell3d::PipelineStageCount,
|
||||
.type = vk::DescriptorType::eStorageImage,
|
||||
},
|
||||
vk::DescriptorPoolSize{
|
||||
.descriptorCount = maxwell3d::RenderTargetCount,
|
||||
.type = vk::DescriptorType::eInputAttachment,
|
||||
},
|
||||
}; //!< A best approximate ratio of descriptors of each type that may be utilized, the total amount will grow in these ratios
|
||||
|
||||
DescriptorSizes descriptorSizes{BaseDescriptorSizes};
|
||||
for (auto &descriptorSize : descriptorSizes)
|
||||
descriptorSize.descriptorCount *= descriptorMultiplier;
|
||||
|
||||
pool = std::make_shared<DescriptorPool>(gpu.vkDevice, vk::DescriptorPoolCreateInfo{
|
||||
.flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet,
|
||||
.maxSets = descriptorSetCount,
|
||||
.pPoolSizes = descriptorSizes.data(),
|
||||
.poolSizeCount = descriptorSizes.size(),
|
||||
});
|
||||
}
|
||||
|
||||
DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pPool, vk::DescriptorSet set) : pool(std::move(pPool)), DescriptorSet(set) {
|
||||
/**
 * @brief Attempts to allocate a single descriptor set with the supplied layout from the current pool
 * @param layout The layout that the descriptor set should be allocated with
 * @return The Vulkan result code alongside the allocated set, the set is only meaningful when the result is `eSuccess`
 * @note `eSuccess`, `eErrorOutOfPoolMemory` and `eErrorFragmentedPool` are supplied to `vk::createResultValue` as accepted codes so that the caller can react to pool exhaustion/fragmentation itself
 */
vk::ResultValue<vk::DescriptorSet> DescriptorAllocator::AllocateVkDescriptorSet(vk::DescriptorSetLayout layout) {
    // Designators are listed in the declaration order of VkDescriptorSetAllocateInfo (descriptorPool, descriptorSetCount, pSetLayouts), out-of-order designators are only accepted as a compiler extension
    vk::DescriptorSetAllocateInfo allocateInfo{
        .descriptorPool = **pool,
        .descriptorSetCount = 1,
        .pSetLayouts = &layout,
    };
    vk::DescriptorSet descriptorSet{};

    // The raw (non-RAII) allocation entry point is used so that the result code is returned to us directly
    auto result{(*gpu.vkDevice).allocateDescriptorSets(&allocateInfo, &descriptorSet, *gpu.vkDevice.getDispatcher())};
    return vk::createResultValue(result, descriptorSet, __builtin_FUNCTION(), {
        vk::Result::eSuccess,
        vk::Result::eErrorOutOfPoolMemory,
        vk::Result::eErrorFragmentedPool
    });
}
|
||||
|
||||
/**
 * @brief Ties a slot to the pool it was allocated from and takes one set out of the pool's free count
 * @param pPool The pool that owns the slot, shared ownership is retained by this object
 * @param slot The slot holding the descriptor set that this object represents
 */
DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pPool, DescriptorSetSlot *slot)
    : pool{std::move(pPool)},
      slot{slot} {
    --pool->freeSetCount; // One fewer set is free in the pool while this object refers to the slot
}
|
||||
|
||||
DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(DescriptorAllocator::ActiveDescriptorSet &&other) noexcept {
|
||||
pool = std::move(other.pool);
|
||||
static_cast<vk::DescriptorSet &>(*this) = std::exchange(static_cast<vk::DescriptorSet &>(other), vk::DescriptorSet{});
|
||||
slot = std::exchange(other.slot, nullptr);
|
||||
}
|
||||
|
||||
DescriptorAllocator::ActiveDescriptorSet::~ActiveDescriptorSet() {
|
||||
if (static_cast<vk::DescriptorSet &>(*this)) {
|
||||
std::scoped_lock lock(*pool);
|
||||
pool->getDevice().freeDescriptorSets(**pool, 1, this, *pool->getDispatcher());
|
||||
if (slot) {
|
||||
slot->active.clear(std::memory_order_release);
|
||||
pool->freeSetCount++;
|
||||
}
|
||||
}
|
||||
|
||||
DescriptorAllocator::DescriptorAllocator(GPU &gpu) : gpu(gpu) {
|
||||
DescriptorAllocator::DescriptorAllocator(GPU &gpu) : gpu{gpu} {
|
||||
AllocateDescriptorPool();
|
||||
}
|
||||
|
||||
DescriptorAllocator::ActiveDescriptorSet DescriptorAllocator::AllocateSet(vk::DescriptorSetLayout layout) {
|
||||
std::scoped_lock allocatorLock(mutex);
|
||||
std::scoped_lock allocatorLock{mutex};
|
||||
|
||||
auto it{pool->layoutSlots.find(layout)};
|
||||
vk::Result lastResult{};
|
||||
if (it != pool->layoutSlots.end()) {
|
||||
auto &slots{it->second};
|
||||
for (auto &slot : it->second)
|
||||
if (!slot.active.test_and_set(std::memory_order_acq_rel))
|
||||
return ActiveDescriptorSet{pool, &slot};
|
||||
|
||||
// If we couldn't find an available slot, we need to allocate a new one
|
||||
auto set{AllocateVkDescriptorSet(layout)};
|
||||
if (set.result == vk::Result::eSuccess) {
|
||||
auto &slot{slots.emplace_back(set.value)};
|
||||
return ActiveDescriptorSet{pool, &slot};
|
||||
} else {
|
||||
lastResult = set.result;
|
||||
}
|
||||
} else {
|
||||
// If we couldn't find a layout, we need to allocate a new one
|
||||
auto set{AllocateVkDescriptorSet(layout)};
|
||||
if (set.result == vk::Result::eSuccess) {
|
||||
auto &layoutSlots{pool->layoutSlots.try_emplace(layout).first->second};
|
||||
return ActiveDescriptorSet{pool, &layoutSlots.emplace_back(set.value)};
|
||||
} else {
|
||||
lastResult = set.result;
|
||||
}
|
||||
}
|
||||
|
||||
while (true) {
|
||||
std::scoped_lock poolLock(*pool);
|
||||
|
||||
vk::DescriptorSetAllocateInfo allocateInfo{
|
||||
.descriptorPool = **pool,
|
||||
.pSetLayouts = &layout,
|
||||
.descriptorSetCount = 1,
|
||||
};
|
||||
vk::DescriptorSet set{};
|
||||
|
||||
auto result{(*gpu.vkDevice).allocateDescriptorSets(&allocateInfo, &set, *gpu.vkDevice.getDispatcher())};
|
||||
if (result == vk::Result::eSuccess) {
|
||||
return ActiveDescriptorSet(pool, set);
|
||||
} else if (result == vk::Result::eErrorOutOfPoolMemory) {
|
||||
// We attempt to modify the pool based on the last result
|
||||
if (lastResult == vk::Result::eErrorOutOfPoolMemory) {
|
||||
if (pool->freeSetCount == 0)
|
||||
// The amount of maximum descriptor sets is insufficient
|
||||
descriptorSetCount += DescriptorSetCountIncrement;
|
||||
@ -79,12 +127,17 @@ namespace skyline::gpu {
|
||||
// The amount of maximum descriptors is insufficient
|
||||
descriptorMultiplier++;
|
||||
AllocateDescriptorPool();
|
||||
continue; // Attempt to allocate again with the new pool
|
||||
} else if (result == vk::Result::eErrorFragmentedPool) {
|
||||
} else if (lastResult == vk::Result::eErrorFragmentedPool) {
|
||||
AllocateDescriptorPool(); // If the pool is fragmented, we reallocate without increasing the size
|
||||
continue;
|
||||
}
|
||||
|
||||
// Try to allocate a new layout
|
||||
auto set{AllocateVkDescriptorSet(layout)};
|
||||
if (set.result == vk::Result::eSuccess) {
|
||||
auto &layoutSlots{pool->layoutSlots.try_emplace(layout).first->second};
|
||||
return ActiveDescriptorSet{pool, &layoutSlots.emplace_back(set.value)};
|
||||
} else {
|
||||
vk::throwResultException(result, __builtin_FUNCTION());
|
||||
lastResult = set.result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3,7 +3,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "fence_cycle.h"
|
||||
#include <vulkan/vulkan.hpp>
|
||||
#include <common.h>
|
||||
|
||||
namespace skyline::gpu {
|
||||
/**
|
||||
@ -18,11 +19,24 @@ namespace skyline::gpu {
|
||||
u32 descriptorSetCount{DescriptorSetCountIncrement}; //!< The maximum amount of descriptor sets in the pool
|
||||
u32 descriptorMultiplier{1}; //!< A multiplier for the maximum amount of descriptors in the pool
|
||||
|
||||
/**
|
||||
* @brief A slot representing a single descriptor set dynamically allocated from the pool
|
||||
*/
|
||||
struct DescriptorSetSlot {
|
||||
std::atomic_flag active{true}; //!< If the descriptor is currently being utilized, a slot is initialized with this set and it is cleared when the slot is released for reuse
|
||||
vk::DescriptorSet descriptorSet; //!< The descriptor set allocated from the pool
|
||||
|
||||
DescriptorSetSlot(vk::DescriptorSet descriptorSet); //!< Wraps an already allocated descriptor set handle
|
||||
|
||||
DescriptorSetSlot(DescriptorSetSlot &&other); //!< Takes over the handle held by `other` and resets it to null there
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A lockable VkDescriptorPool for maintaining external synchronization requirements
|
||||
*/
|
||||
struct DescriptorPool : public std::mutex, public vk::raii::DescriptorPool {
|
||||
u64 freeSetCount{}; //!< The amount of sets free to allocate from this pool
|
||||
struct DescriptorPool : public vk::raii::DescriptorPool {
|
||||
std::atomic<u64> freeSetCount{}; //!< The amount of sets free to allocate from this pool
|
||||
std::unordered_map<vk::DescriptorSetLayout, std::list<DescriptorSetSlot>> layoutSlots; //!< A map of pools based on the layout of the descriptor sets
|
||||
|
||||
DescriptorPool(vk::raii::Device const &device, vk::DescriptorPoolCreateInfo const &createInfo);
|
||||
};
|
||||
@ -35,35 +49,47 @@ namespace skyline::gpu {
|
||||
*/
|
||||
void AllocateDescriptorPool();
|
||||
|
||||
/**
|
||||
* @brief Allocates a descriptor set with the specified layout from the pool
|
||||
* @return An error code that's either `eSuccess`, `eErrorOutOfPoolMemory` or `eErrorFragmentedPool`
|
||||
*/
|
||||
vk::ResultValue<vk::DescriptorSet> AllocateVkDescriptorSet(vk::DescriptorSetLayout layout);
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief A RAII-bound descriptor set that automatically frees resources into the pool on destruction while respecting external synchronization requirements
|
||||
*/
|
||||
struct ActiveDescriptorSet : public vk::DescriptorSet {
|
||||
struct ActiveDescriptorSet {
|
||||
private:
|
||||
friend DescriptorAllocator;
|
||||
std::shared_ptr<DescriptorPool> pool;
|
||||
DescriptorSetSlot *slot;
|
||||
|
||||
/**
|
||||
* @note The supplied pool **must** be locked prior to calling this
|
||||
*/
|
||||
ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pool, vk::DescriptorSet set);
|
||||
friend class DescriptorAllocator;
|
||||
|
||||
ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pool, DescriptorSetSlot *slot);
|
||||
|
||||
public:
|
||||
ActiveDescriptorSet(ActiveDescriptorSet &&other) noexcept;
|
||||
|
||||
/* Delete the move constructor to prevent early freeing of the descriptor set */
|
||||
/* Delete the copy constructor/assignment to prevent early freeing of the descriptor set */
|
||||
ActiveDescriptorSet(const ActiveDescriptorSet &) = delete;
|
||||
|
||||
ActiveDescriptorSet &operator=(const ActiveDescriptorSet &) = delete;
|
||||
|
||||
~ActiveDescriptorSet();
|
||||
|
||||
vk::DescriptorSet &operator*() const {
|
||||
return slot->descriptorSet;
|
||||
}
|
||||
};
|
||||
|
||||
DescriptorAllocator(GPU &gpu);
|
||||
|
||||
/**
|
||||
* @note It is UB to allocate a set with a descriptor type that isn't in the pool as defined in AllocateDescriptorPool
|
||||
* @brief Allocates a descriptor set from the pool with the supplied layout
|
||||
* @note The layout object must be reused for equivalent layouts to avoid unnecessary descriptor set creation
|
||||
* @note It is UB to allocate a set with a descriptor type that isn't in the pool as defined in AllocateDescriptorPool()
|
||||
* @note The supplied ActiveDescriptorSet **must** stay alive until the descriptor set can be freed; it must not be destroyed immediately after being bound, only after any associated commands have completed execution
|
||||
*/
|
||||
ActiveDescriptorSet AllocateSet(vk::DescriptorSetLayout layout);
|
||||
};
|
||||
|
@ -1893,7 +1893,7 @@ namespace skyline::gpu::interconnect {
|
||||
|
||||
public:
|
||||
void SetPrimitiveTopology(maxwell3d::PrimitiveTopology topology) {
|
||||
auto[vkTopology, shaderTopology, isQuad] = [topology]() -> std::tuple<vk::PrimitiveTopology, ShaderCompiler::InputTopology, bool> {
|
||||
auto[vkTopology, shaderTopology, isQuad]{[topology]() -> std::tuple<vk::PrimitiveTopology, ShaderCompiler::InputTopology, bool> {
|
||||
using MaxwellTopology = maxwell3d::PrimitiveTopology;
|
||||
using VkTopology = vk::PrimitiveTopology;
|
||||
using ShaderTopology = ShaderCompiler::InputTopology;
|
||||
@ -1922,7 +1922,7 @@ namespace skyline::gpu::interconnect {
|
||||
default:
|
||||
throw exception("Unimplemented Maxwell3D Primitive Topology: {}", maxwell3d::ToString(topology));
|
||||
}
|
||||
}();
|
||||
}()};
|
||||
|
||||
inputAssemblyState.topology = vkTopology;
|
||||
needsQuadConversion = isQuad;
|
||||
@ -2844,7 +2844,7 @@ namespace skyline::gpu::interconnect {
|
||||
}
|
||||
} else if (needsQuadConversion) {
|
||||
// Convert the guest-supplied quad list to an indexed triangle list
|
||||
auto[bufferView, indexType, indexCount] = GetNonIndexedQuadConversionBuffer(count);
|
||||
auto[bufferView, indexType, indexCount]{GetNonIndexedQuadConversionBuffer(count)};
|
||||
executor.AttachBuffer(bufferView);
|
||||
|
||||
count = indexCount;
|
||||
@ -2948,28 +2948,23 @@ namespace skyline::gpu::interconnect {
|
||||
.depthStencilAttachment = depthRenderTargetView,
|
||||
}, programState.descriptorSetBindings)};
|
||||
|
||||
// Draw Persistent Storage
|
||||
// Descriptor Set Binding + Update Setup
|
||||
struct DrawStorage {
|
||||
ShaderProgramState::DescriptorSetWrites descriptorSetWrites;
|
||||
std::optional<DescriptorAllocator::ActiveDescriptorSet> descriptorSet;
|
||||
DescriptorAllocator::ActiveDescriptorSet descriptorSet;
|
||||
|
||||
DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites) : descriptorSetWrites(std::move(descriptorSetWrites)) {}
|
||||
|
||||
DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites, DescriptorAllocator::ActiveDescriptorSet &&descriptorSet) : descriptorSetWrites(std::move(descriptorSetWrites)), descriptorSet(std::move(descriptorSet)) {}
|
||||
DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites, DescriptorAllocator::ActiveDescriptorSet &&descriptorSet) : descriptorSetWrites{std::move(descriptorSetWrites)}, descriptorSet{std::move(descriptorSet)} {}
|
||||
};
|
||||
|
||||
std::shared_ptr<DrawStorage> drawStorage{};
|
||||
if (!programState.descriptorSetWrites->empty()) {
|
||||
if (gpu.traits.supportsPushDescriptors)
|
||||
drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites));
|
||||
else {
|
||||
drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites), gpu.descriptor.AllocateSet(compiledPipeline.descriptorSetLayout));
|
||||
}
|
||||
drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites), gpu.descriptor.AllocateSet(compiledPipeline.descriptorSetLayout));
|
||||
// We can't update the descriptor set here as the bindings might be retroactively updated by future draws
|
||||
executor.AttachDependency(drawStorage);
|
||||
}
|
||||
|
||||
// Submit Draw
|
||||
executor.AddSubpass([=, drawStorage = std::move(drawStorage), &vkDevice = gpu.vkDevice, pipelineLayout = compiledPipeline.pipelineLayout, pipeline = compiledPipeline.pipeline, supportsPushDescriptors = gpu.traits.supportsPushDescriptors](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &, vk::RenderPass renderPass, u32 subpassIndex) mutable {
|
||||
|
||||
executor.AddSubpass([=, drawStorage = std::move(drawStorage), pipelineLayout = compiledPipeline.pipelineLayout, pipeline = compiledPipeline.pipeline](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &, vk::RenderPass renderPass, u32 subpassIndex) mutable {
|
||||
auto &vertexBufferHandles{boundVertexBuffers->handles};
|
||||
for (u32 bindingIndex{}; bindingIndex != vertexBufferHandles.size(); bindingIndex++) {
|
||||
// We need to bind all non-null vertex buffers while skipping any null ones
|
||||
@ -2984,16 +2979,12 @@ namespace skyline::gpu::interconnect {
|
||||
}
|
||||
|
||||
if (drawStorage) {
|
||||
if (supportsPushDescriptors) {
|
||||
commandBuffer.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, *drawStorage->descriptorSetWrites);
|
||||
} else {
|
||||
for (auto &descriptorSetWrite : *drawStorage->descriptorSetWrites)
|
||||
descriptorSetWrite.dstSet = *drawStorage->descriptorSet;
|
||||
vkDevice.updateDescriptorSets(*drawStorage->descriptorSetWrites, nullptr);
|
||||
commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, *drawStorage->descriptorSet, nullptr);
|
||||
}
|
||||
vk::DescriptorSet descriptorSet{*drawStorage->descriptorSet};
|
||||
for (auto &descriptorSetWrite : *drawStorage->descriptorSetWrites)
|
||||
descriptorSetWrite.dstSet = descriptorSet;
|
||||
gpu.vkDevice.updateDescriptorSets(*drawStorage->descriptorSetWrites, nullptr);
|
||||
|
||||
cycle->AttachObject(drawStorage);
|
||||
commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, descriptorSet, nullptr);
|
||||
}
|
||||
|
||||
commandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline);
|
||||
|
Loading…
Reference in New Issue
Block a user