Implement thread pool based async pipeline compilation with futures

By distributing the load of shader compilation across multiple threads, and only waiting for completion when absolutely necessary, we can reduce compilation stutters significantly.
This commit is contained in:
Billy Laws 2022-12-03 19:57:00 +00:00
parent 186549748d
commit 072b8193a1
6 changed files with 78 additions and 54 deletions

View File

@ -296,32 +296,9 @@ namespace skyline::gpu::cache {
return lhs == rhs;
}
GraphicsPipelineCache::PipelineCacheEntry::PipelineCacheEntry(vk::raii::DescriptorSetLayout &&descriptorSetLayout, vk::raii::PipelineLayout &&pipelineLayout, vk::raii::Pipeline &&pipeline) : descriptorSetLayout(std::move(descriptorSetLayout)), pipelineLayout(std::move(pipelineLayout)), pipeline(std::move(pipeline)) {}
GraphicsPipelineCache::CompiledPipeline::CompiledPipeline(const PipelineCacheEntry &entry) : descriptorSetLayout(*entry.descriptorSetLayout), pipelineLayout(*entry.pipelineLayout), pipeline(*entry.pipeline) {}
GraphicsPipelineCache::CompiledPipeline GraphicsPipelineCache::GetCompiledPipeline(const PipelineState &state, span<const vk::DescriptorSetLayoutBinding> layoutBindings, span<const vk::PushConstantRange> pushConstantRanges, bool noPushDescriptors) {
std::unique_lock lock(mutex);
auto it{pipelineCache.find(state)};
if (it != pipelineCache.end())
return CompiledPipeline{it->second};
lock.unlock();
vk::raii::DescriptorSetLayout descriptorSetLayout{gpu.vkDevice, vk::DescriptorSetLayoutCreateInfo{
.flags = vk::DescriptorSetLayoutCreateFlags{(!noPushDescriptors && gpu.traits.supportsPushDescriptors) ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR : vk::DescriptorSetLayoutCreateFlags{}},
.pBindings = layoutBindings.data(),
.bindingCount = static_cast<u32>(layoutBindings.size()),
}};
vk::raii::PipelineLayout pipelineLayout{gpu.vkDevice, vk::PipelineLayoutCreateInfo{
.pSetLayouts = &*descriptorSetLayout,
.setLayoutCount = 1,
.pPushConstantRanges = pushConstantRanges.data(),
.pushConstantRangeCount = static_cast<u32>(pushConstantRanges.size()),
}};
GraphicsPipelineCache::PipelineCacheEntry::PipelineCacheEntry(vk::raii::DescriptorSetLayout &&descriptorSetLayout, vk::raii::PipelineLayout &&pipelineLayout) : descriptorSetLayout{std::move(descriptorSetLayout)}, pipelineLayout{std::move(pipelineLayout)} {}
vk::raii::Pipeline GraphicsPipelineCache::BuildPipeline(const PipelineCacheKey &key, vk::PipelineLayout pipelineLayout) {
boost::container::small_vector<vk::AttachmentDescription, 8> attachmentDescriptions;
boost::container::small_vector<vk::AttachmentReference, 8> attachmentReferences;
@ -329,7 +306,7 @@ namespace skyline::gpu::cache {
if (format != vk::Format::eUndefined) {
attachmentDescriptions.push_back(vk::AttachmentDescription{
.format = format,
.samples = state.sampleCount,
.samples = key.sampleCount,
.loadOp = vk::AttachmentLoadOp::eLoad,
.storeOp = vk::AttachmentStoreOp::eStore,
.stencilLoadOp = vk::AttachmentLoadOp::eLoad,
@ -354,11 +331,11 @@ namespace skyline::gpu::cache {
.pipelineBindPoint = vk::PipelineBindPoint::eGraphics,
};
for (auto &colorAttachment : state.colorFormats)
for (auto &colorAttachment : key.colorFormats)
pushAttachment(colorAttachment);
if (state.depthStencilFormat != vk::Format::eUndefined) {
pushAttachment(state.depthStencilFormat);
if (key.depthStencilFormat != vk::Format::eUndefined) {
pushAttachment(key.depthStencilFormat);
subpassDescription.pColorAttachments = attachmentReferences.data();
subpassDescription.colorAttachmentCount = static_cast<u32>(attachmentReferences.size() - 1);
@ -375,25 +352,48 @@ namespace skyline::gpu::cache {
.pSubpasses = &subpassDescription,
}};
auto pipeline{gpu.vkDevice.createGraphicsPipeline(vkPipelineCache, vk::GraphicsPipelineCreateInfo{
.pStages = state.shaderStages.data(),
.stageCount = static_cast<u32>(state.shaderStages.size()),
.pVertexInputState = &state.vertexState.get<vk::PipelineVertexInputStateCreateInfo>(),
.pInputAssemblyState = &state.inputAssemblyState,
.pViewportState = &state.viewportState,
.pRasterizationState = &state.rasterizationState.get<vk::PipelineRasterizationStateCreateInfo>(),
.pMultisampleState = &state.multisampleState,
.pDepthStencilState = &state.depthStencilState,
.pColorBlendState = &state.colorBlendState,
.pDynamicState = &state.dynamicState,
.layout = *pipelineLayout,
return gpu.vkDevice.createGraphicsPipeline(vkPipelineCache, vk::GraphicsPipelineCreateInfo{
.pStages = key.shaderStages.data(),
.stageCount = static_cast<u32>(key.shaderStages.size()),
.pVertexInputState = &key.vertexState.get<vk::PipelineVertexInputStateCreateInfo>(),
.pInputAssemblyState = &key.inputAssemblyState,
.pViewportState = &key.viewportState,
.pRasterizationState = &key.rasterizationState.get<vk::PipelineRasterizationStateCreateInfo>(),
.pMultisampleState = &key.multisampleState,
.pDepthStencilState = &key.depthStencilState,
.pColorBlendState = &key.colorBlendState,
.pDynamicState = &key.dynamicState,
.layout = pipelineLayout,
.renderPass = *renderPass,
.subpass = 0,
})};
});
}
lock.lock();
GraphicsPipelineCache::CompiledPipeline::CompiledPipeline(const PipelineCacheEntry &entry) : descriptorSetLayout{*entry.descriptorSetLayout}, pipelineLayout{*entry.pipelineLayout}, pipeline{*entry.pipeline} {}
auto pipelineEntryIt{pipelineCache.try_emplace(PipelineCacheKey{state}, std::move(descriptorSetLayout), std::move(pipelineLayout), std::move(pipeline))};
/**
 * @brief Looks up the cache entry matching the supplied state; on a miss, creates the entry's layouts and queues the pipeline build on the thread pool
 * @note The pipeline in the returned object is a shared future, so its build may still be in flight when this returns
 */
GraphicsPipelineCache::CompiledPipeline GraphicsPipelineCache::GetCompiledPipeline(const PipelineState &state, span<const vk::DescriptorSetLayoutBinding> layoutBindings, span<const vk::PushConstantRange> pushConstantRanges, bool noPushDescriptors) {
    // Held for the entire call, covering both the lookup and the insertion
    std::unique_lock lock(mutex);

    if (auto cached{pipelineCache.find(state)}; cached != pipelineCache.end())
        return CompiledPipeline{cached->second};

    // Push descriptors are only requested when the device supports them and the caller has not opted out
    bool usePushDescriptors{!noPushDescriptors && gpu.traits.supportsPushDescriptors};
    vk::DescriptorSetLayoutCreateInfo setLayoutInfo{
        .flags = vk::DescriptorSetLayoutCreateFlags{usePushDescriptors ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR : vk::DescriptorSetLayoutCreateFlags{}},
        .pBindings = layoutBindings.data(),
        .bindingCount = static_cast<u32>(layoutBindings.size()),
    };
    vk::raii::DescriptorSetLayout setLayout{gpu.vkDevice, setLayoutInfo};

    vk::PipelineLayoutCreateInfo layoutInfo{
        .pSetLayouts = &*setLayout,
        .setLayoutCount = 1,
        .pPushConstantRanges = pushConstantRanges.data(),
        .pushConstantRangeCount = static_cast<u32>(pushConstantRanges.size()),
    };
    vk::raii::PipelineLayout layout{gpu.vkDevice, layoutInfo};

    // The entry is inserted before the build is submitted so that the task can refer to the key and layout stored inside the map
    auto &[cacheKey, cacheEntry]{*pipelineCache.try_emplace(PipelineCacheKey{state}, std::move(setLayout), std::move(layout)).first};
    cacheEntry.pipeline = pool.submit(&GraphicsPipelineCache::BuildPipeline, this, std::ref(cacheKey), std::ref(*cacheEntry.pipelineLayout)).share();

    return CompiledPipeline{cacheEntry};
}
}

View File

@ -3,6 +3,8 @@
#pragma once
#include <future>
#include <BS_thread_pool.hpp>
#include <vulkan/vulkan_raii.hpp>
namespace skyline::gpu {
@ -136,20 +138,23 @@ namespace skyline::gpu::cache {
struct PipelineCacheEntry {
vk::raii::DescriptorSetLayout descriptorSetLayout;
vk::raii::PipelineLayout pipelineLayout;
vk::raii::Pipeline pipeline;
std::optional<std::shared_future<vk::raii::Pipeline>> pipeline;
PipelineCacheEntry(vk::raii::DescriptorSetLayout&& descriptorSetLayout, vk::raii::PipelineLayout &&layout, vk::raii::Pipeline &&pipeline);
PipelineCacheEntry(vk::raii::DescriptorSetLayout&& descriptorSetLayout, vk::raii::PipelineLayout &&layout);
};
BS::thread_pool pool;
std::unordered_map<PipelineCacheKey, PipelineCacheEntry, PipelineStateHash, PipelineCacheEqual> pipelineCache;
vk::raii::Pipeline BuildPipeline(const PipelineCacheKey &key, vk::PipelineLayout pipelineLayout);
public:
GraphicsPipelineCache(GPU &gpu);
struct CompiledPipeline {
vk::DescriptorSetLayout descriptorSetLayout;
vk::PipelineLayout pipelineLayout;
vk::Pipeline pipeline;
std::shared_future<vk::raii::Pipeline> pipeline;
CompiledPipeline(const PipelineCacheEntry &entry);
};

View File

@ -3,6 +3,7 @@
#pragma once
#include <future>
#include <gpu/interconnect/command_executor.h>
#include "common.h"
@ -261,6 +262,16 @@ namespace skyline::gpu::interconnect {
};
using SetPipelineCmd = CmdHolder<SetPipelineCmdImpl>;
struct SetPipelineFutureCmdImpl {
void Record(GPU &gpu, vk::raii::CommandBuffer &commandBuffer) {
commandBuffer.bindPipeline(bindPoint, *pipeline.get());
}
std::shared_future<vk::raii::Pipeline> pipeline;
vk::PipelineBindPoint bindPoint;
};
using SetPipelineFutureCmd = CmdHolder<SetPipelineFutureCmdImpl>;
/**
* @brief Single-use helper for recording a batch of state updates into a command buffer
*/
@ -471,6 +482,14 @@ namespace skyline::gpu::interconnect {
});
}
/**
 * @brief Appends a SetPipelineFutureCmd that binds the pipeline produced by the supplied future
 * @param pipeline Shared future yielding the pipeline, a copy of it is stored in the command
 * @param bindPoint The bind point the pipeline will be bound to
 */
void SetPipeline(const std::shared_future<vk::raii::Pipeline> &pipeline, vk::PipelineBindPoint bindPoint) {
AppendCmd<SetPipelineFutureCmd>(
{
.pipeline = pipeline,
.bindPoint = bindPoint,
});
}
void SetDescriptorSetWithPush(DescriptorUpdateInfo *updateInfo) {
AppendCmd<SetDescriptorSetWithPushCmd>(
{

View File

@ -205,7 +205,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
size_t PackedPipelineState::GetColorRenderTargetCount() const {
for (size_t i{engine::ColorTargetCount}; i > 0 ; i--)
if (IsColorRenderTargetEnabled(i - 1))
if (IsColorRenderTargetEnabled(ctSelect[i - 1]))
return i;
return 0;

View File

@ -530,7 +530,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
for (u32 i{}; i < packedState.GetColorRenderTargetCount(); i++) {
attachmentBlendStates.push_back(packedState.GetAttachmentBlendState(i));
texture::Format format{packedState.GetColorRenderTargetFormat(i)};
texture::Format format{packedState.GetColorRenderTargetFormat(packedState.ctSelect[i])};
colorAttachmentFormats.push_back(format ? format->vkFormat : vk::Format::eUndefined);
}
@ -595,10 +595,10 @@ namespace skyline::gpu::interconnect::maxwell3d {
}
Pipeline::Pipeline(InterconnectContext &ctx, const PipelineStateAccessor &accessor, const PackedPipelineState &packedState)
: shaderStages{MakePipelineShaders(ctx, accessor, packedState)},
: sourcePackedState{packedState},
shaderStages{MakePipelineShaders(ctx, accessor, sourcePackedState)},
descriptorInfo{MakePipelineDescriptorInfo(shaderStages, ctx.gpu.traits.quirks.needsIndividualTextureBindingWrites)},
compiledPipeline{MakeCompiledPipeline(ctx, packedState, shaderStages, descriptorInfo.descriptorSetLayoutBindings)},
sourcePackedState{packedState} {
compiledPipeline{MakeCompiledPipeline(ctx, sourcePackedState, shaderStages, descriptorInfo.descriptorSetLayoutBindings)} {
storageBufferViews.resize(descriptorInfo.totalStorageBufferCount);
}

View File

@ -82,6 +82,8 @@ namespace skyline::gpu::interconnect::maxwell3d {
u32 totalImageDescCount;
};
PackedPipelineState sourcePackedState;
private:
std::vector<CachedMappedBufferView> storageBufferViews;
u32 lastExecutionNumber{}; //!< The last execution number this pipeline was used at
@ -99,8 +101,6 @@ namespace skyline::gpu::interconnect::maxwell3d {
cache::GraphicsPipelineCache::CompiledPipeline compiledPipeline;
size_t sampledImageCount{};
PackedPipelineState sourcePackedState;
Pipeline(InterconnectContext &ctx, const PipelineStateAccessor &accessor, const PackedPipelineState &packedState);
Pipeline *LookupNext(const PackedPipelineState &packedState);