Begin command buffers asynchronously in command executor

vkBeginCommandBuffer can take quite some time on Adreno GPUs; move it to the cycle waiter thread where it won't block the GPFIFO thread.
This commit is contained in:
Billy Laws 2022-11-06 19:18:36 +00:00
parent 4b3e906c22
commit 1f9de17e98
2 changed files with 49 additions and 5 deletions

View File

@ -15,6 +15,12 @@ namespace skyline::gpu::interconnect {
outgoing{*state.settings->executorSlotCount},
thread{&CommandRecordThread::Run, this} {}
// Records which slot's command buffer should be begun; the work itself is deferred to the destructor
CommandRecordThread::Slot::ScopedBegin::ScopedBegin(CommandRecordThread::Slot &slot) : slot{slot} {}

// Runs when the owning FenceCycle releases its attached objects (i.e. on the
// cycle waiter thread), re-beginning the slot's command buffer off the
// GPFIFO/record threads so they never pay the cost of vkBeginCommandBuffer
CommandRecordThread::Slot::ScopedBegin::~ScopedBegin() {
slot.Begin();
}
static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
return {gpu.vkDevice, (*gpu.vkDevice).allocateCommandBuffers(
{
@ -35,14 +41,17 @@ namespace skyline::gpu::interconnect {
commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
semaphore{gpu.vkDevice, vk::SemaphoreCreateInfo{}},
cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {}
cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {
Begin();
}
// Move constructor: transfers the Vulkan handles and fence cycle, and copies
// the `ready` flag so an already-begun command buffer remains usable after
// the move.
// NOTE(review): beginLock/beginCondition are deliberately NOT moved
// (std::mutex/std::condition_variable are immovable) and are
// default-constructed in the new object — assumes no thread is blocked in
// WaitReady()/Begin() on the moved-from slot at this point; verify against
// callers.
CommandRecordThread::Slot::Slot(Slot &&other)
: commandPool{std::move(other.commandPool)},
commandBuffer{std::move(other.commandBuffer)},
fence{std::move(other.fence)},
semaphore{std::move(other.semaphore)},
cycle{std::move(other.cycle)},
ready{other.ready} {}
std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
cycle->Wait();
@ -51,6 +60,21 @@ namespace skyline::gpu::interconnect {
return cycle;
}
void CommandRecordThread::Slot::WaitReady() {
    // Block until Begin() has been called on this slot's command buffer so it
    // can safely be recorded into, then attach a ScopedBegin to the cycle so
    // the next begin happens on the cycle waiter thread after submission
    std::unique_lock lock{beginLock};
    while (!ready)
        beginCondition.wait(lock);
    cycle->AttachObject(std::make_shared<ScopedBegin>(*this));
}
void CommandRecordThread::Slot::Begin() {
    // Called at construction and from ~ScopedBegin() on the cycle waiter
    // thread; begins the command buffer for one-time submission, then flips
    // `ready` and wakes any thread blocked in WaitReady()
    std::lock_guard lock{beginLock};
    commandBuffer.begin(vk::CommandBufferBeginInfo{
        .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
    });
    ready = true;
    beginCondition.notify_all();
}
void CommandRecordThread::ProcessSlot(Slot *slot) {
TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionNumber);
auto &gpu{*state.gpu};
@ -83,6 +107,7 @@ namespace skyline::gpu::interconnect {
}
slot->commandBuffer.end();
slot->ready = false;
gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle);
@ -404,9 +429,7 @@ namespace skyline::gpu::interconnect {
FinishRenderPass();
{
slot->commandBuffer.begin(vk::CommandBufferBeginInfo{
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
});
slot->WaitReady();
// We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands
slot->commandBuffer.pipelineBarrier(

View File

@ -19,6 +19,17 @@ namespace skyline::gpu::interconnect {
* @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
*/
struct Slot {
/**
* @brief Helper to begin the slot command buffer on the cycle waiter thread
*/
struct ScopedBegin {
Slot &slot;
ScopedBegin(Slot &slot);
~ScopedBegin();
};
vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
vk::raii::CommandBuffer commandBuffer;
vk::raii::Fence fence;
@ -26,7 +37,10 @@ namespace skyline::gpu::interconnect {
std::shared_ptr<FenceCycle> cycle;
boost::container::stable_vector<node::NodeVariant> nodes;
LinearAllocatorState<> allocator;
std::mutex beginLock;
std::condition_variable beginCondition;
u32 executionNumber;
bool ready{}; //!< If this slot's command buffer has had 'beginCommandBuffer' called and is ready to have commands recorded into it
bool capture{}; //!< If this slot's Vulkan commands should be captured using the renderdoc API
Slot(GPU &gpu);
@ -38,6 +52,13 @@ namespace skyline::gpu::interconnect {
* @note A new fence cycle for the reset command buffer
*/
std::shared_ptr<FenceCycle> Reset(GPU &gpu);
/**
* @brief Waits for the command buffer to have been begun so that it can be recorded into
*/
void WaitReady();
void Begin();
};
private: