mirror of
https://github.com/skyline-emu/skyline.git
synced 2025-01-01 07:35:29 +03:00
Implement asynchronous command recording
Recording of command nodes into Vulkan command buffers is very easily parallelisable, as it can effectively be treated as part of the GPU execution, which is inherently async. By moving it to a separate thread we can shave off about 20% of GPFIFO execution time. It should be noted that the command scheduler's command buffer infrastructure is no longer used, since we need to record texture updates on the GPFIFO thread (while another slot is being recorded on the record thread) and then use the same command buffer on the record thread later. This ends up requiring a command pool per slot, which is reasonable considering we only have four slots by default.
This commit is contained in:
parent
a197dd2b28
commit
7c9212743c
@ -1,16 +1,137 @@
|
|||||||
// SPDX-License-Identifier: MPL-2.0
|
// SPDX-License-Identifier: MPL-2.0
|
||||||
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||||
|
|
||||||
|
#include <loader/loader.h>
|
||||||
#include <gpu.h>
|
#include <gpu.h>
|
||||||
#include "command_executor.h"
|
#include "command_executor.h"
|
||||||
|
|
||||||
namespace skyline::gpu::interconnect {
|
namespace skyline::gpu::interconnect {
|
||||||
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, tag{AllocateTag()} {}
|
/**
 * @brief Stores the device state and launches the record thread, which begins executing Run() immediately
 * @note Any member that Run() touches has to be declared before `thread` in the class so that it is constructed before the thread starts
 */
CommandRecordThread::CommandRecordThread(const DeviceState &state)
    : state{state},
      thread{&CommandRecordThread::Run, this} {}
|
||||||
|
|
||||||
|
/**
 * @brief Allocates a single primary command buffer from the supplied pool and wraps it in a RAII handle
 * @param gpu The GPU whose Vulkan device performs the allocation
 * @param pool The command pool that the command buffer is allocated from
 * @return A RAII command buffer constructed from the device, the allocated buffer and the pool
 */
static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
    vk::CommandBufferAllocateInfo allocateInfo{
        .commandPool = *pool,
        .level = vk::CommandBufferLevel::ePrimary,
        .commandBufferCount = 1
    };

    // Allocation goes through the non-RAII device handle, exactly one buffer is requested so only the first element is used
    auto allocatedBuffers{(*gpu.vkDevice).allocateCommandBuffers(allocateInfo, *gpu.vkDevice.getDispatcher())};
    return {gpu.vkDevice, allocatedBuffers.front(), *pool};
}
|
||||||
|
|
||||||
|
CommandRecordThread::Slot::Slot(GPU &gpu)
|
||||||
|
: commandPool{gpu.vkDevice,
|
||||||
|
vk::CommandPoolCreateInfo{
|
||||||
|
.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | vk::CommandPoolCreateFlagBits::eTransient,
|
||||||
|
.queueFamilyIndex = gpu.vkQueueFamilyIndex
|
||||||
|
}
|
||||||
|
},
|
||||||
|
commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
|
||||||
|
fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
|
||||||
|
cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {}
|
||||||
|
|
||||||
|
/**
 * @brief Waits on the slot's current fence cycle, replaces it with a fresh cycle over the same fence and resets the command buffer
 * @return The newly created fence cycle, which is also stored in the slot
 */
std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
    // Wait on the previous cycle before the slot's fence and command buffer are reused
    cycle->Wait();

    cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence);

    commandBuffer.reset();

    return cycle;
}
|
||||||
|
|
||||||
|
/**
 * @brief Records every node of the slot into the slot's command buffer, ends the command buffer and submits it
 * @param slot The slot to record; its node list is cleared and its allocator is reset once the submission has been made
 * @note The buffer recreation mutex is held for the entire duration of this function
 * @note This function only calls `end()` on the command buffer, `begin()` is expected to have been called on it beforehand
 */
void CommandRecordThread::ProcessSlot(Slot *slot) {
    auto &gpu{*state.gpu};

    vk::RenderPass lRenderPass;
    u32 subpassIndex{}; // Value-initialised so that a subpass node visited before any RenderPassNode never reads an indeterminate index

    std::scoped_lock bufferLock{gpu.buffer.recreationMutex};
    using namespace node;
    for (NodeVariant &node : slot->nodes) {
        // Nodes that only need the command buffer, cycle and GPU all share this trivial visitor
        #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
        std::visit(VariantVisitor{
            NODE(FunctionNode),

            // Starting a render pass yields the handle that subsequent subpass nodes are given and restarts the subpass counter
            [&](RenderPassNode &node) {
                lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
                subpassIndex = 0;
            },

            [&](NextSubpassNode &node) {
                node(slot->commandBuffer, slot->cycle, gpu);
                ++subpassIndex;
            },
            [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); },
            // The index is advanced before the call so that the node receives the index of the subpass it moves to
            [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); },

            NODE(RenderPassEndNode),
        }, node);
        #undef NODE
    }

    slot->commandBuffer.end();

    gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle);

    slot->nodes.clear();
    slot->allocator.Reset();
}
|
||||||
|
|
||||||
|
void CommandRecordThread::Run() {
|
||||||
|
auto &gpu{*state.gpu};
|
||||||
|
std::array<Slot, ActiveRecordSlots> slots{{gpu, gpu, gpu, gpu}};
|
||||||
|
outgoing.AppendTranform(span<Slot>(slots), [](auto &slot) { return &slot; });
|
||||||
|
|
||||||
|
if (int result{pthread_setname_np(pthread_self(), "Sky-CmdRecord")})
|
||||||
|
Logger::Warn("Failed to set the thread name: {}", strerror(result));
|
||||||
|
|
||||||
|
try {
|
||||||
|
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
|
||||||
|
|
||||||
|
incoming.Process([this](Slot *slot) {
|
||||||
|
ProcessSlot(slot);
|
||||||
|
outgoing.Push(slot);
|
||||||
|
}, [] {});
|
||||||
|
} catch (const signal::SignalException &e) {
|
||||||
|
Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
|
||||||
|
if (state.process)
|
||||||
|
state.process->Kill(false);
|
||||||
|
else
|
||||||
|
std::rethrow_exception(std::current_exception());
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
Logger::Error(e.what());
|
||||||
|
if (state.process)
|
||||||
|
state.process->Kill(false);
|
||||||
|
else
|
||||||
|
std::rethrow_exception(std::current_exception());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * @brief Takes the next slot from the outgoing queue
 * @return The popped slot
 */
CommandRecordThread::Slot *CommandRecordThread::AcquireSlot() {
    Slot *acquired{outgoing.Pop()};
    return acquired;
}
|
||||||
|
|
||||||
|
/**
 * @brief Hands a slot over to the record thread by pushing it onto the incoming queue
 * @param slot The slot whose nodes should be recorded and submitted
 */
void CommandRecordThread::ReleaseSlot(Slot *slot) {
    incoming.Push(slot);
}
|
||||||
|
|
||||||
|
CommandExecutor::CommandExecutor(const DeviceState &state)
|
||||||
|
: gpu{*state.gpu},
|
||||||
|
recordThread{state},
|
||||||
|
tag{AllocateTag()} {
|
||||||
|
RotateRecordSlot();
|
||||||
|
}
|
||||||
|
|
||||||
CommandExecutor::~CommandExecutor() {
|
CommandExecutor::~CommandExecutor() {
|
||||||
cycle->Cancel();
|
cycle->Cancel();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @brief Hands the current slot (if any) to the record thread and switches the executor over to a freshly acquired one
 * @note `cycle` and `allocator` are repointed at the new slot
 */
void CommandExecutor::RotateRecordSlot() {
    // There is no slot to release the first time this is called
    if (slot != nullptr) {
        recordThread.ReleaseSlot(slot);
    }

    slot = recordThread.AcquireSlot();

    // Reset waits on the slot's previous cycle and returns the new cycle for the reset command buffer
    cycle = slot->Reset(gpu);
    allocator = &slot->allocator;
}
|
||||||
|
|
||||||
TextureManager &CommandExecutor::AcquireTextureManager() {
|
TextureManager &CommandExecutor::AcquireTextureManager() {
|
||||||
if (!textureManagerLock)
|
if (!textureManagerLock)
|
||||||
textureManagerLock.emplace(gpu.texture);
|
textureManagerLock.emplace(gpu.texture);
|
||||||
@ -55,8 +176,8 @@ namespace skyline::gpu::interconnect {
|
|||||||
if (renderPass == nullptr || renderPass->renderArea != renderArea || subpassCount >= gpu.traits.quirks.maxSubpassCount) {
|
if (renderPass == nullptr || renderPass->renderArea != renderArea || subpassCount >= gpu.traits.quirks.maxSubpassCount) {
|
||||||
// We need to create a render pass if one doesn't already exist or the current one isn't compatible
|
// We need to create a render pass if one doesn't already exist or the current one isn't compatible
|
||||||
if (renderPass != nullptr)
|
if (renderPass != nullptr)
|
||||||
nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
|
slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
|
||||||
renderPass = &std::get<node::RenderPassNode>(nodes.emplace_back(std::in_place_type_t<node::RenderPassNode>(), renderArea));
|
renderPass = &std::get<node::RenderPassNode>(slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassNode>(), renderArea));
|
||||||
addSubpass();
|
addSubpass();
|
||||||
subpassCount = 1;
|
subpassCount = 1;
|
||||||
return false;
|
return false;
|
||||||
@ -77,7 +198,7 @@ namespace skyline::gpu::interconnect {
|
|||||||
|
|
||||||
void CommandExecutor::FinishRenderPass() {
|
void CommandExecutor::FinishRenderPass() {
|
||||||
if (renderPass) {
|
if (renderPass) {
|
||||||
nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
|
slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
|
||||||
|
|
||||||
renderPass = nullptr;
|
renderPass = nullptr;
|
||||||
subpassCount = 0;
|
subpassCount = 0;
|
||||||
@ -168,9 +289,9 @@ namespace skyline::gpu::interconnect {
|
|||||||
|
|
||||||
bool gotoNext{CreateRenderPassWithSubpass(renderArea, inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr)};
|
bool gotoNext{CreateRenderPassWithSubpass(renderArea, inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr)};
|
||||||
if (gotoNext)
|
if (gotoNext)
|
||||||
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), std::forward<decltype(function)>(function));
|
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), std::forward<decltype(function)>(function));
|
||||||
else
|
else
|
||||||
nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), std::forward<decltype(function)>(function));
|
slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), std::forward<decltype(function)>(function));
|
||||||
|
|
||||||
if (exclusiveSubpass)
|
if (exclusiveSubpass)
|
||||||
FinishRenderPass();
|
FinishRenderPass();
|
||||||
@ -180,14 +301,14 @@ namespace skyline::gpu::interconnect {
|
|||||||
if (renderPass)
|
if (renderPass)
|
||||||
FinishRenderPass();
|
FinishRenderPass();
|
||||||
|
|
||||||
nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), std::forward<decltype(function)>(function));
|
slot->nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), std::forward<decltype(function)>(function));
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommandExecutor::AddClearColorSubpass(TextureView *attachment, const vk::ClearColorValue &value) {
|
void CommandExecutor::AddClearColorSubpass(TextureView *attachment, const vk::ClearColorValue &value) {
|
||||||
bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, attachment, nullptr)};
|
bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, attachment, nullptr)};
|
||||||
if (renderPass->ClearColorAttachment(0, value, gpu)) {
|
if (renderPass->ClearColorAttachment(0, value, gpu)) {
|
||||||
if (gotoNext)
|
if (gotoNext)
|
||||||
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
|
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
|
||||||
} else {
|
} else {
|
||||||
auto function{[scissor = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
|
auto function{[scissor = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
|
||||||
commandBuffer.clearAttachments(vk::ClearAttachment{
|
commandBuffer.clearAttachments(vk::ClearAttachment{
|
||||||
@ -202,9 +323,9 @@ namespace skyline::gpu::interconnect {
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
if (gotoNext)
|
if (gotoNext)
|
||||||
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
|
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
|
||||||
else
|
else
|
||||||
nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
|
slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -212,7 +333,7 @@ namespace skyline::gpu::interconnect {
|
|||||||
bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, {}, attachment)};
|
bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, {}, attachment)};
|
||||||
if (renderPass->ClearDepthStencilAttachment(value, gpu)) {
|
if (renderPass->ClearDepthStencilAttachment(value, gpu)) {
|
||||||
if (gotoNext)
|
if (gotoNext)
|
||||||
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
|
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
|
||||||
} else {
|
} else {
|
||||||
auto function{[aspect = attachment->format->vkAspect, extent = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
|
auto function{[aspect = attachment->format->vkAspect, extent = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
|
||||||
commandBuffer.clearAttachments(vk::ClearAttachment{
|
commandBuffer.clearAttachments(vk::ClearAttachment{
|
||||||
@ -226,9 +347,9 @@ namespace skyline::gpu::interconnect {
|
|||||||
}};
|
}};
|
||||||
|
|
||||||
if (gotoNext)
|
if (gotoNext)
|
||||||
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
|
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
|
||||||
else
|
else
|
||||||
nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
|
slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -241,13 +362,12 @@ namespace skyline::gpu::interconnect {
|
|||||||
FinishRenderPass();
|
FinishRenderPass();
|
||||||
|
|
||||||
{
|
{
|
||||||
auto &commandBuffer{*activeCommandBuffer};
|
slot->commandBuffer.begin(vk::CommandBufferBeginInfo{
|
||||||
commandBuffer.begin(vk::CommandBufferBeginInfo{
|
|
||||||
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
|
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
|
||||||
});
|
});
|
||||||
|
|
||||||
// We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands
|
// We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands
|
||||||
commandBuffer.pipelineBarrier(
|
slot->commandBuffer.pipelineBarrier(
|
||||||
vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
|
vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
|
||||||
.srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
|
.srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
|
||||||
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
|
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
|
||||||
@ -255,43 +375,12 @@ namespace skyline::gpu::interconnect {
|
|||||||
);
|
);
|
||||||
|
|
||||||
for (const auto &texture : attachedTextures)
|
for (const auto &texture : attachedTextures)
|
||||||
texture->SynchronizeHostInline(commandBuffer, cycle, true);
|
texture->SynchronizeHostInline(slot->commandBuffer, cycle, true);
|
||||||
|
|
||||||
vk::RenderPass lRenderPass;
|
|
||||||
u32 subpassIndex;
|
|
||||||
|
|
||||||
using namespace node;
|
|
||||||
for (NodeVariant &node : nodes) {
|
|
||||||
#define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
|
|
||||||
std::visit(VariantVisitor{
|
|
||||||
NODE(FunctionNode),
|
|
||||||
|
|
||||||
[&](RenderPassNode &node) {
|
|
||||||
lRenderPass = node(commandBuffer, cycle, gpu);
|
|
||||||
subpassIndex = 0;
|
|
||||||
},
|
|
||||||
|
|
||||||
[&](NextSubpassNode &node) {
|
|
||||||
node(commandBuffer, cycle, gpu);
|
|
||||||
++subpassIndex;
|
|
||||||
},
|
|
||||||
[&](SubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, subpassIndex); },
|
|
||||||
[&](NextSubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, ++subpassIndex); },
|
|
||||||
|
|
||||||
NODE(RenderPassEndNode),
|
|
||||||
}, node);
|
|
||||||
#undef NODE
|
|
||||||
}
|
}
|
||||||
|
|
||||||
commandBuffer.end();
|
|
||||||
|
|
||||||
for (const auto &attachedBuffer : attachedBuffers)
|
for (const auto &attachedBuffer : attachedBuffers)
|
||||||
if (attachedBuffer->SequencedCpuBackingWritesBlocked())
|
if (attachedBuffer->SequencedCpuBackingWritesBlocked())
|
||||||
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls
|
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer
|
||||||
|
|
||||||
gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle);
|
|
||||||
|
|
||||||
nodes.clear();
|
|
||||||
|
|
||||||
for (const auto &attachedTexture : attachedTextures) {
|
for (const auto &attachedTexture : attachedTextures) {
|
||||||
// We don't need to attach the Texture to the cycle as a TextureView will already be attached
|
// We don't need to attach the Texture to the cycle as a TextureView will already be attached
|
||||||
@ -305,7 +394,8 @@ namespace skyline::gpu::interconnect {
|
|||||||
attachedBuffer->UpdateCycle(cycle);
|
attachedBuffer->UpdateCycle(cycle);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
RotateRecordSlot();
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommandExecutor::ResetInternal() {
|
void CommandExecutor::ResetInternal() {
|
||||||
@ -314,32 +404,16 @@ namespace skyline::gpu::interconnect {
|
|||||||
attachedBuffers.clear();
|
attachedBuffers.clear();
|
||||||
bufferManagerLock.reset();
|
bufferManagerLock.reset();
|
||||||
megaBufferAllocatorLock.reset();
|
megaBufferAllocatorLock.reset();
|
||||||
allocator.Reset();
|
allocator->Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommandExecutor::Submit() {
|
void CommandExecutor::Submit() {
|
||||||
for (const auto &callback : flushCallbacks)
|
for (const auto &callback : flushCallbacks)
|
||||||
callback();
|
callback();
|
||||||
|
|
||||||
if (!nodes.empty()) {
|
if (!slot->nodes.empty()) {
|
||||||
TRACE_EVENT("gpu", "CommandExecutor::Submit");
|
TRACE_EVENT("gpu", "CommandExecutor::Submit");
|
||||||
SubmitInternal();
|
SubmitInternal();
|
||||||
activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer();
|
|
||||||
cycle = activeCommandBuffer.GetFenceCycle();
|
|
||||||
}
|
|
||||||
ResetInternal();
|
|
||||||
|
|
||||||
executionNumber++;
|
|
||||||
}
|
|
||||||
|
|
||||||
void CommandExecutor::SubmitWithFlush() {
|
|
||||||
for (const auto &callback : flushCallbacks)
|
|
||||||
callback();
|
|
||||||
|
|
||||||
if (!nodes.empty()) {
|
|
||||||
TRACE_EVENT("gpu", "CommandExecutor::SubmitWithFlush");
|
|
||||||
SubmitInternal();
|
|
||||||
cycle = activeCommandBuffer.Reset();
|
|
||||||
}
|
}
|
||||||
ResetInternal();
|
ResetInternal();
|
||||||
|
|
||||||
|
@ -10,6 +10,57 @@
|
|||||||
#include "command_nodes.h"
|
#include "command_nodes.h"
|
||||||
|
|
||||||
namespace skyline::gpu::interconnect {
|
namespace skyline::gpu::interconnect {
|
||||||
|
/*
|
||||||
|
* @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them
|
||||||
|
*/
|
||||||
|
class CommandRecordThread {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
|
||||||
|
*/
|
||||||
|
struct Slot {
|
||||||
|
vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
|
||||||
|
vk::raii::CommandBuffer commandBuffer;
|
||||||
|
vk::raii::Fence fence;
|
||||||
|
std::shared_ptr<FenceCycle> cycle;
|
||||||
|
boost::container::stable_vector<node::NodeVariant> nodes;
|
||||||
|
LinearAllocatorState<> allocator;
|
||||||
|
|
||||||
|
Slot(GPU &gpu);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Waits on the fence and resets the command buffer
|
||||||
|
* @note A new fence cycle for the reset command buffer
|
||||||
|
*/
|
||||||
|
std::shared_ptr<FenceCycle> Reset(GPU &gpu);
|
||||||
|
};
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceState &state;
|
||||||
|
std::thread thread;
|
||||||
|
|
||||||
|
static constexpr size_t ActiveRecordSlots{4}; //!< Maximum number of simultaneously active slots
|
||||||
|
CircularQueue<Slot *> incoming{ActiveRecordSlots}; //!< Slots pending recording
|
||||||
|
CircularQueue<Slot *> outgoing{ActiveRecordSlots}; //!< Slots that have been submitted, may still be active on the GPU
|
||||||
|
|
||||||
|
void ProcessSlot(Slot *slot);
|
||||||
|
|
||||||
|
void Run();
|
||||||
|
|
||||||
|
public:
|
||||||
|
CommandRecordThread(const DeviceState &state);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return A free slot, `Reset` needs to be called before accessing it
|
||||||
|
*/
|
||||||
|
Slot *AcquireSlot();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Submit a slot to be recorded
|
||||||
|
*/
|
||||||
|
void ReleaseSlot(Slot *slot);
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
|
* @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
|
||||||
* @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
|
* @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
|
||||||
@ -17,11 +68,10 @@ namespace skyline::gpu::interconnect {
|
|||||||
class CommandExecutor {
|
class CommandExecutor {
|
||||||
private:
|
private:
|
||||||
GPU &gpu;
|
GPU &gpu;
|
||||||
CommandScheduler::ActiveCommandBuffer activeCommandBuffer;
|
CommandRecordThread recordThread;
|
||||||
boost::container::stable_vector<node::NodeVariant> nodes;
|
CommandRecordThread::Slot *slot{};
|
||||||
node::RenderPassNode *renderPass{};
|
node::RenderPassNode *renderPass{};
|
||||||
size_t subpassCount{}; //!< The number of subpasses in the current render pass
|
size_t subpassCount{}; //!< The number of subpasses in the current render pass
|
||||||
|
|
||||||
std::optional<std::scoped_lock<TextureManager>> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission
|
std::optional<std::scoped_lock<TextureManager>> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission
|
||||||
std::optional<std::scoped_lock<BufferManager>> bufferManagerLock; //!< The lock on the buffer manager, see above for details
|
std::optional<std::scoped_lock<BufferManager>> bufferManagerLock; //!< The lock on the buffer manager, see above for details
|
||||||
std::optional<std::scoped_lock<MegaBufferAllocator>> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
|
std::optional<std::scoped_lock<MegaBufferAllocator>> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
|
||||||
@ -72,6 +122,8 @@ namespace skyline::gpu::interconnect {
|
|||||||
|
|
||||||
std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
|
std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
|
||||||
|
|
||||||
|
void RotateRecordSlot();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Create a new render pass and subpass with the specified attachments, if one doesn't already exist or the current one isn't compatible
|
* @brief Create a new render pass and subpass with the specified attachments, if one doesn't already exist or the current one isn't compatible
|
||||||
* @note This also checks for subpass coalescing and will merge the new subpass with the previous one when possible
|
* @note This also checks for subpass coalescing and will merge the new subpass with the previous one when possible
|
||||||
@ -97,7 +149,7 @@ namespace skyline::gpu::interconnect {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
|
std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
|
||||||
LinearAllocatorState<> allocator;
|
LinearAllocatorState<> *allocator;
|
||||||
ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
|
ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
|
||||||
size_t executionNumber{};
|
size_t executionNumber{};
|
||||||
|
|
||||||
@ -193,10 +245,5 @@ namespace skyline::gpu::interconnect {
|
|||||||
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
|
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
|
||||||
*/
|
*/
|
||||||
void Submit();
|
void Submit();
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Execute all the nodes and submit the resulting command buffer to the GPU then wait for the completion of the command buffer
|
|
||||||
*/
|
|
||||||
void SubmitWithFlush();
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -163,7 +163,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Maxwell3D::Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
|
void Maxwell3D::Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
|
||||||
StateUpdateBuilder builder{ctx.executor.allocator};
|
StateUpdateBuilder builder{*ctx.executor.allocator};
|
||||||
|
|
||||||
Pipeline *oldPipeline{activeState.GetPipeline()};
|
Pipeline *oldPipeline{activeState.GetPipeline()};
|
||||||
activeState.Update(ctx, builder, indexed, topology, count);
|
activeState.Update(ctx, builder, indexed, topology, count);
|
||||||
|
@ -615,9 +615,9 @@ namespace skyline::gpu::interconnect::maxwell3d {
|
|||||||
u32 bufferIdx{};
|
u32 bufferIdx{};
|
||||||
u32 imageIdx{};
|
u32 imageIdx{};
|
||||||
|
|
||||||
auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.writeDescCount)};
|
auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.writeDescCount)};
|
||||||
auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
|
auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
|
||||||
auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};
|
auto bufferDescViews{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};
|
||||||
|
|
||||||
auto writeBufferDescs{[&](vk::DescriptorType type, const auto &descs, u32 count, auto getBufferCb) {
|
auto writeBufferDescs{[&](vk::DescriptorType type, const auto &descs, u32 count, auto getBufferCb) {
|
||||||
if (!descs.empty()) {
|
if (!descs.empty()) {
|
||||||
@ -658,13 +658,13 @@ namespace skyline::gpu::interconnect::maxwell3d {
|
|||||||
const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast<size_t>(quickBind.stage)][quickBind.index]};
|
const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast<size_t>(quickBind.stage)][quickBind.index]};
|
||||||
const auto &shaderInfo{shaderStages[static_cast<size_t>(quickBind.stage)].info};
|
const auto &shaderInfo{shaderStages[static_cast<size_t>(quickBind.stage)].info};
|
||||||
auto &stageConstantBuffers{constantBuffers[static_cast<size_t>(quickBind.stage)]};
|
auto &stageConstantBuffers{constantBuffers[static_cast<size_t>(quickBind.stage)]};
|
||||||
auto copy{ctx.executor.allocator.AllocateUntracked<vk::CopyDescriptorSet>()};
|
auto copy{ctx.executor.allocator->AllocateUntracked<vk::CopyDescriptorSet>()};
|
||||||
auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
|
auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
|
||||||
size_t writeIdx{};
|
size_t writeIdx{};
|
||||||
size_t bufferIdx{};
|
size_t bufferIdx{};
|
||||||
|
|
||||||
auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
|
auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
|
||||||
auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};
|
auto bufferDescViews{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};
|
||||||
|
|
||||||
// TODO: opt this to do partial copy
|
// TODO: opt this to do partial copy
|
||||||
*copy = vk::CopyDescriptorSet{
|
*copy = vk::CopyDescriptorSet{
|
||||||
|
Loading…
Reference in New Issue
Block a user