Skip waiting on host GPU after command buffer submission

Previously, we waited on the host GPU after every `Execute`. This is suboptimal because it causes a major stall on the CPU, which can lead to several adverse effects, such as the governor downclocking the CPU and the loss of any opportunity to work in parallel with the GPU.

This has now been fixed by splitting `Execute`'s functionality into two functions: `Submit` and `SubmitWithFlush`. Both execute all nodes and submit the resulting command buffer to the GPU, but `SubmitWithFlush` waits for the GPU to complete, while `Submit` does not wait and instead lets the CPU work ahead of the GPU.
This commit is contained in:
PixelyIon 2022-06-05 18:22:56 +05:30
parent 5129d2ae78
commit 662ea532d8
No known key found for this signature in database
GPG Key ID: 11BC6C3201BC2C05
6 changed files with 74 additions and 52 deletions

View File

@ -5,7 +5,7 @@
#include "command_executor.h"
namespace skyline::gpu::interconnect {
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu(*state.gpu), activeCommandBuffer(gpu.scheduler.AllocateCommandBuffer()), cycle(activeCommandBuffer.GetFenceCycle()), megaBuffer(gpu.buffer.AcquireMegaBuffer(cycle)) {}
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, megaBuffer{gpu.buffer.AcquireMegaBuffer(cycle)} {}
CommandExecutor::~CommandExecutor() {
cycle->Cancel();
@ -168,67 +168,78 @@ namespace skyline::gpu::interconnect {
flushCallbacks.emplace_back(std::forward<decltype(callback)>(callback));
}
void CommandExecutor::Execute() {
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::Execute");
void CommandExecutor::SubmitInternal() {
if (renderPass)
FinishRenderPass();
if (renderPass)
FinishRenderPass();
{
auto &commandBuffer{*activeCommandBuffer};
commandBuffer.begin(vk::CommandBufferBeginInfo{
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
});
{
auto &commandBuffer{*activeCommandBuffer};
commandBuffer.begin(vk::CommandBufferBeginInfo{
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
});
for (auto texture : attachedTextures) {
texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
texture->MarkGpuDirty();
}
for (auto texture : attachedTextures) {
texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
texture->MarkGpuDirty();
}
for (const auto &delegate : attachedBuffers)
delegate->usageCallback = nullptr;
for (const auto &delegate : attachedBuffers)
delegate->usageCallback = nullptr;
vk::RenderPass lRenderPass;
u32 subpassIndex;
vk::RenderPass lRenderPass;
u32 subpassIndex;
using namespace node;
for (NodeVariant &node : nodes) {
#define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
std::visit(VariantVisitor{
NODE(FunctionNode),
using namespace node;
for (NodeVariant &node : nodes) {
#define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
std::visit(VariantVisitor{
NODE(FunctionNode),
[&](RenderPassNode &node) {
lRenderPass = node(commandBuffer, cycle, gpu);
subpassIndex = 0;
},
[&](RenderPassNode &node) {
lRenderPass = node(commandBuffer, cycle, gpu);
subpassIndex = 0;
},
[&](NextSubpassNode &node) {
node(commandBuffer, cycle, gpu);
++subpassIndex;
},
[&](SubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, subpassIndex); },
[&](NextSubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, ++subpassIndex); },
[&](NextSubpassNode &node) {
node(commandBuffer, cycle, gpu);
++subpassIndex;
},
[&](SubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, subpassIndex); },
[&](NextSubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, ++subpassIndex); },
NODE(RenderPassEndNode),
}, node);
#undef NODE
}
NODE(RenderPassEndNode),
}, node);
#undef NODE
}
commandBuffer.end();
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
commandBuffer.end();
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
for (const auto &delegate : attachedBuffers)
delegate->view->megabufferOffset = 0;
nodes.clear();
attachedTextures.clear();
attachedBuffers.clear();
nodes.clear();
attachedTextures.clear();
attachedBuffers.clear();
}
}
cycle = activeCommandBuffer.Reset();
/**
 * @brief Runs SubmitInternal for all recorded nodes, then switches the executor over to a freshly allocated command buffer
 * @note This does nothing when no nodes have been recorded
 */
void CommandExecutor::Submit() {
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::Submit");
SubmitInternal();
// Rather than resetting the command buffer that was just submitted, allocate a new one and rebind the fence cycle and megabuffer to it
activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer();
cycle = activeCommandBuffer.GetFenceCycle();
megaBuffer = gpu.buffer.AcquireMegaBuffer(cycle);
}
}
megaBuffer.Reset();
}
/**
 * @brief Runs SubmitInternal for all recorded nodes, then resets the active command buffer and the megabuffer in place
 * @note This does nothing when no nodes have been recorded
 * @note NOTE(review): the wait for GPU completion presumably happens inside `activeCommandBuffer.Reset()`, this isn't visible from here and should be confirmed
 */
void CommandExecutor::SubmitWithFlush() {
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::SubmitWithFlush");
SubmitInternal();
// The same command buffer is kept, Reset() supplies the fence cycle to use from here on
cycle = activeCommandBuffer.Reset();
megaBuffer.Reset();
}
}
}

View File

@ -43,6 +43,12 @@ namespace skyline::gpu::interconnect {
*/
void FinishRenderPass();
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
* @note It is the responsibility of the caller to handle resetting of command buffers, fence cycle and megabuffers
*/
void SubmitInternal();
public:
std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
@ -102,6 +108,11 @@ namespace skyline::gpu::interconnect {
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
*/
void Execute();
void Submit();
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU then wait for the completion of the command buffer
*/
void SubmitWithFlush();
};
}

View File

@ -630,7 +630,7 @@ namespace skyline::gpu::interconnect {
T object;
std::scoped_lock lock{view};
view.Read(pExecutor.cycle, []() {
// TODO: here we should trigger an execute, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
// TODO: here we should trigger a SubmitWithFlush, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
}, span<T>(object).template cast<u8>(), dstOffset);
return object;

View File

@ -19,7 +19,7 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(syncpoint, action, {
if (action.operation == Registers::Syncpoint::Operation::Incr) {
Logger::Debug("Increment syncpoint: {}", +action.index);
channelCtx.executor.Execute();
channelCtx.executor.Submit();
syncpoints.at(action.index).Increment();
} else if (action.operation == Registers::Syncpoint::Operation::Wait) {
Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload);

View File

@ -656,7 +656,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
ENGINE_CASE(syncpointAction, {
Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
channelCtx.executor.Execute();
channelCtx.executor.Submit();
syncpoints.at(syncpointAction.id).Increment();
})

View File

@ -36,7 +36,7 @@ namespace skyline::soc::gm20b::engine {
return;
}
executor.Execute();
executor.SubmitWithFlush();
if (registers.launchDma->multiLineEnable) {
if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch &&
registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear)