Skip waiting on host GPU after command buffer submission

Previously, we waited on the host GPU after `Execute`, but this isn't optimal: it causes a major stall on the CPU, which can lead to adverse effects such as downclocking by the governor and losing the opportunity to work in parallel with the GPU.

This has now been fixed by splitting `Execute` into two functions: `Submit` and `SubmitWithFlush`. Both execute all nodes and submit the resulting command buffer to the GPU, but the flushing variant waits for the GPU to complete the work while the non-flushing variant doesn't wait and instead works ahead of the GPU.
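As a rough, condensed sketch of how the two new variants differ (paraphrased from the changes below; the comments are editorial and not part of the committed code):

    void CommandExecutor::Submit() {
        if (!nodes.empty()) {
            SubmitInternal(); // Executes all nodes and submits the command buffer without waiting on the GPU
            // Acquire a fresh command buffer, fence cycle and megabuffer so recording can continue ahead of the GPU
            activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer();
            cycle = activeCommandBuffer.GetFenceCycle();
            megaBuffer = gpu.buffer.AcquireMegaBuffer(cycle);
        }
    }

    void CommandExecutor::SubmitWithFlush() {
        if (!nodes.empty()) {
            SubmitInternal();
            // Reusing the same command buffer via Reset() is where the wait happens
            // (per the commit message, the flushing variant waits for GPU completion)
            cycle = activeCommandBuffer.Reset();
            megaBuffer.Reset();
        }
    }

Call sites that only need the work queued, such as syncpoint increments, now use `Submit`, while MaxwellDma switches to `SubmitWithFlush`.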
PixelyIon 2022-06-05 18:22:56 +05:30
parent 5129d2ae78
commit 662ea532d8
6 changed files with 74 additions and 52 deletions

View File

@@ -5,7 +5,7 @@
#include "command_executor.h"
namespace skyline::gpu::interconnect {
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu(*state.gpu), activeCommandBuffer(gpu.scheduler.AllocateCommandBuffer()), cycle(activeCommandBuffer.GetFenceCycle()), megaBuffer(gpu.buffer.AcquireMegaBuffer(cycle)) {}
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, megaBuffer{gpu.buffer.AcquireMegaBuffer(cycle)} {}
CommandExecutor::~CommandExecutor() {
cycle->Cancel();
@@ -168,10 +168,7 @@ namespace skyline::gpu::interconnect {
flushCallbacks.emplace_back(std::forward<decltype(callback)>(callback));
}
void CommandExecutor::Execute() {
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::Execute");
void CommandExecutor::SubmitInternal() {
if (renderPass)
FinishRenderPass();
@@ -224,11 +221,25 @@ namespace skyline::gpu::interconnect {
nodes.clear();
attachedTextures.clear();
attachedBuffers.clear();
}
}
void CommandExecutor::Submit() {
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::Submit");
SubmitInternal();
activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer();
cycle = activeCommandBuffer.GetFenceCycle();
megaBuffer = gpu.buffer.AcquireMegaBuffer(cycle);
}
}
void CommandExecutor::SubmitWithFlush() {
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::SubmitWithFlush");
SubmitInternal();
cycle = activeCommandBuffer.Reset();
megaBuffer.Reset();
}
}
}
}

View File

@@ -43,6 +43,12 @@ namespace skyline::gpu::interconnect {
*/
void FinishRenderPass();
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
* @note It is the responsibility of the caller to handle resetting of command buffers, fence cycle and megabuffers
*/
void SubmitInternal();
public:
std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
@@ -102,6 +108,11 @@ namespace skyline::gpu::interconnect {
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
*/
void Execute();
void Submit();
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU then wait for the completion of the command buffer
*/
void SubmitWithFlush();
};
}

View File

@@ -630,7 +630,7 @@ namespace skyline::gpu::interconnect {
T object;
std::scoped_lock lock{view};
view.Read(pExecutor.cycle, []() {
// TODO: here we should trigger an execute, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
// TODO: here we should trigger a SubmitWithFlush, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
}, span<T>(object).template cast<u8>(), dstOffset);
return object;

View File

@@ -19,7 +19,7 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(syncpoint, action, {
if (action.operation == Registers::Syncpoint::Operation::Incr) {
Logger::Debug("Increment syncpoint: {}", +action.index);
channelCtx.executor.Execute();
channelCtx.executor.Submit();
syncpoints.at(action.index).Increment();
} else if (action.operation == Registers::Syncpoint::Operation::Wait) {
Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload);

View File

@@ -656,7 +656,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
ENGINE_CASE(syncpointAction, {
Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
channelCtx.executor.Execute();
channelCtx.executor.Submit();
syncpoints.at(syncpointAction.id).Increment();
})

View File

@@ -36,7 +36,7 @@ namespace skyline::soc::gm20b::engine {
return;
}
executor.Execute();
executor.SubmitWithFlush();
if (registers.launchDma->multiLineEnable) {
if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch &&
registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear)