Begin command buffers asynchronously in command executor

vkBeginCommandBuffer can take quite some time on Adreno, so move it to the cycle waiter thread, where it won't block GPFIFO.
2025-06-30 00:49:26 +03:00 · 2022-11-06 19:18:36 +00:00 · 2022-11-06 19:18:36 +00:00 · 1f9de17e98
commit 1f9de17e98
parent 4b3e906c22
2 changed files with 49 additions and 5 deletions
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -15,6 +15,12 @@ namespace skyline::gpu::interconnect {
          outgoing{*state.settings->executorSlotCount},
          thread{&CommandRecordThread::Run, this} {}

+    CommandRecordThread::Slot::ScopedBegin::ScopedBegin(CommandRecordThread::Slot &slot) : slot{slot} {} // Only stores a reference to the slot; the actual work is deferred to the destructor
+
+    CommandRecordThread::Slot::ScopedBegin::~ScopedBegin() { // Runs when the last owner of this object releases it (WaitReady attaches it to the slot's fence cycle)
+        slot.Begin(); // Begins the slot's command buffer again and flags the slot as ready for recording
+    }
+
    static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
        return {gpu.vkDevice, (*gpu.vkDevice).allocateCommandBuffers(
                    {
@ -35,14 +41,17 @@ namespace skyline::gpu::interconnect {
          commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
          fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
          semaphore{gpu.vkDevice, vk::SemaphoreCreateInfo{}},
-          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {}
+          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {
+        Begin();
+    }

    CommandRecordThread::Slot::Slot(Slot &&other)
        : commandPool{std::move(other.commandPool)},
          commandBuffer{std::move(other.commandBuffer)},
          fence{std::move(other.fence)},
          semaphore{std::move(other.semaphore)},
-          cycle{std::move(other.cycle)} {}
+          cycle{std::move(other.cycle)},
+          ready{other.ready} {} // Carries over whether the buffer has been begun; beginLock/beginCondition are not listed here and are default-constructed (std::mutex and std::condition_variable cannot be moved)

    std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
        cycle->Wait();
@ -51,6 +60,21 @@ namespace skyline::gpu::interconnect {
        return cycle;
    }

+    void CommandRecordThread::Slot::WaitReady() { // Blocks the caller until Begin() has marked this slot as ready, then arranges for the next Begin()
+        std::unique_lock lock{beginLock};
+        beginCondition.wait(lock, [this] { return ready; }); // Predicate form: returns immediately if already ready and is unaffected by spurious wakeups
+        cycle->AttachObject(std::make_shared<ScopedBegin>(*this)); // ScopedBegin's destructor calls Begin(), so the buffer is begun again once the cycle drops this object (presumably after the cycle is signalled - confirm against FenceCycle)
+    }
+
+    void CommandRecordThread::Slot::Begin() { // Begins the slot's command buffer for one-time submission and wakes any WaitReady() callers
+        std::unique_lock lock{beginLock}; // Held across the begin call so that waiters can only observe 'ready' as true after the buffer has been begun
+        commandBuffer.begin(vk::CommandBufferBeginInfo{
+            .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
+        });
+        ready = true;
+        beginCondition.notify_all();
+    }
+
    void CommandRecordThread::ProcessSlot(Slot *slot) {
        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionNumber);
        auto &gpu{*state.gpu};
@ -83,6 +107,7 @@ namespace skyline::gpu::interconnect {
        }

        slot->commandBuffer.end();
+        slot->ready = false;

        gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle);

@ -404,9 +429,7 @@ namespace skyline::gpu::interconnect {
            FinishRenderPass();

        {
-            slot->commandBuffer.begin(vk::CommandBufferBeginInfo{
-                .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-            });
+            slot->WaitReady();

            // We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands
            slot->commandBuffer.pipelineBarrier(
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@ -19,6 +19,17 @@ namespace skyline::gpu::interconnect {
         * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
         */
        struct Slot {
+            /**
+             * @brief Helper to begin the slot command buffer on the cycle waiter thread
+             */
+            struct ScopedBegin {
+                Slot &slot; //!< The slot whose command buffer is begun when this object is destroyed
+
+                ScopedBegin(Slot &slot);
+
+                ~ScopedBegin(); // Calls Begin() on the referenced slot
+            };
+
            vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
            vk::raii::CommandBuffer commandBuffer;
            vk::raii::Fence fence;
@ -26,7 +37,10 @@ namespace skyline::gpu::interconnect {
            std::shared_ptr<FenceCycle> cycle;
            boost::container::stable_vector<node::NodeVariant> nodes;
            LinearAllocatorState<> allocator;
+            std::mutex beginLock;
+            std::condition_variable beginCondition;
            u32 executionNumber;
+            bool ready{}; //!< If this slot's command buffer has had 'beginCommandBuffer' called and is ready to have commands recorded into it
            bool capture{}; //!< If this slot's Vulkan commands should be captured using the renderdoc API

            Slot(GPU &gpu);
@ -38,6 +52,13 @@ namespace skyline::gpu::interconnect {
             * @note A new fence cycle for the reset command buffer
             */
            std::shared_ptr<FenceCycle> Reset(GPU &gpu);
+
+            /**
+             * @brief Waits for the command buffer to be begun so it can be recorded into
+             */
+            void WaitReady();
+
+            void Begin();
        };

      private: