Implement asynchronous command recording
Recording of command nodes into Vulkan command buffers is easily parallelisable as it can effectively be treated as part of GPU execution, which is inherently async. By moving it to a separate thread we can shave off about 20% of GPFIFO execution time. It should be noted that the command scheduler's command buffer infrastructure is no longer used, since we need to record texture updates on the GPFIFO thread (while another slot is being recorded on the record thread) and then use the same command buffer on the record thread later. This ends up requiring a pool per slot, which is reasonable considering we only have four slots by default.
This commit is contained in:
parent a197dd2b28 · commit 7c9212743c
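The core of the change is a producer/consumer handoff: the GPFIFO thread fills a slot with command nodes while the record thread drains a previously filled slot into its Vulkan command buffer, so recording overlaps node generation. A minimal sketch of that rotation (illustrative names throughout; a mutex-based BlockingQueue stands in for skyline's CircularQueue and Slot is reduced to a bare node list):

#include <array>
#include <condition_variable>
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

template<typename T>
struct BlockingQueue {
    std::queue<T> queue;
    std::mutex mutex;
    std::condition_variable cv;

    void Push(T item) {
        {
            std::scoped_lock lock{mutex};
            queue.push(std::move(item));
        }
        cv.notify_one();
    }

    T Pop() {
        std::unique_lock lock{mutex};
        cv.wait(lock, [&] { return !queue.empty(); });
        T item{std::move(queue.front())};
        queue.pop();
        return item;
    }
};

struct Slot {
    std::vector<int> nodes; // Stand-in for the command node list
};

int main() {
    constexpr size_t ActiveRecordSlots{4};
    std::array<Slot, ActiveRecordSlots> slots{};
    BlockingQueue<Slot *> incoming, outgoing; // incoming: pending recording, outgoing: free for reuse
    for (Slot &slot : slots)
        outgoing.Push(&slot);

    std::jthread recordThread{[&] {
        for (size_t i{}; i < 8; i++) {
            Slot *slot{incoming.Pop()};
            std::printf("recorded %zu nodes\n", slot->nodes.size()); // Stand-in for Vulkan recording + submission
            slot->nodes.clear();
            outgoing.Push(slot); // Hand the slot back for reuse
        }
    }};

    // GPFIFO-side loop: acquire a free slot, fill it with nodes, release it for recording
    for (size_t execution{}; execution < 8; execution++) {
        Slot *slot{outgoing.Pop()};
        slot->nodes.assign(execution + 1, 0);
        incoming.Push(slot);
    }
}

With four slots in flight, the GPFIFO thread only stalls when every slot is still being recorded or is pending on the GPU, which is the same backpressure the real implementation gets from its fixed-capacity queues.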
@@ -1,16 +1,137 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)

#include <loader/loader.h>
#include <gpu.h>
#include "command_executor.h"

namespace skyline::gpu::interconnect {
    CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, tag{AllocateTag()} {}
    CommandRecordThread::CommandRecordThread(const DeviceState &state) : state{state}, thread{&CommandRecordThread::Run, this} {}

    static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
        return {gpu.vkDevice, (*gpu.vkDevice).allocateCommandBuffers(
            {
                .commandPool = *pool,
                .level = vk::CommandBufferLevel::ePrimary,
                .commandBufferCount = 1
            }, *gpu.vkDevice.getDispatcher()).front(),
            *pool};
    }

    CommandRecordThread::Slot::Slot(GPU &gpu)
        : commandPool{gpu.vkDevice,
              vk::CommandPoolCreateInfo{
                  .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | vk::CommandPoolCreateFlagBits::eTransient,
                  .queueFamilyIndex = gpu.vkQueueFamilyIndex
              }
          },
          commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
          fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {}

    std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
        cycle->Wait();
        cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence);
        commandBuffer.reset();
        return cycle;
    }
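Slot::Reset relies on FenceCycle to do the waiting; at the raw vulkan-hpp level, recycling a slot amounts to roughly the following (a sketch, not skyline's actual code; device, fence, and commandBuffer mirror the Slot members above):

#include <cstdint>
#include <limits>
#include <vulkan/vulkan_raii.hpp>

void ResetSlot(vk::raii::Device &device, vk::raii::Fence &fence, vk::raii::CommandBuffer &commandBuffer) {
    // Block until the GPU has finished with the slot's previous submission
    while (device.waitForFences({*fence}, true, std::numeric_limits<uint64_t>::max()) == vk::Result::eTimeout);
    device.resetFences({*fence}); // The fence must be unsignalled before it can be reused
    commandBuffer.reset(); // Valid because the pool was created with eResetCommandBuffer
}

Waiting before the reset is what makes handing a slot back to the GPFIFO thread safe: by the time Reset returns, neither the record thread nor the GPU can still be touching the slot's command buffer.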
    void CommandRecordThread::ProcessSlot(Slot *slot) {
        auto &gpu{*state.gpu};

        vk::RenderPass lRenderPass;
        u32 subpassIndex;

        std::scoped_lock bufferLock{gpu.buffer.recreationMutex};
        using namespace node;
        for (NodeVariant &node : slot->nodes) {
            #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
            std::visit(VariantVisitor{
                NODE(FunctionNode),

                [&](RenderPassNode &node) {
                    lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
                    subpassIndex = 0;
                },

                [&](NextSubpassNode &node) {
                    node(slot->commandBuffer, slot->cycle, gpu);
                    ++subpassIndex;
                },
                [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); },
                [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); },

                NODE(RenderPassEndNode),
            }, node);
            #undef NODE
        }

        slot->commandBuffer.end();

        gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle);

        slot->nodes.clear();
        slot->allocator.Reset();
    }
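The std::visit call above fans each NodeVariant out to a handler per node type; VariantVisitor is presumably the standard "overloaded lambdas" idiom. A self-contained sketch of that dispatch with stub node types:

#include <cstdio>
#include <variant>
#include <vector>

template<typename... Ts>
struct VariantVisitor : Ts... { using Ts::operator()...; };
template<typename... Ts>
VariantVisitor(Ts...) -> VariantVisitor<Ts...>; // Deduction guide, implicit for aggregates since C++20

struct FunctionNode {};
struct RenderPassNode {};
struct RenderPassEndNode {};
using NodeVariant = std::variant<FunctionNode, RenderPassNode, RenderPassEndNode>;

int main() {
    std::vector<NodeVariant> nodes{RenderPassNode{}, FunctionNode{}, RenderPassEndNode{}};
    int subpassIndex{-1};
    for (NodeVariant &node : nodes)
        std::visit(VariantVisitor{
            [&](FunctionNode &) { std::puts("record function"); },
            [&](RenderPassNode &) { subpassIndex = 0; std::puts("begin render pass"); },
            [&](RenderPassEndNode &) { std::puts("end render pass"); },
        }, node);
}

The NODE macro in ProcessSlot merely abbreviates the handlers whose bodies are identical apart from the node type.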
    void CommandRecordThread::Run() {
        auto &gpu{*state.gpu};
        std::array<Slot, ActiveRecordSlots> slots{{gpu, gpu, gpu, gpu}};
        outgoing.AppendTranform(span<Slot>(slots), [](auto &slot) { return &slot; });

        if (int result{pthread_setname_np(pthread_self(), "Sky-CmdRecord")})
            Logger::Warn("Failed to set the thread name: {}", strerror(result));

        try {
            signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);

            incoming.Process([this](Slot *slot) {
                ProcessSlot(slot);
                outgoing.Push(slot);
            }, [] {});
        } catch (const signal::SignalException &e) {
            Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
            if (state.process)
                state.process->Kill(false);
            else
                std::rethrow_exception(std::current_exception());
        } catch (const std::exception &e) {
            Logger::Error(e.what());
            if (state.process)
                state.process->Kill(false);
            else
                std::rethrow_exception(std::current_exception());
        }
    }
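Run() drives everything off two fixed-capacity queues: incoming.Process consumes slots as the GPFIFO thread releases them, and finished slots are pushed back through outgoing. CircularQueue's implementation isn't part of this diff; a minimal bounded queue with a blocking Pop and a Process-style consumer loop, written to match how it's used here (the idle callback's exact semantics are an assumption), might look like:

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <vector>

template<typename T>
class CircularQueue {
  private:
    std::vector<T> storage; // Fixed-capacity ring
    size_t head{}, tail{}, count{};
    std::mutex mutex;
    std::condition_variable cv;

  public:
    explicit CircularQueue(size_t capacity) : storage(capacity) {}

    void Push(T item) {
        std::unique_lock lock{mutex};
        cv.wait(lock, [&] { return count < storage.size(); }); // Block while full
        storage[tail] = std::move(item);
        tail = (tail + 1) % storage.size();
        count++;
        cv.notify_all();
    }

    T Pop() {
        std::unique_lock lock{mutex};
        cv.wait(lock, [&] { return count != 0; }); // Block while empty
        T item{std::move(storage[head])};
        head = (head + 1) % storage.size();
        count--;
        cv.notify_all();
        return item;
    }

    template<typename Callback, typename IdleCallback>
    [[noreturn]] void Process(Callback callback, IdleCallback idle) {
        while (true) {
            callback(Pop()); // Consume items as they arrive
            idle(); // Run() passes an empty lambda here
        }
    }
};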
    CommandRecordThread::Slot *CommandRecordThread::AcquireSlot() {
        return outgoing.Pop();
    }

    void CommandRecordThread::ReleaseSlot(Slot *slot) {
        incoming.Push(slot);
    }

    CommandExecutor::CommandExecutor(const DeviceState &state)
        : gpu{*state.gpu},
          recordThread{state},
          tag{AllocateTag()} {
        RotateRecordSlot();
    }

    CommandExecutor::~CommandExecutor() {
        cycle->Cancel();
    }

    void CommandExecutor::RotateRecordSlot() {
        if (slot)
            recordThread.ReleaseSlot(slot);

        slot = recordThread.AcquireSlot();
        cycle = slot->Reset(gpu);
        allocator = &slot->allocator;
    }
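RotateRecordSlot also repoints the executor's allocator at the incoming slot's LinearAllocatorState, so transient per-execution allocations live and die with the slot that records them. Assuming LinearAllocatorState is a conventional bump allocator (the signature below is illustrative), the idea is roughly:

#include <cstddef>
#include <new>
#include <vector>

class LinearAllocator {
  private:
    std::vector<std::byte> arena;
    size_t offset{};

  public:
    explicit LinearAllocator(size_t size) : arena(size) {}

    template<typename T>
    T *AllocateUntracked(size_t count = 1) {
        size_t aligned{(offset + alignof(T) - 1) & ~(alignof(T) - 1)}; // Round up to T's alignment
        if (aligned + sizeof(T) * count > arena.size())
            throw std::bad_alloc{};
        offset = aligned + sizeof(T) * count;
        return reinterpret_cast<T *>(arena.data() + aligned); // Caller constructs in-place
    }

    void Reset() {
        offset = 0; // Reclaim everything allocated since the last Reset at once
    }
};

Because freeing is a single pointer reset, slot->allocator.Reset() at the end of ProcessSlot is effectively free, and since each slot owns its arena the GPFIFO thread can allocate into one slot while the record thread still reads another.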
    TextureManager &CommandExecutor::AcquireTextureManager() {
        if (!textureManagerLock)
            textureManagerLock.emplace(gpu.texture);
@@ -55,8 +176,8 @@ namespace skyline::gpu::interconnect {
        if (renderPass == nullptr || renderPass->renderArea != renderArea || subpassCount >= gpu.traits.quirks.maxSubpassCount) {
            // We need to create a render pass if one doesn't already exist or the current one isn't compatible
            if (renderPass != nullptr)
                nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
            renderPass = &std::get<node::RenderPassNode>(nodes.emplace_back(std::in_place_type_t<node::RenderPassNode>(), renderArea));
                slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
            renderPass = &std::get<node::RenderPassNode>(slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassNode>(), renderArea));
            addSubpass();
            subpassCount = 1;
            return false;
@@ -77,7 +198,7 @@ namespace skyline::gpu::interconnect {

    void CommandExecutor::FinishRenderPass() {
        if (renderPass) {
            nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
            slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());

            renderPass = nullptr;
            subpassCount = 0;
@@ -168,9 +289,9 @@ namespace skyline::gpu::interconnect {

        bool gotoNext{CreateRenderPassWithSubpass(renderArea, inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr)};
        if (gotoNext)
            nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), std::forward<decltype(function)>(function));
            slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), std::forward<decltype(function)>(function));
        else
            nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), std::forward<decltype(function)>(function));
            slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), std::forward<decltype(function)>(function));

        if (exclusiveSubpass)
            FinishRenderPass();
@@ -180,14 +301,14 @@ namespace skyline::gpu::interconnect {
        if (renderPass)
            FinishRenderPass();

        nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), std::forward<decltype(function)>(function));
        slot->nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), std::forward<decltype(function)>(function));
    }

    void CommandExecutor::AddClearColorSubpass(TextureView *attachment, const vk::ClearColorValue &value) {
        bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, attachment, nullptr)};
        if (renderPass->ClearColorAttachment(0, value, gpu)) {
            if (gotoNext)
                nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
                slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
        } else {
            auto function{[scissor = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
                commandBuffer.clearAttachments(vk::ClearAttachment{
@@ -202,9 +323,9 @@ namespace skyline::gpu::interconnect {
            }};

            if (gotoNext)
                nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
                slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
            else
                nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
                slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
        }
    }

@@ -212,7 +333,7 @@ namespace skyline::gpu::interconnect {
        bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, {}, attachment)};
        if (renderPass->ClearDepthStencilAttachment(value, gpu)) {
            if (gotoNext)
                nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
                slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
        } else {
            auto function{[aspect = attachment->format->vkAspect, extent = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
                commandBuffer.clearAttachments(vk::ClearAttachment{
@@ -226,9 +347,9 @@ namespace skyline::gpu::interconnect {
            }};

            if (gotoNext)
                nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
                slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
            else
                nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
                slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
        }
    }

@@ -241,13 +362,12 @@ namespace skyline::gpu::interconnect {
        FinishRenderPass();

        {
            auto &commandBuffer{*activeCommandBuffer};
            commandBuffer.begin(vk::CommandBufferBeginInfo{
        slot->commandBuffer.begin(vk::CommandBufferBeginInfo{
            .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
        });

        // We need this barrier here to ensure that resources are in the state we expect them to be in; we shouldn't overwrite resources while prior commands might still be using them, or read from them while they might be modified by prior commands
            commandBuffer.pipelineBarrier(
        slot->commandBuffer.pipelineBarrier(
            vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
                .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
@@ -255,43 +375,12 @@ namespace skyline::gpu::interconnect {
        );

        for (const auto &texture : attachedTextures)
            texture->SynchronizeHostInline(commandBuffer, cycle, true);

            vk::RenderPass lRenderPass;
            u32 subpassIndex;

            using namespace node;
            for (NodeVariant &node : nodes) {
                #define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
                std::visit(VariantVisitor{
                    NODE(FunctionNode),

                    [&](RenderPassNode &node) {
                        lRenderPass = node(commandBuffer, cycle, gpu);
                        subpassIndex = 0;
                    },

                    [&](NextSubpassNode &node) {
                        node(commandBuffer, cycle, gpu);
                        ++subpassIndex;
                    },
                    [&](SubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, subpassIndex); },
                    [&](NextSubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, ++subpassIndex); },

                    NODE(RenderPassEndNode),
                }, node);
                #undef NODE
            texture->SynchronizeHostInline(slot->commandBuffer, cycle, true);
        }

            commandBuffer.end();

        for (const auto &attachedBuffer : attachedBuffers)
            if (attachedBuffer->SequencedCpuBackingWritesBlocked())
                attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls

            gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle);

            nodes.clear();
                attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer

        for (const auto &attachedTexture : attachedTextures) {
            // We don't need to attach the Texture to the cycle as a TextureView will already be attached
@@ -305,7 +394,8 @@ namespace skyline::gpu::interconnect {
            attachedBuffer->UpdateCycle(cycle);
        }
        }
    }

        RotateRecordSlot();
    }

    void CommandExecutor::ResetInternal() {
@@ -314,32 +404,16 @@ namespace skyline::gpu::interconnect {
        attachedBuffers.clear();
        bufferManagerLock.reset();
        megaBufferAllocatorLock.reset();
        allocator.Reset();
        allocator->Reset();
    }

    void CommandExecutor::Submit() {
        for (const auto &callback : flushCallbacks)
            callback();

        if (!nodes.empty()) {
        if (!slot->nodes.empty()) {
            TRACE_EVENT("gpu", "CommandExecutor::Submit");
            SubmitInternal();
            activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer();
            cycle = activeCommandBuffer.GetFenceCycle();
        }
        ResetInternal();

        executionNumber++;
    }

    void CommandExecutor::SubmitWithFlush() {
        for (const auto &callback : flushCallbacks)
            callback();

        if (!nodes.empty()) {
            TRACE_EVENT("gpu", "CommandExecutor::SubmitWithFlush");
            SubmitInternal();
            cycle = activeCommandBuffer.Reset();
        }
        ResetInternal();
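For reference, the blanket memory barrier SubmitInternal records before replaying the nodes can be written as a standalone vulkan-hpp helper (a sketch; VULKAN_HPP_NO_CONSTRUCTORS enables the designated initializers that skyline uses throughout):

#define VULKAN_HPP_NO_CONSTRUCTORS
#include <vulkan/vulkan_raii.hpp>

// One vk::MemoryBarrier covering all access types, between all pipeline stages;
// coarse, but it guarantees prior submissions are visible before this buffer runs
void RecordFullBarrier(vk::raii::CommandBuffer &commandBuffer) {
    commandBuffer.pipelineBarrier(
        vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {},
        vk::MemoryBarrier{
            .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
            .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
        },
        {}, {});
}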
@@ -10,6 +10,57 @@
#include "command_nodes.h"

namespace skyline::gpu::interconnect {
    /**
     * @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them
     */
    class CommandRecordThread {
      public:
        /**
         * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
         */
        struct Slot {
            vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
            vk::raii::CommandBuffer commandBuffer;
            vk::raii::Fence fence;
            std::shared_ptr<FenceCycle> cycle;
            boost::container::stable_vector<node::NodeVariant> nodes;
            LinearAllocatorState<> allocator;

            Slot(GPU &gpu);

            /**
             * @brief Waits on the fence and resets the command buffer
             * @return A new fence cycle for the reset command buffer
             */
            std::shared_ptr<FenceCycle> Reset(GPU &gpu);
        };

      private:
        const DeviceState &state;
        std::thread thread;

        static constexpr size_t ActiveRecordSlots{4}; //!< Maximum number of simultaneously active slots
        CircularQueue<Slot *> incoming{ActiveRecordSlots}; //!< Slots pending recording
        CircularQueue<Slot *> outgoing{ActiveRecordSlots}; //!< Slots that have been submitted, may still be active on the GPU

        void ProcessSlot(Slot *slot);

        void Run();

      public:
        CommandRecordThread(const DeviceState &state);

        /**
         * @return A free slot, `Reset` needs to be called before accessing it
         */
        Slot *AcquireSlot();

        /**
         * @brief Submit a slot to be recorded
         */
        void ReleaseSlot(Slot *slot);
    };
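The per-slot commandPool exists because a VkCommandPool, and any command buffer allocated from it, must be externally synchronized; giving each slot its own pool lets the GPFIFO thread prepare one slot's buffer while the record thread records another. A sketch of setting that up with vulkan-hpp (an illustrative helper, not part of the commit; device and queueFamilyIndex are assumed to exist):

#define VULKAN_HPP_NO_CONSTRUCTORS
#include <cstddef>
#include <cstdint>
#include <vector>
#include <vulkan/vulkan_raii.hpp>

std::vector<vk::raii::CommandPool> MakeSlotPools(vk::raii::Device &device, uint32_t queueFamilyIndex, size_t slotCount) {
    std::vector<vk::raii::CommandPool> pools;
    pools.reserve(slotCount);
    for (size_t i{}; i < slotCount; i++)
        pools.emplace_back(device, vk::CommandPoolCreateInfo{
            // Match the flags used by Slot: buffers are individually resettable and short-lived
            .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | vk::CommandPoolCreateFlagBits::eTransient,
            .queueFamilyIndex = queueFamilyIndex,
        });
    return pools;
}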
    /**
     * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
     * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@@ -17,11 +68,10 @@ namespace skyline::gpu::interconnect {
    class CommandExecutor {
      private:
        GPU &gpu;
        CommandScheduler::ActiveCommandBuffer activeCommandBuffer;
        boost::container::stable_vector<node::NodeVariant> nodes;
        CommandRecordThread recordThread;
        CommandRecordThread::Slot *slot{};
        node::RenderPassNode *renderPass{};
        size_t subpassCount{}; //!< The number of subpasses in the current render pass

        std::optional<std::scoped_lock<TextureManager>> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission
        std::optional<std::scoped_lock<BufferManager>> bufferManagerLock; //!< The lock on the buffer manager, see above for details
        std::optional<std::scoped_lock<MegaBufferAllocator>> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
@@ -72,6 +122,8 @@ namespace skyline::gpu::interconnect {

        std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording

        void RotateRecordSlot();

        /**
         * @brief Create a new render pass and subpass with the specified attachments, if one doesn't already exist or the current one isn't compatible
         * @note This also checks for subpass coalescing and will merge the new subpass with the previous one when possible
@@ -97,7 +149,7 @@ namespace skyline::gpu::interconnect {

      public:
        std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
        LinearAllocatorState<> allocator;
        LinearAllocatorState<> *allocator;
        ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
        size_t executionNumber{};

@@ -193,10 +245,5 @@ namespace skyline::gpu::interconnect {
         * @brief Execute all the nodes and submit the resulting command buffer to the GPU
         */
        void Submit();

        /**
         * @brief Execute all the nodes and submit the resulting command buffer to the GPU then wait for the completion of the command buffer
         */
        void SubmitWithFlush();
    };
}
@@ -163,7 +163,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
    }

    void Maxwell3D::Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
        StateUpdateBuilder builder{ctx.executor.allocator};
        StateUpdateBuilder builder{*ctx.executor.allocator};

        Pipeline *oldPipeline{activeState.GetPipeline()};
        activeState.Update(ctx, builder, indexed, topology, count);

@@ -615,9 +615,9 @@ namespace skyline::gpu::interconnect::maxwell3d {
        u32 bufferIdx{};
        u32 imageIdx{};

        auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.writeDescCount)};
        auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
        auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};
        auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.writeDescCount)};
        auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
        auto bufferDescViews{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};

        auto writeBufferDescs{[&](vk::DescriptorType type, const auto &descs, u32 count, auto getBufferCb) {
            if (!descs.empty()) {
@@ -658,13 +658,13 @@ namespace skyline::gpu::interconnect::maxwell3d {
        const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast<size_t>(quickBind.stage)][quickBind.index]};
        const auto &shaderInfo{shaderStages[static_cast<size_t>(quickBind.stage)].info};
        auto &stageConstantBuffers{constantBuffers[static_cast<size_t>(quickBind.stage)]};
        auto copy{ctx.executor.allocator.AllocateUntracked<vk::CopyDescriptorSet>()};
        auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
        auto copy{ctx.executor.allocator->AllocateUntracked<vk::CopyDescriptorSet>()};
        auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
        size_t writeIdx{};
        size_t bufferIdx{};

        auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
        auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};
        auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
        auto bufferDescViews{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};

        // TODO: opt this to do partial copy
        *copy = vk::CopyDescriptorSet{