Implement accelerated uploads/copies through buffer manager

Previously, both I2M uploads and DMA copies would force GPU serialisation if they happened to hit a trap or were used to copy GPU-dirty buffers. By using the buffer manager to implement them on the host GPU, we can avoid such slowdowns entirely.
2025-07-20 11:26:14 +03:00 · 2022-10-25 20:57:30 +01:00 · 2022-10-25 20:57:30 +01:00 · cac287d9fd
commit cac287d9fd
parent c5ec484d9a
10 changed files with 205 additions and 15 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -186,6 +186,8 @@ add_library(skyline SHARED
        ${source_DIR}/skyline/gpu/cache/renderpass_cache.cpp
        ${source_DIR}/skyline/gpu/cache/framebuffer_cache.cpp
        ${source_DIR}/skyline/gpu/interconnect/fermi_2d.cpp
+        ${source_DIR}/skyline/gpu/interconnect/maxwell_dma.cpp
+        ${source_DIR}/skyline/gpu/interconnect/inline2memory.cpp
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/common.cpp
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/active_state.cpp
        ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
--- a/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <gpu/buffer_manager.h>
+#include <soc/gm20b/gmmu.h>
+#include <soc/gm20b/channel.h>
+#include "inline2memory.h"
+
+namespace skyline::gpu::interconnect {
+    using IOVA = soc::gm20b::IOVA;
+
+    /**
+     * @brief Binds this interconnect to the host GPU and to the channel it serves
+     * @note The command executor reference is taken from the supplied channel context
+     */
+    Inline2Memory::Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx) : gpu{gpu}, channelCtx{channelCtx}, executor{channelCtx.executor} {}
+
+    /**
+     * @brief Uploads inline data to the guest GPU virtual address `dst` via the buffer manager
+     * @param dst Guest GPU virtual address that the data should be written to
+     * @param src Words to upload, an empty span is a no-op
+     * @note Only the first translated mapping of the destination range is used, a warning is logged when the range is split
+     */
+    void Inline2Memory::Upload(IOVA dst, span<u32> src) {
+        // Nothing to upload: bail out before touching `front()` of a possibly empty mapping list or recording a zero-sized copy (which Vulkan forbids)
+        if (src.empty())
+            return;
+
+        auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, src.size_bytes())};
+
+        if (dstMappings.size() > 1)
+            Logger::Warn("Split mappings are unsupported for I2M uploads");
+
+        auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
+            executor.AttachLockedBuffer(buffer, std::move(lock));
+        })};
+        ContextLock dstBufLock{executor.tag, dstBuf};
+
+        // NOTE(review): the callback presumably only runs when the write has to be performed on the GPU - confirm against BufferView::Write
+        dstBuf.Write(src.cast<u8>(), 0, [&]() {
+            executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
+            // This will prevent any CPU accesses to backing for the duration of the usage
+            dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
+
+            // Stage the data in the megabuffer so the recorded command doesn't depend on `src` remaining valid
+            auto srcGpuAllocation{gpu.megaBufferAllocator.Push(executor.cycle, src.cast<u8>())};
+            executor.AddOutsideRpCommand([srcGpuAllocation, dstBuf, size = src.size_bytes()](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
+                // Designators are listed in the declaration order of VkBufferCopy
+                vk::BufferCopy copyRegion{
+                    .srcOffset = srcGpuAllocation.offset,
+                    .dstOffset = dstBuf.GetOffset(),
+                    .size = size
+                };
+                commandBuffer.copyBuffer(srcGpuAllocation.buffer, dstBuf.GetBuffer()->GetBacking(), copyRegion);
+                // Make the transfer write visible to any subsequent access of the destination
+                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
+                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
+                }, {}, {});
+            });
+        });
+    }
+}
--- a/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h
@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include <soc/gm20b/gmmu.h>
+
+namespace skyline::gpu {
+    class GPU;
+}
+
+namespace skyline::soc::gm20b {
+    struct ChannelContext;
+}
+
+namespace skyline::gpu::interconnect {
+    class CommandExecutor;
+
+    /**
+     * @brief Translates Inline2Memory (I2M) engine uploads into host Vulkan operations
+     */
+    class Inline2Memory {
+      private:
+        using IOVA = soc::gm20b::IOVA;
+
+        GPU &gpu; //!< Host GPU whose buffer manager and megabuffer allocator are used for uploads
+        soc::gm20b::ChannelContext &channelCtx; //!< Channel whose address space is used to translate destination addresses
+        CommandExecutor &executor; //!< Executor of `channelCtx`, receives the recorded copy commands
+
+      public:
+        Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
+
+        /**
+         * @brief Writes the contents of `src` to the guest GPU virtual address `dst`
+         */
+        void Upload(IOVA dst, span<u32> src);
+    };
+}
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
@ -0,0 +1,61 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <gpu/buffer_manager.h>
+#include <soc/gm20b/gmmu.h>
+#include <soc/gm20b/channel.h>
+#include "maxwell_dma.h"
+
+namespace skyline::gpu::interconnect {
+    using IOVA = soc::gm20b::IOVA;
+
+    /**
+     * @brief Binds this interconnect to the host GPU and to the channel it serves
+     * @note The command executor reference is taken from the supplied channel context
+     */
+    MaxwellDma::MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx) : gpu{gpu}, channelCtx{channelCtx}, executor{channelCtx.executor} {}
+
+    /**
+     * @brief Performs a linear copy between two guest GPU virtual address ranges via the buffer manager
+     * @param dst Guest GPU virtual address of the destination range
+     * @param src Guest GPU virtual address of the source range
+     * @param size Size of both ranges in bytes, a size of zero is a no-op
+     * @note Only the first translated mapping of each range is used, a warning is logged when either range is split
+     */
+    void MaxwellDma::Copy(IOVA dst, IOVA src, size_t size) {
+        // Nothing to copy: bail out before touching `front()` of a possibly empty mapping list or recording a zero-sized copy (which Vulkan forbids)
+        if (size == 0)
+            return;
+
+        auto srcMappings{channelCtx.asCtx->gmmu.TranslateRange(src, size)};
+        auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, size)};
+
+        if (srcMappings.size() > 1 || dstMappings.size() > 1)
+            Logger::Warn("Split mapping are unsupported for DMA copies");
+
+        auto srcBuf{gpu.buffer.FindOrCreate(srcMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
+            executor.AttachLockedBuffer(buffer, std::move(lock));
+        })};
+        ContextLock srcBufLock{executor.tag, srcBuf};
+
+        auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
+            executor.AttachLockedBuffer(buffer, std::move(lock));
+        })};
+        ContextLock dstBufLock{executor.tag, dstBuf};
+
+        // NOTE(review): the callback presumably only runs when the copy has to be performed on the GPU - confirm against BufferView::CopyFrom
+        dstBuf.CopyFrom(srcBuf, [&]() {
+            executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
+            executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
+            // This will prevent any CPU accesses to backing for the duration of the usage
+            // GPU dirtiness will be handled on the CopyFrom end as it's not always necessary
+            srcBuf.GetBuffer()->BlockAllCpuBackingWrites();
+            dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
+
+            executor.AddOutsideRpCommand([srcBuf, dstBuf](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
+                // Prior GPU writes must be included in the source access mask, otherwise they aren't made available to the transfer and the copy could read stale data
+                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer, {}, vk::MemoryBarrier{
+                    .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+                    .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite
+                }, {}, {});
+                // Designators are listed in the declaration order of VkBufferCopy, the copy covers the entire source view
+                vk::BufferCopy copyRegion{
+                    .srcOffset = srcBuf.GetOffset(),
+                    .dstOffset = dstBuf.GetOffset(),
+                    .size = srcBuf.size
+                };
+                commandBuffer.copyBuffer(srcBuf.GetBuffer()->GetBacking(), dstBuf.GetBuffer()->GetBacking(), copyRegion);
+                // Make the transfer write visible to any subsequent access of the destination
+                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
+                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+                }, {}, {});
+            });
+        });
+    }
+}
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h
@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include <soc/gm20b/gmmu.h>
+
+namespace skyline::gpu {
+    class GPU;
+}
+
+namespace skyline::soc::gm20b {
+    struct ChannelContext;
+}
+
+namespace skyline::gpu::interconnect {
+    class CommandExecutor;
+
+    /**
+     * @brief Translates Maxwell DMA engine copies into host Vulkan operations
+     */
+    class MaxwellDma {
+      private:
+        using IOVA = soc::gm20b::IOVA;
+
+        GPU &gpu; //!< Host GPU whose buffer manager is used to look up the source and destination buffers
+        soc::gm20b::ChannelContext &channelCtx; //!< Channel whose address space is used to translate copy addresses
+        CommandExecutor &executor; //!< Executor of `channelCtx`, receives the recorded copy commands
+
+      public:
+        MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
+
+        /**
+         * @brief Copies `size` bytes from the guest GPU virtual address `src` to `dst`
+         */
+        void Copy(IOVA dst, IOVA src, size_t size);
+    };
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.cpp
@ -5,7 +5,9 @@
 #include "inline2memory.h"

 namespace skyline::soc::gm20b::engine {
-    Inline2MemoryBackend::Inline2MemoryBackend(ChannelContext &channelCtx) : channelCtx(channelCtx) {}
+    Inline2MemoryBackend::Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx)
+        : interconnect{*state.gpu, channelCtx},
+          channelCtx{channelCtx} {}

    void Inline2MemoryBackend::LaunchDma(Inline2MemoryBackend::RegisterState &state) {
        writeOffset = 0;
@ -17,13 +19,11 @@ namespace skyline::soc::gm20b::engine {
        if (state.launchDma.completion == RegisterState::DmaCompletionType::ReleaseSemaphore)
            throw exception("Semaphore release on I2M completion is not supported!");

-        channelCtx.executor.Submit();
-
        if (state.launchDma.layout == RegisterState::DmaDstMemoryLayout::Pitch && state.lineCount == 1) {
-            // TODO: we can do this with the buffer manager to avoid some overhead in the future
            Logger::Debug("range: 0x{:X} -> 0x{:X}", u64{state.offsetOut}, u64{state.offsetOut} + buffer.size() * 0x4);
-            channelCtx.asCtx->gmmu.Write(state.offsetOut, span(buffer));
+            interconnect.Upload(u64{state.offsetOut}, span{buffer});
        } else {
+            channelCtx.executor.Submit();
            Logger::Warn("Non-linear I2M uploads are not supported!");
        }
    }
@ -49,7 +49,7 @@ namespace skyline::soc::gm20b::engine {
            CompleteDma(state);
    }

-    Inline2Memory::Inline2Memory(ChannelContext &channelCtx) : backend(channelCtx) {}
+    Inline2Memory::Inline2Memory(const DeviceState &state, ChannelContext &channelCtx) : backend{state, channelCtx} {}

    __attribute__((always_inline)) void Inline2Memory::CallMethod(u32 method, u32 argument) {
        Logger::Verbose("Called method in I2M: 0x{:X} args: 0x{:X}", method, argument);
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.h
@ -4,6 +4,7 @@
 #pragma once

 #include <common.h>
+#include <gpu/interconnect/inline2memory.h>
 #include "engine.h"

 namespace skyline::soc::gm20b {
@ -18,6 +19,7 @@ namespace skyline::soc::gm20b::engine {
      private:
        std::vector<u32> buffer; //!< Temporary buffer to hold data being currently uploaded
        u32 writeOffset{}; //!< Current write offset in words into `buffer`
+        gpu::interconnect::Inline2Memory interconnect;
        ChannelContext &channelCtx;

      public:
@ -124,7 +126,7 @@ namespace skyline::soc::gm20b::engine {
        void CompleteDma(RegisterState &state);

      public:
-        Inline2MemoryBackend(ChannelContext &channelCtx);
+        Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx);

        /**
         * @brief Should be called when launchDma in `state` is written to
@ -164,7 +166,7 @@ namespace skyline::soc::gm20b::engine {
        } registers{};

      public:
-        Inline2Memory(ChannelContext &channelCtx);
+        Inline2Memory(const DeviceState &state, ChannelContext &channelCtx);

        void CallMethod(u32 method, u32 argument);

--- a/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp
@ -8,7 +8,7 @@

 namespace skyline::soc::gm20b::engine {
    KeplerCompute::KeplerCompute(const DeviceState &state, ChannelContext &channelCtx)
-        : syncpoints(state.soc->host1x.syncpoints), i2m(channelCtx) {}
+        : syncpoints{state.soc->host1x.syncpoints}, i2m{state, channelCtx} {}

    __attribute__((always_inline)) void KeplerCompute::CallMethod(u32 method, u32 argument) {
        Logger::Verbose("Called method in Kepler compute: 0x{:X} args: 0x{:X}", method, argument);
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
@ -11,8 +11,10 @@
 #include "maxwell_dma.h"

 namespace skyline::soc::gm20b::engine {
-    MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor)
-        : channelCtx(channelCtx), syncpoints(state.soc->host1x.syncpoints), executor(executor) {}
+    MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx)
+        : channelCtx{channelCtx},
+          syncpoints{state.soc->host1x.syncpoints},
+          interconnect{*state.gpu, channelCtx} {}

    __attribute__((always_inline)) void MaxwellDma::CallMethod(u32 method, u32 argument) {
        Logger::Verbose("Called method in Maxwell DMA: 0x{:X} args: 0x{:X}", method, argument);
@ -36,8 +38,8 @@ namespace skyline::soc::gm20b::engine {
            return;
        }

-        executor.Submit();
        if (registers.launchDma->multiLineEnable) {
+            channelCtx.executor.Submit();
            if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch &&
                registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear)
                CopyPitchToBlockLinear();
@ -51,7 +53,7 @@ namespace skyline::soc::gm20b::engine {
            // 1D buffer copy
            // TODO: implement swizzled 1D copies based on VMM 'kind'
            Logger::Debug("src: 0x{:X} dst: 0x{:X} size: 0x{:X}", u64{*registers.offsetIn}, u64{*registers.offsetOut}, *registers.lineLengthIn);
-            channelCtx.asCtx->gmmu.Copy(*registers.offsetOut, *registers.offsetIn, *registers.lineLengthIn);
+            interconnect.Copy(u64{*registers.offsetOut}, u64{*registers.offsetIn}, u64{*registers.lineLengthIn});
        }

        ReleaseSemaphore();
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
@ -3,6 +3,7 @@

 #pragma once

+#include <gpu/interconnect/maxwell_dma.h>
 #include "engine.h"

 namespace skyline::gpu::interconnect {
@ -21,7 +22,7 @@ namespace skyline::soc::gm20b::engine {
      private:
        host1x::SyncpointSet &syncpoints;
        ChannelContext &channelCtx;
-        gpu::interconnect::CommandExecutor &executor;
+        gpu::interconnect::MaxwellDma interconnect;

        void HandleMethod(u32 method, u32 argument);

@ -253,7 +254,7 @@ namespace skyline::soc::gm20b::engine {
        static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4));
        #pragma pack(pop)

-        MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor);
+        MaxwellDma(const DeviceState &state, ChannelContext &channelCtx);

        void CallMethod(u32 method, u32 argument);