From b7d0f2fafabad3a940aa863dbc2dc083a2c86993 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Wed, 13 Oct 2021 21:46:30 +0100
Subject: [PATCH] Implement support for pushbuffer methods split across
 multiple GpEntries

These are used heavily in OpenGL games, which now, together with the
previous syncpoint changes, work perfectly. The implementation is
rather novel: instead of using a per-class state machine for all
methods, we only use it for those that are known to be split across
GpEntry boundaries. As a result, only a single bounds check is added to
the hot path of contiguous method execution and the performance loss is
negligible.
---
 .../services/nvdrv/devices/nvhost/as_gpu.h    |  4 +-
 .../nvdrv/devices/nvhost/gpu_channel.h        | 12 +--
 app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp | 85 ++++++++++++++++---
 app/src/main/cpp/skyline/soc/gm20b/gpfifo.h   | 24 +++++-
 4 files changed, 106 insertions(+), 19 deletions(-)
diff --git a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
index a9a64062..1487b2de 100644
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
@@ -62,12 +62,12 @@ namespace skyline::service::nvdrv::device::nvhost {
             using Allocator = FlatAllocator<u32, 0, 32>;
 
             std::unique_ptr<Allocator> bigPageAllocator;
-            std::shared_ptr<Allocator> smallPageAllocator; // Shared as this is also used by nvhost::GpuChannel
+            std::shared_ptr<Allocator> smallPageAllocator; //!< Shared as this is also used by nvhost::GpuChannel
 
             bool initialised{};
         } vm;
 
-        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx;
+        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx; //!< The guest GPU AS context that is associated with each particular instance
 
         friend GpuChannel;
 
diff --git a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.h b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.h
index 5d35e7a1..b5f3461c 100644
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.h
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.h
@@ -23,14 +23,14 @@ namespace skyline::service::nvdrv::device::nvhost {
         std::shared_ptr<type::KEvent> smExceptionBreakpointPauseReportEvent;
         std::shared_ptr<type::KEvent> errorNotifierEvent;
 
-        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx;
-        std::shared_ptr<AsGpu::VM::Allocator> asAllocator;
-        std::unique_ptr<soc::gm20b::ChannelContext> channelCtx;
+        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx; //!< The guest GPU AS context submits from this channel are bound to
+        std::shared_ptr<AsGpu::VM::Allocator> asAllocator; //!< The small page allocator context for the AS that's bound to this channel, used to allocate space for `pushBufferMemory`
+        std::unique_ptr<soc::gm20b::ChannelContext> channelCtx; //!< The entire guest GPU context specific to this channel
 
 
-        u64 pushBufferAddr{};
-        size_t pushBufferMemoryOffset{};
-        std::vector<u32> pushBufferMemory;
+        u64 pushBufferAddr{}; //!< The GPU address `pushBufferMemory` is mapped to
+        size_t pushBufferMemoryOffset{}; //!< The current offset at which to write new pushbuffer method data for post-increment and pre-wait
+        std::vector<u32> pushBufferMemory; //!< Mapped into the guest GPU AS and used to store method data for pre/post increment commands
 
         friend AsGpu;
 
diff --git a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
index cc3cb865..23c580d5 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@@ -115,28 +115,92 @@ namespace skyline::soc::gm20b {
         pushBufferData.resize(gpEntry.size);
         channelCtx.asCtx->gmmu.Read<u32>(pushBufferData, gpEntry.Address());
 
-        for (auto entry{pushBufferData.begin()}; entry != pushBufferData.end(); entry++) {
+        // There will be at least one entry here
+        auto entry{pushBufferData.begin()};
+
+        // Executes the current split method, returning once execution is finished or the current GpEntry has reached its end (any arguments still outstanding are left in `resumeState` for the next GpEntry)
+        auto resumeSplitMethod{[&](){
+            switch (resumeState.state) {
+                case MethodResumeState::State::Inc:
+                    while (entry != pushBufferData.end() && resumeState.remaining)
+                        Send(resumeState.address++, *(entry++), resumeState.subChannel, --resumeState.remaining == 0);
+
+                    break;
+                case MethodResumeState::State::OneInc:
+                    if (entry == pushBufferData.end())
+                        break; // The method header was the last word of this GpEntry so there is no argument to send yet, stay in OneInc so that the first argument of the next GpEntry still receives the increment
+                    Send(resumeState.address++, *(entry++), resumeState.subChannel, --resumeState.remaining == 0);
+                    resumeState.state = MethodResumeState::State::NonInc; // After the first increment OneInc methods work the same as a NonInc method, this is needed so they can resume correctly if they are broken up by multiple GpEntries
+                    [[fallthrough]];
+                case MethodResumeState::State::NonInc:
+                    while (entry != pushBufferData.end() && resumeState.remaining)
+                        Send(resumeState.address, *(entry++), resumeState.subChannel, --resumeState.remaining == 0);
+
+                    break;
+            }
+        }};
+
+        // We've a method from a previous GpEntry that needs resuming
+        if (resumeState.remaining)
+            resumeSplitMethod();
+
+        // Process more methods if the entries are still not all used up after handling resuming
+        for (; entry != pushBufferData.end(); entry++) {
             // An entry containing all zeroes is a NOP, skip over it
             if (*entry == 0)
                 continue;
 
             PushBufferMethodHeader methodHeader{.raw = *entry};
+
+            // Needed in order to check for methods split across multiple GpEntries
+            auto remainingEntries{std::distance(entry, pushBufferData.end()) - 1};
+
+            // Handles storing state and initial execution for methods that are split across multiple GpEntries: the header's count, address and subchannel are saved into `resumeState` together with `methodState`, then `resumeSplitMethod` is run on whatever is left of this GpEntry
+            auto startSplitMethod{[&](auto methodState) {
+                resumeState = {
+                    .remaining = methodHeader.methodCount,
+                    .address = methodHeader.methodAddress,
+                    .subChannel = methodHeader.methodSubChannel,
+                    .state = methodState
+                };
+
+                // Skip over method header as `resumeSplitMethod` doesn't expect it to be there
+                entry++;
+
+                resumeSplitMethod();
+            }};
+
             switch (methodHeader.secOp) {
                 case PushBufferMethodHeader::SecOp::IncMethod:
-                    for (u32 i{}; i < methodHeader.methodCount; i++)
-                        Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
-                    break;
+                    if (remainingEntries >= methodHeader.methodCount) {
+                        for (u32 i{}; i < methodHeader.methodCount; i++)
+                            Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
 
+                        break;
+                    } else {
+                        startSplitMethod(MethodResumeState::State::Inc);
+                        return;
+                    }
                 case PushBufferMethodHeader::SecOp::NonIncMethod:
-                    for (u32 i{}; i < methodHeader.methodCount; i++)
-                        Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
-                    break;
+                    if (remainingEntries >= methodHeader.methodCount) {
+                        for (u32 i{}; i < methodHeader.methodCount; i++)
+                            Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
 
+                        break;
+                    } else {
+                        startSplitMethod(MethodResumeState::State::NonInc);
+                        return;
+                    }
                 case PushBufferMethodHeader::SecOp::OneInc:
-                    for (u32 i{}; i < methodHeader.methodCount; i++)
-                        Send(methodHeader.methodAddress + !!i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
-                    break;
+                    if (remainingEntries >= methodHeader.methodCount) {
+                        for (u32 i{}; i < methodHeader.methodCount; i++)
+                            Send(methodHeader.methodAddress + (i ? 1 : 0), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
 
+                        break;
+                    } else {
+                        startSplitMethod(MethodResumeState::State::OneInc);
+                        return;
+                    }
                 case PushBufferMethodHeader::SecOp::ImmdDataMethod:
                     Send(methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true);
                     break;
@@ -154,6 +218,7 @@ namespace skyline::soc::gm20b {
         pthread_setname_np(pthread_self(), "GPFIFO");
         try {
             signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
+
             gpEntries.Process([this](GpEntry gpEntry) {
                 state.logger->Debug("Processing pushbuffer: 0x{:X}, Size: 0x{:X}", gpEntry.Address(), +gpEntry.size);
                 Process(gpEntry);
diff --git a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
index c8222c97..00994e07 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
@@ -87,6 +87,7 @@ namespace skyline::soc::gm20b {
      * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_pbdma.ref.txt#L62
      */
     class ChannelGpfifo {
+      private:
         const DeviceState &state;
         ChannelContext &channelCtx;
         engine::GPFIFO gpfifoEngine; //!< The engine for processing GPFIFO method calls
@@ -94,11 +95,32 @@ namespace skyline::soc::gm20b {
         std::thread thread; //!< The thread that manages processing of pushbuffers
         std::vector<u32> pushBufferData; //!< Persistent vector storing pushbuffer data to avoid constant reallocations
 
+        /**
+         * @brief Holds the required state in order to resume a method started from one call to `Process` in another
+         * @note This is needed as games (especially OpenGL ones) can split method entries over multiple GpEntries
+         */
+        struct MethodResumeState {
+            u32 remaining; //!< The number of entries left to handle until the method is finished
+            u32 address; //!< The method address in the GPU block specified by `subChannel` that is the target of the command
+            u8 subChannel; //!< The subchannel the method is being sent to, taken from the method header that started it
+
+            /**
+             * @brief This is a simplified version of the full method type enum
+             */
+            enum class State : u8 {
+                NonInc, //!< Every argument is sent to the same method address
+                Inc, //!< The method address is incremented after every argument
+                OneInc //!< The method address is incremented after the first argument only, will be switched to NonInc after the first call
+            } state; //!< The type of method to resume
+        } resumeState{};
+
+
         /**
          * @brief Sends a method call to the GPU hardware
          */
         void Send(u32 method, u32 argument, u32 subchannel, bool lastCall);
 
+
         /**
          * @brief Processes the pushbuffer contained within the given GpEntry, calling methods as needed
          */
@@ -118,7 +140,7 @@ namespace skyline::soc::gm20b {
         void Run();
 
         /**
-         * @brief Pushes a list of entries to the FIFO, these commands will be executed on calls to 'Step'
+         * @brief Pushes a list of entries to the FIFO, these commands will be executed on calls to 'Process'
          */
         void Push(span<GpEntry> entries);