Implement support for pushbuffer methods split across multiple GpEntries

These are used heavily in OpenGL games, which now, together with the previous syncpoint changes, work perfectly. The implementation is rather novel: rather than using a per-class state machine for all methods, we only use one for those methods that are known to be split across GpEntry boundaries. As a result, only a single bounds check is added to the hot path of contiguous method execution and the performance loss is negligible.
2025-07-22 18:41:33 +03:00 · 2021-10-13 21:46:30 +01:00 · 2021-10-13 21:46:30 +01:00 · b7d0f2fafa
commit b7d0f2fafa
parent fc017e1e95
4 changed files with 106 additions and 19 deletions
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
@ -62,12 +62,12 @@ namespace skyline::service::nvdrv::device::nvhost {
            using Allocator = FlatAllocator<u32, 0, 32>;

            std::unique_ptr<Allocator> bigPageAllocator;
-            std::shared_ptr<Allocator> smallPageAllocator; // Shared as this is also used by nvhost::GpuChannel
+            std::shared_ptr<Allocator> smallPageAllocator; //!< Shared as this is also used by nvhost::GpuChannel

            bool initialised{};
        } vm;

-        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx;
+        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx; //!< The guest GPU AS context that is associated with each particular instance

        friend GpuChannel;

--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.h
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.h
@ -23,14 +23,14 @@ namespace skyline::service::nvdrv::device::nvhost {
        std::shared_ptr<type::KEvent> smExceptionBreakpointPauseReportEvent;
        std::shared_ptr<type::KEvent> errorNotifierEvent;

-        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx;
-        std::shared_ptr<AsGpu::VM::Allocator> asAllocator;
-        std::unique_ptr<soc::gm20b::ChannelContext> channelCtx;
+        std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx; //!< The guest GPU AS context submits from this channel are bound to
+        std::shared_ptr<AsGpu::VM::Allocator> asAllocator; //!< The small page allocator context for the AS that's bound to this channel, used to allocate space for `pushBufferMemory`
+        std::unique_ptr<soc::gm20b::ChannelContext> channelCtx; //!< The entire guest GPU context specific to this channel


-        u64 pushBufferAddr{};
-        size_t pushBufferMemoryOffset{};
-        std::vector<u32> pushBufferMemory;
+        u64 pushBufferAddr{}; //!< The GPU address `pushBufferMemory` is mapped to
+        size_t pushBufferMemoryOffset{}; //!< The current offset at which new pushbuffer method data for post-increment and pre-wait is written
+        std::vector<u32> pushBufferMemory; //!< Mapped into the guest GPU AS and used to store method data for pre/post increment commands

        friend AsGpu;

--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@ -115,28 +115,92 @@ namespace skyline::soc::gm20b {
        pushBufferData.resize(gpEntry.size);
        channelCtx.asCtx->gmmu.Read<u32>(pushBufferData, gpEntry.Address());

-        for (auto entry{pushBufferData.begin()}; entry != pushBufferData.end(); entry++) {
+        // There will be at least one entry here
+        auto entry{pushBufferData.begin()};
+
+        // Executes the split method described by `resumeState`, consuming entries starting at `entry`
+        // Returns once the method has received all of its arguments or the current GpEntry has run out of entries, whichever comes first
+        auto resumeSplitMethod{[&](){
+            switch (resumeState.state) {
+                case MethodResumeState::State::Inc:
+                    // The method address advances by one for every argument consumed
+                    while (entry != pushBufferData.end() && resumeState.remaining)
+                        Send(resumeState.address++, *(entry++), resumeState.subChannel, --resumeState.remaining == 0);
+
+                    break;
+                case MethodResumeState::State::OneInc:
+                    // The method header can be the very last entry of a GpEntry, in which case there is no argument to consume yet
+                    // The state has to stay as OneInc so the single increment still happens when the next GpEntry supplies the first argument
+                    if (entry == pushBufferData.end())
+                        break;
+
+                    Send(resumeState.address++, *(entry++), resumeState.subChannel, --resumeState.remaining == 0);
+
+                    // After the first increment OneInc methods work the same as a NonInc method, this is needed so they can resume correctly if they are broken up by multiple GpEntries
+                    resumeState.state = MethodResumeState::State::NonInc;
+                    [[fallthrough]];
+                case MethodResumeState::State::NonInc:
+                    // The method address stays fixed for every remaining argument
+                    while (entry != pushBufferData.end() && resumeState.remaining)
+                        Send(resumeState.address, *(entry++), resumeState.subChannel, --resumeState.remaining == 0);
+
+                    break;
+            }
+        }};
+
+        // We've a method from a previous GpEntry that needs resuming
+        if (resumeState.remaining)
+            resumeSplitMethod();
+
+        // Process more methods if the entries are still not all used up after handling resuming
+        for (; entry != pushBufferData.end(); entry++) {
            // An entry containing all zeroes is a NOP, skip over it
            if (*entry == 0)
                continue;

            PushBufferMethodHeader methodHeader{.raw = *entry};
+
+            // Needed in order to check for methods split across multiple GpEntries
+            auto remainingEntries{std::distance(entry, pushBufferData.end()) - 1};
+
+            // Stores the resume state for a method whose arguments extend past the end of this GpEntry, then executes whichever of its arguments are present in this GpEntry
+            auto startSplitMethod{[&](auto methodState) {
+                resumeState = {
+                    .remaining = methodHeader.methodCount,
+                    .address = methodHeader.methodAddress,
+                    .subChannel = methodHeader.methodSubChannel,
+                    .state = methodState
+                };
+
+                // Skip over the method header as `resumeSplitMethod` expects `entry` to point at the method's first argument
+                entry++;
+
+                resumeSplitMethod();
+            }};
+
            switch (methodHeader.secOp) {
                case PushBufferMethodHeader::SecOp::IncMethod:
-                    for (u32 i{}; i < methodHeader.methodCount; i++)
-                        Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
-                    break;
+                    if (remainingEntries >= methodHeader.methodCount) {
+                        for (u32 i{}; i < methodHeader.methodCount; i++)
+                            Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);

+                        break;
+                    } else {
+                        startSplitMethod(MethodResumeState::State::Inc);
+                        return;
+                    }
                case PushBufferMethodHeader::SecOp::NonIncMethod:
-                    for (u32 i{}; i < methodHeader.methodCount; i++)
-                        Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
-                    break;
+                    if (remainingEntries >= methodHeader.methodCount) {
+                        for (u32 i{}; i < methodHeader.methodCount; i++)
+                            Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);

+                        break;
+                    } else {
+                        startSplitMethod(MethodResumeState::State::NonInc);
+                        return;
+                    }
                case PushBufferMethodHeader::SecOp::OneInc:
-                    for (u32 i{}; i < methodHeader.methodCount; i++)
-                        Send(methodHeader.methodAddress + !!i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
-                    break;
+                    if (remainingEntries >= methodHeader.methodCount) {
+                        for (u32 i{}; i < methodHeader.methodCount; i++)
+                            Send(methodHeader.methodAddress + (i ? 1 : 0), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);

+                        break;
+                    } else {
+                        startSplitMethod(MethodResumeState::State::OneInc);
+                        return;
+                    }
                case PushBufferMethodHeader::SecOp::ImmdDataMethod:
                    Send(methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true);
                    break;
@ -154,6 +218,7 @@ namespace skyline::soc::gm20b {
        pthread_setname_np(pthread_self(), "GPFIFO");
        try {
            signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
+
            gpEntries.Process([this](GpEntry gpEntry) {
                state.logger->Debug("Processing pushbuffer: 0x{:X}, Size: 0x{:X}", gpEntry.Address(), +gpEntry.size);
                Process(gpEntry);
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
@ -87,6 +87,7 @@ namespace skyline::soc::gm20b {
     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_pbdma.ref.txt#L62
     */
    class ChannelGpfifo {
+      private:
        const DeviceState &state;
        ChannelContext &channelCtx;
        engine::GPFIFO gpfifoEngine; //!< The engine for processing GPFIFO method calls
@ -94,11 +95,32 @@ namespace skyline::soc::gm20b {
        std::thread thread; //!< The thread that manages processing of pushbuffers
        std::vector<u32> pushBufferData; //!< Persistent vector storing pushbuffer data to avoid constant reallocations

+        /**
+         * @brief Holds the required state in order to resume a method started from one call to `Process` in another
+         * @note This is needed as games (especially OpenGL ones) can split method entries over multiple GpEntries
+         */
+        struct MethodResumeState {
+            u32 remaining; //!< The number of entries left to handle until the method is finished, zero when there is no method to resume
+            u32 address; //!< The method address in the GPU block specified by `subChannel` that is the target of the command
+            u8 subChannel; //!< The subchannel that is passed to `Send` alongside each of the method's remaining arguments
+
+            /**
+             * @brief This is a simplified version of the full method type enum
+             */
+            enum class State : u8 {
+                NonInc,
+                Inc,
+                OneInc //!< Will be switched to NonInc after the first call
+            } state; //!< The type of method to resume
+        } resumeState{};
+
+
        /**
         * @brief Sends a method call to the GPU hardware
         */
        void Send(u32 method, u32 argument, u32 subchannel, bool lastCall);

+
        /**
         * @brief Processes the pushbuffer contained within the given GpEntry, calling methods as needed
         */
@ -118,7 +140,7 @@ namespace skyline::soc::gm20b {
        void Run();

        /**
-         * @brief Pushes a list of entries to the FIFO, these commands will be executed on calls to 'Step'
+         * @brief Pushes a list of entries to the FIFO, these commands will be executed on calls to 'Process'
         */
        void Push(span<GpEntry> entries);