Rework per-view megabuffering to cache allocs in the buffer itself

The original intention was to cache on the user side, but especially with shader constant buffers that's difficult and costly. Instead we can cache on the buffer side, with a page-table-like structure to hold variable-sized allocations indexed by the aligned view base address. This avoids most redundant copies from repeated use of the same buffer without updates in between.
This commit is contained in:
Billy Laws 2022-09-17 12:46:31 +01:00
parent b810470601
commit a24aec03a6
2 changed files with 78 additions and 48 deletions

View File

@ -82,7 +82,10 @@ namespace skyline::gpu {
backing{gpu.memory.AllocateBuffer(guest.size())},
guest{guest},
delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
id{id} {}
id{id},
megaBufferTableShift{std::max(std::bit_width(guest.size() / MegaBufferTableMaxEntries - 1), MegaBufferTableShiftMin)} {
megaBufferTable.resize(guest.size() / (1 << megaBufferTableShift));
}
Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id)
: gpu{gpu},
@ -270,12 +273,33 @@ namespace skyline::gpu {
return {};
}
std::pair<u64, span<u8>> Buffer::AcquireCurrentSequence() {
BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber,
vk::DeviceSize offset, vk::DeviceSize size) {
if (!SynchronizeGuest(false, true))
// Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate
return {};
return {sequenceNumber, mirror};
if (!everHadInlineUpdate)
// Don't megabuffer buffers that have never had inline updates and are not frequently synced since performance is only going to be harmed as a result of the constant copying and there wont be any benefit since there are no GPU inline updates that would be avoided
return {};
if (size > MegaBufferingDisableThreshold)
return {};
size_t entryIdx{offset >> megaBufferTableShift};
size_t bufferEntryOffset{entryIdx << megaBufferTableShift};
size_t entryViewOffset{offset - bufferEntryOffset};
auto &entry{megaBufferTable[entryIdx]};
// If the cached allocation is invalid or not up to date, allocate a new one
if (!entry.allocation || entry.executionNumber != executionNumber ||
entry.sequenceNumber != sequenceNumber || entry.allocation.region.size() + entryViewOffset < size) {
// Use max(oldSize, newSize) to avoid redundant reallocations within an execution if a larger allocation comes along later
auto mirrorAllocationRegion{mirror.subspan(bufferEntryOffset, std::max(entryViewOffset + size, entry.allocation.region.size()))};
entry.allocation = allocator.Push(pCycle, mirrorAllocationRegion, true);
}
return {entry.allocation.buffer, entry.allocation.offset + entryViewOffset, size};
}
void Buffer::AdvanceSequence() {
@ -359,30 +383,13 @@ namespace skyline::gpu {
GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset());
}
bool BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const {
// If megabuffering can't be enabled we have to do a GPU-side copy to ensure sequencing
bool gpuCopy{size > MegaBufferingDisableThreshold};
if (gpuCopy)
GetBuffer()->BlockSequencedCpuBackingWrites();
bool BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback,
span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const {
return GetBuffer()->Write(isFirstUsage, flushHostCallback, data, writeOffset + GetOffset(), gpuCopyCallback);
}
MegaBufferAllocator::Allocation BufferView::AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator) const {
if (!GetBuffer()->EverHadInlineUpdate())
// Don't megabuffer buffers that have never had inline updates since performance is only going to be harmed as a result of the constant copying and there won't be any benefit since there are no GPU inline updates that would be avoided
return {};
if (size > MegaBufferingDisableThreshold)
return {};
auto [newSequence, sequenceSpan]{GetBuffer()->AcquireCurrentSequence()};
if (!newSequence)
return {}; // If the sequence can't be acquired then the buffer is GPU dirty and we can't megabuffer
auto viewBackingSpan{sequenceSpan.subspan(GetOffset(), size)};
return allocator.Push(pCycle, viewBackingSpan, true); // Success!
BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, size_t sizeOverride) const {
return GetBuffer()->TryMegaBufferView(pCycle, allocator, executionNumber, GetOffset(), sizeOverride ? sizeOverride : size);
}
span<u8> BufferView::GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback) {

View File

@ -18,6 +18,24 @@ namespace skyline::gpu {
class BufferManager;
class BufferDelegate;
/**
 * @brief Represents a bound Vulkan buffer that can be used for state updates
 */
struct BufferBinding {
    vk::Buffer buffer{}; //!< The bound buffer, a null handle denotes an invalid binding
    vk::DeviceSize offset{}; //!< Byte offset of the bound region within the buffer
    vk::DeviceSize size{}; //!< Byte size of the bound region

    BufferBinding() = default;

    BufferBinding(vk::Buffer buffer, vk::DeviceSize offset = 0, vk::DeviceSize size = 0) : buffer{buffer}, offset{offset}, size{size} {}

    /**
     * @return Whether this binding refers to a valid (non-null) buffer
     * @note `explicit` to restrict the conversion to contextual use (e.g. `if (binding)`) and prevent accidental arithmetic/implicit bool conversions
     */
    explicit operator bool() const {
        return static_cast<bool>(buffer);
    }
};
/**
* @brief A buffer which is backed by host constructs while being synchronized with the underlying guest buffer
* @note This class conforms to the Lockable and BasicLockable C++ named requirements
@ -51,13 +69,24 @@ namespace skyline::gpu {
bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates
public:
static constexpr u64 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with
private:
u64 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views
constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 128}; //!< The threshold at which a view is considered to be too large to be megabuffered (128KiB)
/**
 * @brief Holds a single megabuffer copy with sequencing information for an offset within the buffer
 * @note An entry is only valid for reuse when both its execution and sequence numbers match the current ones, otherwise the cached copy may be stale
 */
struct MegaBufferTableEntry {
    MegaBufferAllocator::Allocation allocation{}; //!< The allocation in the megabuffer for the entry, can be any size
    size_t executionNumber{}; //!< Execution number of when the allocation was made; brace-initialized so a fresh entry is never compared against indeterminate values
    size_t sequenceNumber{}; //!< Sequence number of when the allocation was made
};
static constexpr int MegaBufferTableShiftMin{std::countr_zero(0x100U)}; //!< The minimum shift for megabuffer table entries, giving an alignment of at least 256 bytes
static constexpr size_t MegaBufferTableMaxEntries{0x500U}; //!< Maximum number of entries in the megabuffer table, `megaBufferTableShift` is set based on this and the total buffer size
int megaBufferTableShift; //!< Shift to apply to buffer offsets to get their megabuffer table index
std::vector<MegaBufferTableEntry> megaBufferTable; //!< Table of megabuffer allocations for regions of the buffer
private:
BufferDelegate *delegate;
@ -184,13 +213,6 @@ namespace skyline::gpu {
return SequencedCpuBackingWritesBlocked();
}
/**
* @note The buffer **must** be locked prior to calling this
*/
bool EverHadInlineUpdate() const {
return everHadInlineUpdate;
}
/**
* @brief Waits on a fence cycle if it exists till it's signalled and resets it after
* @note The buffer **must** be locked prior to calling this
@ -263,14 +285,14 @@ namespace skyline::gpu {
*/
BufferView TryGetView(span<u8> mapping);
/**
* @brief Attempts to return the current sequence number and prepare the buffer for read accesses from the returned span
* @return The current sequence number and a span of the buffers guest mirror given that the buffer is not GPU dirty, if it is then a zero sequence number is returned
* @note The contents of the returned span can be cached safely given the sequence number is unchanged
/**
* @brief If megabuffering is determined to be beneficial for this buffer, allocates and copies the given view of buffer into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region
* @return A binding to the megabuffer allocation for the view, may be invalid if megabuffering is not beneficial
* @note The buffer **must** be locked prior to calling this
* @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty
*/
std::pair<u64, span<u8>> AcquireCurrentSequence();
BufferBinding TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber,
vk::DeviceSize offset, vk::DeviceSize size);
/**
* @brief Increments the sequence number of the buffer, any further calls to AcquireCurrentSequence will return this new sequence number. See the comment for `sequenceNumber`
@ -330,8 +352,6 @@ namespace skyline::gpu {
*/
class BufferView {
private:
constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 128}; //!< The threshold at which the view is considered to be too large to be megabuffered (128KiB)
BufferDelegate *delegate{};
vk::DeviceSize offset{};
@ -418,14 +438,17 @@ namespace skyline::gpu {
* @note The view **must** be locked prior to calling this
* @note See Buffer::Write
*/
bool Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
bool Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback,
span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
/**
* @brief If megabuffering is beneficial for the view, pushes its contents into the megabuffer and returns the offset of the pushed data
* @return The megabuffer allocation for the view, may be invalid if megabuffering is not beneficial
/**
* @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region
* @param sizeOverride If non-zero, specifies the size of the megabuffer region to allocate and copy to, *MUST* be smaller than the size of the view
* @note The view **must** be locked prior to calling this
* @note See Buffer::TryMegaBufferView
*/
MegaBufferAllocator::Allocation AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator) const;
BufferBinding TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, size_t sizeOverride = 0) const;
/**
* @return A span of the backing buffer contents