Mirror of https://github.com/skyline-emu/skyline.git (synced 2024-12-29 14:45:28 +03:00)
Use a linear allocator for most per-execution GPU allocations
Currently we heavily thrash the heap on each draw, with malloc/free taking up about 10% of GPFIFO execution time. Using a linear allocator for the main offenders, buffer usage callbacks and index/vertex state, helps reduce this to about 4%.
parent 70eec5a414
commit 683cd594ad
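For context on the approach: a linear (bump) allocator serves each allocation by advancing an offset into a preallocated block, individual frees are no-ops, and everything is reclaimed at once by a single reset, which matches the lifetime of per-execution state exactly. The sketch below is only illustrative of the idea; it is not skyline's LinearAllocatorState<> implementation and all names in it are made up.

// Illustrative bump allocator sketch; not skyline's LinearAllocatorState<>, all names here are hypothetical
#include <cstddef>
#include <memory>
#include <new>

class BumpAllocatorSketch {
  public:
    explicit BumpAllocatorSketch(std::size_t capacity) : storage{std::make_unique<std::byte[]>(capacity)}, capacity{capacity} {}

    // Allocation is a pointer bump plus an alignment round-up, with no per-object bookkeeping
    void *Allocate(std::size_t size, std::size_t alignment = alignof(std::max_align_t)) {
        std::size_t aligned{(offset + alignment - 1) & ~(alignment - 1)};
        if (aligned + size > capacity)
            throw std::bad_alloc{}; // A real implementation would chain another block instead of failing
        offset = aligned + size;
        return storage.get() + aligned;
    }

    // Individual frees are no-ops; everything is reclaimed at once, matching a per-execution lifetime
    void Reset() {
        offset = 0;
    }

  private:
    std::unique_ptr<std::byte[]> storage;
    std::size_t capacity;
    std::size_t offset{};
};

In the diff below, one such allocator state lives on the CommandExecutor (LinearAllocatorState<> allocator) and is reset once per execution via allocator.Reset(), so callbacks and bound index/vertex records recorded during that execution are all released in O(1).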
@@ -320,19 +320,15 @@ namespace skyline::gpu {
 
     BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
 
-    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+    void BufferView::RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback) {
+        if (!bufferDelegate->usageCallbacks)
+            bufferDelegate->usageCallbacks = decltype(bufferDelegate->usageCallbacks)::value_type{allocator};
+
         // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further sequenced writes in the current cycle to occur on the GPU
         bufferDelegate->buffer->BlockSequencedCpuBackingWrites();
 
         usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
-        if (!bufferDelegate->usageCallback) {
-            bufferDelegate->usageCallback = usageCallback;
-        } else {
-            bufferDelegate->usageCallback = [usageCallback, oldCallback = std::move(bufferDelegate->usageCallback)](const Buffer::BufferViewStorage &pView, const std::shared_ptr<Buffer> &buffer) {
-                oldCallback(pView, buffer);
-                usageCallback(pView, buffer);
-            };
-        }
+        bufferDelegate->usageCallbacks->emplace_back(std::move(usageCallback));
     }
 
     void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
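Previously, each call to RegisterUsage after the first wrapped the existing std::function in a new composing lambda, so every additional registration re-built a std::function on the heap; the new code simply appends the callback to a vector whose element storage comes from the per-execution linear allocator. The vector is held in a std::optional and constructed lazily because the allocator belongs to the executor and is not available when the delegate itself is created. Below is a minimal, hypothetical sketch of that storage pattern (ArenaSketch, ArenaAllocator and UsageCallbackSketch are illustrative names, not skyline's LinearAllocator<T>):

// Sketch of storing callbacks in arena-backed std::vector storage; all names here are hypothetical
#include <cstddef>
#include <functional>
#include <optional>
#include <vector>

struct ArenaSketch {
    std::byte storage[1 << 16];
    std::size_t offset{};

    // Bump allocation; bounds checking omitted for brevity
    void *Allocate(std::size_t size, std::size_t alignment) {
        offset = (offset + alignment - 1) & ~(alignment - 1);
        void *result{storage + offset};
        offset += size;
        return result;
    }

    void Reset() { offset = 0; } // Reclaims every allocation at once
};

template<typename T>
struct ArenaAllocator {
    using value_type = T;
    ArenaSketch *arena;

    ArenaAllocator(ArenaSketch &arena) : arena{&arena} {}
    template<typename U> ArenaAllocator(const ArenaAllocator<U> &other) : arena{other.arena} {}

    T *allocate(std::size_t count) { return static_cast<T *>(arena->Allocate(count * sizeof(T), alignof(T))); }
    void deallocate(T *, std::size_t) {} // No-op; memory is reclaimed by ArenaSketch::Reset()

    friend bool operator==(const ArenaAllocator &lhs, const ArenaAllocator &rhs) { return lhs.arena == rhs.arena; }
    friend bool operator!=(const ArenaAllocator &lhs, const ArenaAllocator &rhs) { return lhs.arena != rhs.arena; }
};

// Simplified callback signature; the real UsageCallback also receives the view and buffer
using UsageCallbackSketch = std::function<void()>;
using CallbackList = std::vector<UsageCallbackSketch, ArenaAllocator<UsageCallbackSketch>>;

int main() {
    ArenaSketch arena;
    std::optional<CallbackList> callbacks;

    // Mirrors RegisterUsage: construct the vector lazily with the executor's allocator, then append;
    // the vector's element storage comes from the arena (std::function may still heap-allocate
    // internally for captures that don't fit its small-buffer storage)
    if (!callbacks)
        callbacks.emplace(ArenaAllocator<UsageCallbackSketch>{arena});
    callbacks->emplace_back([] { /* e.g. write a descriptor or bound buffer handle */ });

    // Replaying the list in order is equivalent to the removed chained-std::function composition
    for (auto &callback : *callbacks)
        callback();
}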
@@ -6,6 +6,7 @@
 #include <unordered_set>
 #include <boost/functional/hash.hpp>
 #include <common/lockable_shared_ptr.h>
+#include <common/linear_allocator.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
 #include "megabuffer.h"
@@ -101,7 +102,8 @@ namespace skyline::gpu {
             LockableSharedPtr<Buffer> buffer;
             const Buffer::BufferViewStorage *view;
             bool attached{};
-            std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)> usageCallback;
+            using UsageCallback = std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)>;
+            std::optional<std::vector<UsageCallback, LinearAllocator<UsageCallback>>> usageCallbacks;
             std::list<BufferDelegate *>::iterator iterator;
 
             BufferDelegate(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
@@ -405,7 +407,7 @@ namespace skyline::gpu {
         * @note The callback will be automatically called the first time after registration
         * @note The view **must** be locked prior to calling this
         */
-        void RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
+        void RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback);
 
        /**
         * @brief Reads data at the specified offset in the view
@@ -152,8 +152,9 @@ namespace skyline::gpu {
            // Transfer all delegates references from the overlapping buffer to the new buffer
            for (auto &delegate : srcBuffer->delegates) {
                delegate->buffer = *newBuffer;
-               if (delegate->usageCallback)
-                   delegate->usageCallback(*delegate->view, *newBuffer);
+               if (delegate->usageCallbacks)
+                   for (auto &callback : *delegate->usageCallbacks)
+                       callback(*delegate->view, *newBuffer);
            }
 
            newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates);
@@ -324,7 +324,7 @@ namespace skyline::gpu::interconnect {
        textureManagerLock.reset();
 
        for (const auto &delegate : attachedBufferDelegates) {
-           delegate->usageCallback = nullptr;
+           delegate->usageCallbacks.reset();
            delegate->attached = false;
            delegate->view->megaBufferAllocation = {};
        }
@@ -333,6 +333,7 @@
        attachedBuffers.clear();
        bufferManagerLock.reset();
        megaBufferAllocatorLock.reset();
+       allocator.Reset();
    }
 
    void CommandExecutor::Submit() {
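A consequence of arena-backing these objects, visible in the two hunks above, is that teardown order matters: anything whose storage came from the linear allocator must be destroyed during the executor's per-execution cleanup before allocator.Reset() runs, since the next execution will reuse that memory. Condensed from the diff above (surrounding cleanup omitted):

for (const auto &delegate : attachedBufferDelegates)
    delegate->usageCallbacks.reset(); // Drop arena-backed callback vectors while their storage is still valid

// ... other per-execution state is cleared here ...
allocator.Reset(); // Only then is the arena recycled for the next execution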
@@ -5,6 +5,7 @@
 
 #include <boost/container/stable_vector.hpp>
 #include <unordered_set>
+#include <common/linear_allocator.h>
 #include <gpu/megabuffer.h>
 #include "command_nodes.h"
 
@@ -98,6 +99,7 @@ namespace skyline::gpu::interconnect {
 
      public:
        std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
+       LinearAllocatorState<> allocator;
        ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
 
        CommandExecutor(const DeviceState &state);
@@ -1122,7 +1122,7 @@ namespace skyline::gpu::interconnect {
                    .range = view->view->size
                };
            } else {
-               view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+               view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                    *descriptor = vk::DescriptorBufferInfo{
                        .buffer = buffer->GetBacking(),
                        .offset = view.offset,
@@ -1157,7 +1157,7 @@
            if (storageBuffer.is_written)
                view->buffer->MarkGpuDirty();
 
-           view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+           view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                *descriptor = vk::DescriptorBufferInfo{
                    .buffer = buffer->GetBacking(),
                    .offset = view.offset,
@@ -2829,14 +2829,14 @@ namespace skyline::gpu::interconnect {
            auto indexBufferView{GetIndexBuffer(count)};
            executor.AttachBuffer(indexBufferView);
 
-           boundIndexBuffer = std::make_shared<BoundIndexBuffer>();
+           boundIndexBuffer = std::allocate_shared<BoundIndexBuffer, LinearAllocator<BoundIndexBuffer>>(executor.allocator);
            boundIndexBuffer->type = indexBuffer.type;
            if (auto megaBufferAllocation{indexBufferView.AcquireMegaBuffer(executor.cycle, executor.AcquireMegaBufferAllocator())}) {
                // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
                boundIndexBuffer->handle = megaBufferAllocation.buffer;
                boundIndexBuffer->offset = megaBufferAllocation.offset;
            } else {
-               indexBufferView.RegisterUsage(executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+               indexBufferView.RegisterUsage(executor.allocator, executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                    boundIndexBuffer->handle = buffer->GetBacking();
                    boundIndexBuffer->offset = view.offset;
                });
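The std::make_shared to std::allocate_shared switch above (and the identical change for the vertex-buffer state in the next hunk) places both the shared_ptr control block and the bound-state object in linear-allocator memory, so these per-draw records no longer touch the general heap. This is only safe because the shared_ptrs do not outlive the execution whose arena they were allocated from. A short sketch of the pattern, reusing the hypothetical ArenaSketch/ArenaAllocator types from the earlier sketch (BoundIndexBufferSketch is likewise a made-up stand-in):

#include <cstdint>
#include <memory>

struct BoundIndexBufferSketch {
    std::uint64_t handle{};
    std::size_t offset{};
};

// Equivalent in spirit to the diff's std::allocate_shared<BoundIndexBuffer, LinearAllocator<BoundIndexBuffer>>(executor.allocator):
// the control block and the object share one arena allocation instead of a malloc each
ArenaSketch arena;
auto bound{std::allocate_shared<BoundIndexBufferSketch>(ArenaAllocator<BoundIndexBufferSketch>{arena})};
// The shared_ptr must be dropped before the arena is reset, as the executor does at the end of each execution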
@@ -2858,7 +2858,7 @@
                std::array<vk::Buffer, maxwell3d::VertexBufferCount> handles{};
                std::array<vk::DeviceSize, maxwell3d::VertexBufferCount> offsets{};
            };
-           auto boundVertexBuffers{std::make_shared<BoundVertexBuffers>()};
+           auto boundVertexBuffers{std::allocate_shared<BoundVertexBuffers, LinearAllocator<BoundVertexBuffers>>(executor.allocator)};
 
            boost::container::static_vector<vk::VertexInputBindingDescription, maxwell3d::VertexBufferCount> vertexBindingDescriptions{};
            boost::container::static_vector<vk::VertexInputBindingDivisorDescriptionEXT, maxwell3d::VertexBufferCount> vertexBindingDivisorsDescriptions{};
@@ -2877,7 +2877,7 @@
                boundVertexBuffers->handles[index] = megaBufferAllocation.buffer;
                boundVertexBuffers->offsets[index] = megaBufferAllocation.offset;
            } else {
-               vertexBufferView.RegisterUsage(executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+               vertexBufferView.RegisterUsage(executor.allocator, executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                    *handle = buffer->GetBacking();
                    *offset = view.offset;
                });