Mirror of https://github.com/skyline-emu/skyline.git (synced 2024-12-29 14:45:28 +03:00)
Use a linear allocator for most per-execution GPU allocations
Currently we heavily thrash the heap on each draw, with malloc/free taking up about 10% of GPFIFO execution time. Using a linear allocator for the main offenders, buffer usage callbacks and index/vertex state, helps reduce this to about 4%.
parent 70eec5a414
commit 683cd594ad
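For context on the approach: a linear (bump) allocator serves each allocation by advancing an offset into a preallocated block, individual frees are no-ops, and everything is reclaimed at once by a single reset, which matches the lifetime of per-execution state exactly. The sketch below is only illustrative of the idea; it is not skyline's LinearAllocatorState<> implementation and all names in it are made up.

// Illustrative bump allocator sketch; not skyline's LinearAllocatorState<>, all names here are hypothetical
#include <cstddef>
#include <memory>
#include <new>

class BumpAllocatorSketch {
  public:
    explicit BumpAllocatorSketch(std::size_t capacity) : storage{std::make_unique<std::byte[]>(capacity)}, capacity{capacity} {}

    // Allocation is a pointer bump plus an alignment round-up, with no per-object bookkeeping
    void *Allocate(std::size_t size, std::size_t alignment = alignof(std::max_align_t)) {
        std::size_t aligned{(offset + alignment - 1) & ~(alignment - 1)};
        if (aligned + size > capacity)
            throw std::bad_alloc{}; // A real implementation would chain another block instead of failing
        offset = aligned + size;
        return storage.get() + aligned;
    }

    // Individual frees are no-ops; everything is reclaimed at once, matching a per-execution lifetime
    void Reset() {
        offset = 0;
    }

  private:
    std::unique_ptr<std::byte[]> storage;
    std::size_t capacity;
    std::size_t offset{};
};

In the diff below, one such allocator state lives on the CommandExecutor (LinearAllocatorState<> allocator) and is reset once per execution via allocator.Reset(), so callbacks and bound index/vertex records recorded during that execution are all released in O(1).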
@@ -320,19 +320,15 @@ namespace skyline::gpu {
 
     BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
 
-    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+    void BufferView::RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback) {
+        if (!bufferDelegate->usageCallbacks)
+            bufferDelegate->usageCallbacks = decltype(bufferDelegate->usageCallbacks)::value_type{allocator};
+
         // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further sequenced writes in the current cycle to occur on the GPU
         bufferDelegate->buffer->BlockSequencedCpuBackingWrites();
 
         usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
-        if (!bufferDelegate->usageCallback) {
-            bufferDelegate->usageCallback = usageCallback;
-        } else {
-            bufferDelegate->usageCallback = [usageCallback, oldCallback = std::move(bufferDelegate->usageCallback)](const Buffer::BufferViewStorage &pView, const std::shared_ptr<Buffer> &buffer) {
-                oldCallback(pView, buffer);
-                usageCallback(pView, buffer);
-            };
-        }
+        bufferDelegate->usageCallbacks->emplace_back(std::move(usageCallback));
     }
 
     void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
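Previously, each call to RegisterUsage after the first wrapped the existing std::function in a new composing lambda, so every additional registration re-built a std::function on the heap; the new code simply appends the callback to a vector whose element storage comes from the per-execution linear allocator. The vector is held in a std::optional and constructed lazily because the allocator belongs to the executor and is not available when the delegate itself is created. Below is a minimal, hypothetical sketch of that storage pattern (ArenaSketch, ArenaAllocator and UsageCallbackSketch are illustrative names, not skyline's LinearAllocator<T>):

// Sketch of storing callbacks in arena-backed std::vector storage; all names here are hypothetical
#include <cstddef>
#include <functional>
#include <optional>
#include <vector>

struct ArenaSketch {
    std::byte storage[1 << 16];
    std::size_t offset{};

    // Bump allocation; bounds checking omitted for brevity
    void *Allocate(std::size_t size, std::size_t alignment) {
        offset = (offset + alignment - 1) & ~(alignment - 1);
        void *result{storage + offset};
        offset += size;
        return result;
    }

    void Reset() { offset = 0; } // Reclaims every allocation at once
};

template<typename T>
struct ArenaAllocator {
    using value_type = T;
    ArenaSketch *arena;

    ArenaAllocator(ArenaSketch &arena) : arena{&arena} {}
    template<typename U> ArenaAllocator(const ArenaAllocator<U> &other) : arena{other.arena} {}

    T *allocate(std::size_t count) { return static_cast<T *>(arena->Allocate(count * sizeof(T), alignof(T))); }
    void deallocate(T *, std::size_t) {} // No-op; memory is reclaimed by ArenaSketch::Reset()

    friend bool operator==(const ArenaAllocator &lhs, const ArenaAllocator &rhs) { return lhs.arena == rhs.arena; }
    friend bool operator!=(const ArenaAllocator &lhs, const ArenaAllocator &rhs) { return lhs.arena != rhs.arena; }
};

// Simplified callback signature; the real UsageCallback also receives the view and buffer
using UsageCallbackSketch = std::function<void()>;
using CallbackList = std::vector<UsageCallbackSketch, ArenaAllocator<UsageCallbackSketch>>;

int main() {
    ArenaSketch arena;
    std::optional<CallbackList> callbacks;

    // Mirrors RegisterUsage: construct the vector lazily with the executor's allocator, then append;
    // the vector's element storage comes from the arena (std::function may still heap-allocate
    // internally for captures that don't fit its small-buffer storage)
    if (!callbacks)
        callbacks.emplace(ArenaAllocator<UsageCallbackSketch>{arena});
    callbacks->emplace_back([] { /* e.g. write a descriptor or bound buffer handle */ });

    // Replaying the list in order is equivalent to the removed chained-std::function composition
    for (auto &callback : *callbacks)
        callback();
}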
@@ -6,6 +6,7 @@
 #include <unordered_set>
 #include <boost/functional/hash.hpp>
 #include <common/lockable_shared_ptr.h>
+#include <common/linear_allocator.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
 #include "megabuffer.h"
@@ -101,7 +102,8 @@ namespace skyline::gpu {
             LockableSharedPtr<Buffer> buffer;
             const Buffer::BufferViewStorage *view;
             bool attached{};
-            std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)> usageCallback;
+            using UsageCallback = std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)>;
+            std::optional<std::vector<UsageCallback, LinearAllocator<UsageCallback>>> usageCallbacks;
             std::list<BufferDelegate *>::iterator iterator;
 
             BufferDelegate(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
@@ -405,7 +407,7 @@ namespace skyline::gpu {
         * @note The callback will be automatically called the first time after registration
         * @note The view **must** be locked prior to calling this
         */
-        void RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
+        void RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback);
 
        /**
         * @brief Reads data at the specified offset in the view
@@ -152,8 +152,9 @@ namespace skyline::gpu {
            // Transfer all delegates references from the overlapping buffer to the new buffer
            for (auto &delegate : srcBuffer->delegates) {
                delegate->buffer = *newBuffer;
-               if (delegate->usageCallback)
-                   delegate->usageCallback(*delegate->view, *newBuffer);
+               if (delegate->usageCallbacks)
+                   for (auto &callback : *delegate->usageCallbacks)
+                       callback(*delegate->view, *newBuffer);
            }
 
            newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates);
@@ -324,7 +324,7 @@ namespace skyline::gpu::interconnect {
        textureManagerLock.reset();
 
        for (const auto &delegate : attachedBufferDelegates) {
-           delegate->usageCallback = nullptr;
+           delegate->usageCallbacks.reset();
            delegate->attached = false;
            delegate->view->megaBufferAllocation = {};
        }
@@ -333,6 +333,7 @@
        attachedBuffers.clear();
        bufferManagerLock.reset();
        megaBufferAllocatorLock.reset();
+       allocator.Reset();
    }
 
    void CommandExecutor::Submit() {
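A consequence of arena-backing these objects, visible in the two hunks above, is that teardown order matters: anything whose storage came from the linear allocator must be destroyed during the executor's per-execution cleanup before allocator.Reset() runs, since the next execution will reuse that memory. Condensed from the diff above (surrounding cleanup omitted):

for (const auto &delegate : attachedBufferDelegates)
    delegate->usageCallbacks.reset(); // Drop arena-backed callback vectors while their storage is still valid

// ... other per-execution state is cleared here ...
allocator.Reset(); // Only then is the arena recycled for the next execution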
@@ -5,6 +5,7 @@
 
 #include <boost/container/stable_vector.hpp>
 #include <unordered_set>
+#include <common/linear_allocator.h>
 #include <gpu/megabuffer.h>
 #include "command_nodes.h"
 
@@ -98,6 +99,7 @@ namespace skyline::gpu::interconnect {
 
      public:
        std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
+       LinearAllocatorState<> allocator;
        ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
 
        CommandExecutor(const DeviceState &state);
@@ -1122,7 +1122,7 @@ namespace skyline::gpu::interconnect {
                    .range = view->view->size
                };
            } else {
-               view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+               view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                    *descriptor = vk::DescriptorBufferInfo{
                        .buffer = buffer->GetBacking(),
                        .offset = view.offset,
@@ -1157,7 +1157,7 @@
            if (storageBuffer.is_written)
                view->buffer->MarkGpuDirty();
 
-           view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+           view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                *descriptor = vk::DescriptorBufferInfo{
                    .buffer = buffer->GetBacking(),
                    .offset = view.offset,
@@ -2829,14 +2829,14 @@ namespace skyline::gpu::interconnect {
            auto indexBufferView{GetIndexBuffer(count)};
            executor.AttachBuffer(indexBufferView);
 
-           boundIndexBuffer = std::make_shared<BoundIndexBuffer>();
+           boundIndexBuffer = std::allocate_shared<BoundIndexBuffer, LinearAllocator<BoundIndexBuffer>>(executor.allocator);
            boundIndexBuffer->type = indexBuffer.type;
            if (auto megaBufferAllocation{indexBufferView.AcquireMegaBuffer(executor.cycle, executor.AcquireMegaBufferAllocator())}) {
                // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
                boundIndexBuffer->handle = megaBufferAllocation.buffer;
                boundIndexBuffer->offset = megaBufferAllocation.offset;
            } else {
-               indexBufferView.RegisterUsage(executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+               indexBufferView.RegisterUsage(executor.allocator, executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                    boundIndexBuffer->handle = buffer->GetBacking();
                    boundIndexBuffer->offset = view.offset;
                });
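The std::make_shared to std::allocate_shared switch above (and the identical change for the vertex-buffer state in the next hunk) places both the shared_ptr control block and the bound-state object in linear-allocator memory, so these per-draw records no longer touch the general heap. This is only safe because the shared_ptrs do not outlive the execution whose arena they were allocated from. A short sketch of the pattern, reusing the hypothetical ArenaSketch/ArenaAllocator types from the earlier sketch (BoundIndexBufferSketch is likewise a made-up stand-in):

#include <cstdint>
#include <memory>

struct BoundIndexBufferSketch {
    std::uint64_t handle{};
    std::size_t offset{};
};

// Equivalent in spirit to the diff's std::allocate_shared<BoundIndexBuffer, LinearAllocator<BoundIndexBuffer>>(executor.allocator):
// the control block and the object share one arena allocation instead of a malloc each
ArenaSketch arena;
auto bound{std::allocate_shared<BoundIndexBufferSketch>(ArenaAllocator<BoundIndexBufferSketch>{arena})};
// The shared_ptr must be dropped before the arena is reset, as the executor does at the end of each execution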
@@ -2858,7 +2858,7 @@
                std::array<vk::Buffer, maxwell3d::VertexBufferCount> handles{};
                std::array<vk::DeviceSize, maxwell3d::VertexBufferCount> offsets{};
            };
-           auto boundVertexBuffers{std::make_shared<BoundVertexBuffers>()};
+           auto boundVertexBuffers{std::allocate_shared<BoundVertexBuffers, LinearAllocator<BoundVertexBuffers>>(executor.allocator)};
 
            boost::container::static_vector<vk::VertexInputBindingDescription, maxwell3d::VertexBufferCount> vertexBindingDescriptions{};
            boost::container::static_vector<vk::VertexInputBindingDivisorDescriptionEXT, maxwell3d::VertexBufferCount> vertexBindingDivisorsDescriptions{};
@@ -2877,7 +2877,7 @@
                boundVertexBuffers->handles[index] = megaBufferAllocation.buffer;
                boundVertexBuffers->offsets[index] = megaBufferAllocation.offset;
            } else {
-               vertexBufferView.RegisterUsage(executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+               vertexBufferView.RegisterUsage(executor.allocator, executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                    *handle = buffer->GetBacking();
                    *offset = view.offset;
                });