Implement overhead-free sequenced buffer updates with megabuffers

Previously, constant buffer updates would be handled on the CPU and only the end result would be synced to the GPU before execute. This caused issues since, if the constant buffer contents were changed between each draw in a renderpass (e.g. text rendering), the draws themselves would only see the final resulting constant buffer.

We had earlier tried to fix this by using vkCmdUpdateBuffer, however this caused significant performance loss due to an oversight in Adreno drivers. We could have worked around this simply by using vkCmdCopyBuffer, however there would still be a performance loss due to renderpasses being split up with copies in between.

To avoid this we introduce 'megabuffers', a brand new technique not done before in any other Switch emulator. Rather than replaying the copies in sequence on the GPU, we take advantage of the fact that buffers are generally small in order to replay their contents on the GPU instead. Each write and subsequent usage of a buffer will cause a copy of the buffer, with that write and all prior writes applied, to be pushed into the megabuffer; this way, at the start of execute, the megabuffer will hold all used states of the buffer simultaneously. Draws then reference these individual states in sequence to allow everything to work without any copies. In order to support this, buffers have been moved to an immediate sync model, with synchronisation being done at usage-time rather than at execute (in order to keep contents properly sequenced), and GPU-side writes now need to be explicitly marked (since they prevent megabuffering). It should also be noted that a fallback path using vkCmdCopyBuffer exists for the cases where buffers are too large or GPU dirty.
This commit is contained in:
Billy Laws 2022-04-23 18:10:39 +01:00
parent 0d9992cb8e
commit de796cd2cd
7 changed files with 363 additions and 59 deletions

View File

@ -8,6 +8,11 @@
#include "buffer.h" #include "buffer.h"
namespace skyline::gpu { namespace skyline::gpu {
void Buffer::TryEnableMegaBuffering() {
megaBufferOffset = 0;
megaBufferingEnabled = backing.size() < MegaBufferingDisableThreshold;
}
void Buffer::SetupGuestMappings() { void Buffer::SetupGuestMappings() {
u8 *alignedData{util::AlignDown(guest->data(), PAGE_SIZE)}; u8 *alignedData{util::AlignDown(guest->data(), PAGE_SIZE)};
size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), PAGE_SIZE) - alignedData)}; size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), PAGE_SIZE) - alignedData)};
@ -28,10 +33,64 @@ namespace skyline::gpu {
} }
Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) { Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
TryEnableMegaBuffering();
SetupGuestMappings(); SetupGuestMappings();
} }
Buffer::Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
std::scoped_lock bufLock{*this};
TryEnableMegaBuffering();
SetupGuestMappings();
// Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps
SynchronizeHost(false);
// Copies between two buffers based off of their mappings in guest memory
auto copyBuffer{[](auto dstGuest, auto srcGuest, auto dstPtr, auto srcPtr) {
if (dstGuest.begin().base() <= srcGuest.begin().base()) {
size_t dstOffset{static_cast<size_t>(srcGuest.begin().base() - dstGuest.begin().base())};
size_t copySize{std::min(dstGuest.size() - dstOffset, srcGuest.size())};
std::memcpy(dstPtr + dstOffset, srcPtr, copySize);
} else if (dstGuest.begin().base() > srcGuest.begin().base()) {
size_t srcOffset{static_cast<size_t>(dstGuest.begin().base() - srcGuest.begin().base())};
size_t copySize{std::min(dstGuest.size(), srcGuest.size() - srcOffset)};
std::memcpy(dstPtr, srcPtr + srcOffset, copySize);
}
}};
// Transfer data/state from source buffers
for (const auto &srcBuffer : srcBuffers) {
std::scoped_lock lock{*srcBuffer};
if (srcBuffer->guest) {
if (!srcBuffer->megaBufferingEnabled)
megaBufferingEnabled = false;
if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
// If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents
// Only sync back the buffer if it's not attached to the current fence cycle, otherwise propagate the GPU dirtiness
if (!srcBuffer->cycle.owner_before(pCycle)) {
// Perform a GPU -> CPU sync on the source then do a CPU -> GPU sync for the region occupied by the source
// This is required since if we were created from two buffers: one GPU dirty in the current cycle, and one GPU dirty in the previous cycle, then marking ourselves as CPU dirty here would cause the GPU dirtiness from the current cycle's buffer to be ignored and its writes to be missed
srcBuffer->SynchronizeGuest(true);
copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->mirror.data());
} else {
MarkGpuDirty();
}
} else if (srcBuffer->dirtyState == Buffer::DirtyState::Clean) {
// For clean buffers we can just copy over the GPU backing data directly
// This is necessary since clean buffers may not have matching GPU/CPU data in the case of non-megabuffered inline updates
copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->backing.data());
}
// CPU dirty buffers are already synchronized in the initial SynchronizeHost call so don't need special handling
}
}
}
Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) { Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
TryEnableMegaBuffering();
dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
} }
@ -47,6 +106,8 @@ namespace skyline::gpu {
void Buffer::MarkGpuDirty() { void Buffer::MarkGpuDirty() {
if (dirtyState == DirtyState::GpuDirty || !guest) if (dirtyState == DirtyState::GpuDirty || !guest)
return; return;
megaBufferingEnabled = false; // We can no longer megabuffer this buffer after it has been written by the GPU
gpu.state.nce->RetrapRegions(*trapHandle, false); gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty; dirtyState = DirtyState::GpuDirty;
} }
@ -61,6 +122,15 @@ namespace skyline::gpu {
} }
} }
bool Buffer::PollFence() {
auto lCycle{cycle.lock()};
if (lCycle && lCycle->Poll()) {
cycle.reset();
return true;
}
return false;
}
void Buffer::SynchronizeHost(bool rwTrap) { void Buffer::SynchronizeHost(bool rwTrap) {
if (dirtyState != DirtyState::CpuDirty || !guest) if (dirtyState != DirtyState::CpuDirty || !guest)
return; // If the buffer has not been modified on the CPU or there's no guest buffer, there is no need to synchronize it return; // If the buffer has not been modified on the CPU or there's no guest buffer, there is no need to synchronize it
@ -69,9 +139,13 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Buffer::SynchronizeHost"); TRACE_EVENT("gpu", "Buffer::SynchronizeHost");
// If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date
InvalidateMegaBuffer();
std::memcpy(backing.data(), mirror.data(), mirror.size()); std::memcpy(backing.data(), mirror.data(), mirror.size());
if (rwTrap) { if (rwTrap) {
megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
gpu.state.nce->RetrapRegions(*trapHandle, false); gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty; dirtyState = DirtyState::GpuDirty;
} else { } else {
@ -89,9 +163,13 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle"); TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");
// If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date so force a recreation
InvalidateMegaBuffer();
std::memcpy(backing.data(), mirror.data(), mirror.size()); std::memcpy(backing.data(), mirror.data(), mirror.size());
if (rwTrap) { if (rwTrap) {
megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
gpu.state.nce->RetrapRegions(*trapHandle, false); gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty; dirtyState = DirtyState::GpuDirty;
} else { } else {
@ -100,11 +178,13 @@ namespace skyline::gpu {
} }
} }
void Buffer::SynchronizeGuest(bool skipTrap, bool skipFence) { void Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
if (dirtyState != DirtyState::GpuDirty || !guest) if (dirtyState != DirtyState::GpuDirty || !guest)
return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it
if (!skipFence) if (nonBlocking && !PollFence())
return;
else if (!nonBlocking)
WaitOnFence(); WaitOnFence();
TRACE_EVENT("gpu", "Buffer::SynchronizeGuest"); TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");
@ -113,7 +193,9 @@ namespace skyline::gpu {
if (!skipTrap) if (!skipTrap)
gpu.state.nce->RetrapRegions(*trapHandle, true); gpu.state.nce->RetrapRegions(*trapHandle, true);
dirtyState = DirtyState::Clean; dirtyState = DirtyState::Clean;
TryEnableMegaBuffering(); // If megaBuffering was disabled due to potential GPU dirtiness we can safely try to re-enable it now that the buffer is clean
} }
/** /**
@ -138,18 +220,45 @@ namespace skyline::gpu {
cycle = pCycle; cycle = pCycle;
} }
void Buffer::Read(span<u8> data, vk::DeviceSize offset) { void Buffer::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean) if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean) {
std::memcpy(data.data(), mirror.data() + offset, data.size()); std::memcpy(data.data(), mirror.data() + offset, data.size());
else if (dirtyState == DirtyState::GpuDirty) } else if (dirtyState == DirtyState::GpuDirty) {
// If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
if (!cycle.owner_before(pCycle))
flushHostCallback();
SynchronizeGuest();
std::memcpy(data.data(), backing.data() + offset, data.size()); std::memcpy(data.data(), backing.data() + offset, data.size());
} }
}
void Buffer::Write(span<u8> data, vk::DeviceSize offset) { void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean) InvalidateMegaBuffer(); // Since we're writing to the backing buffer the megabuffer contents will require refresh
std::memcpy(mirror.data() + offset, data.data(), data.size());
if (dirtyState == DirtyState::GpuDirty || dirtyState == DirtyState::Clean) if (dirtyState == DirtyState::CpuDirty) {
SynchronizeHostWithCycle(pCycle); // Perform a CPU -> GPU sync to ensure correct ordering of writes
} else if (dirtyState == DirtyState::GpuDirty) {
// If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that writes are correctly ordered
if (!cycle.owner_before(pCycle))
flushHostCallback();
SynchronizeGuest();
}
if (dirtyState != DirtyState::Clean)
Logger::Error("Attempting to write to a dirty buffer"); // This should never happen since we do syncs in both directions above
std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
if (megaBufferingEnabled) {
// If megabuffering is enabled then we don't need to do any special sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
std::memcpy(backing.data() + offset, data.data(), data.size()); std::memcpy(backing.data() + offset, data.data(), data.size());
} else {
// Fallback to a GPU-side inline update for the buffer contents to ensure correct sequencing with draws
gpuCopyCallback();
}
} }
Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {} Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {}
@ -207,6 +316,25 @@ namespace skyline::gpu {
return BufferView{shared_from_this(), &views.back()}; return BufferView{shared_from_this(), &views.back()};
} }
vk::DeviceSize Buffer::AcquireMegaBuffer() {
SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync
if (!megaBufferingEnabled)
return 0; // Bail out if megabuffering is disabled for this buffer
SynchronizeHost(); // Since pushes to the megabuffer use the GPU backing contents ensure they're up-to-date by performing a CPU -> GPU sync
if (megaBufferOffset)
return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset
megaBufferOffset = gpu.buffer.megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
return megaBufferOffset;
}
void Buffer::InvalidateMegaBuffer() {
megaBufferOffset = 0;
}
BufferView::BufferView(std::shared_ptr<Buffer> buffer, Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {} BufferView::BufferView(std::shared_ptr<Buffer> buffer, Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
void BufferView::AttachCycle(const std::shared_ptr<FenceCycle> &cycle) { void BufferView::AttachCycle(const std::shared_ptr<FenceCycle> &cycle) {
@ -230,11 +358,21 @@ namespace skyline::gpu {
} }
} }
void BufferView::Read(span<u8> data, vk::DeviceSize offset) const { void BufferView::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
bufferDelegate->buffer->Read(data, offset + bufferDelegate->view->offset); bufferDelegate->buffer->Read(pCycle, flushHostCallback, data, offset + bufferDelegate->view->offset);
} }
void BufferView::Write(span<u8> data, vk::DeviceSize offset) const { void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
bufferDelegate->buffer->Write(data, offset + bufferDelegate->view->offset); bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
}
vk::DeviceSize BufferView::AcquireMegaBuffer() const {
vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer()};
// Propagate 0 results since they signify that megabuffering isn't supported for a buffer
if (bufferOffset)
return bufferOffset + bufferDelegate->view->offset;
else
return 0;
} }
} }

View File

@ -32,6 +32,16 @@ namespace skyline::gpu {
GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
} dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer
constexpr static vk::DeviceSize MegaBufferingDisableThreshold{0x10'000}; //!< The threshold at which the buffer is considered to be too large to be megabuffered (64KiB)
bool megaBufferingEnabled{}; //!< If megabuffering can be used for this buffer at the current moment, is set based on MegaBufferingDisableThreshold and dirty state
vk::DeviceSize megaBufferOffset{}; //!< The offset into the megabuffer where the current buffer contents are stored, 0 if there is no up-to-date megabuffer entry for the current buffer contents
/**
* @brief Resets megabuffering state based off of the buffer size
*/
void TryEnableMegaBuffering();
public: public:
/** /**
* @brief Storage for all metadata about a specific view into the buffer, used to prevent redundant view creation and duplication of VkBufferView(s) * @brief Storage for all metadata about a specific view into the buffer, used to prevent redundant view creation and duplication of VkBufferView(s)
@ -99,6 +109,13 @@ namespace skyline::gpu {
Buffer(GPU &gpu, GuestBuffer guest); Buffer(GPU &gpu, GuestBuffer guest);
/**
* @brief Creates a Buffer that is pre-synchronised with the contents of the input buffers
* @param pCycle The FenceCycle associated with the current workload, utilised for synchronising GPU dirty buffers
* @param srcBuffers Span of overlapping source buffers
*/
Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers);
/** /**
* @brief Creates a host-only Buffer which isn't backed by any guest buffer * @brief Creates a host-only Buffer which isn't backed by any guest buffer
* @note The created buffer won't have a mirror so any operations cannot depend on a mirror existing * @note The created buffer won't have a mirror so any operations cannot depend on a mirror existing
@ -144,6 +161,13 @@ namespace skyline::gpu {
*/ */
void WaitOnFence(); void WaitOnFence();
/**
* @brief Polls a fence cycle if it exists and resets it if signalled
* @return Whether the fence cycle was signalled
* @note The buffer **must** be locked prior to calling this
*/
bool PollFence();
/** /**
* @brief Synchronizes the host buffer with the guest * @brief Synchronizes the host buffer with the guest
* @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after * @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
@ -162,10 +186,10 @@ namespace skyline::gpu {
/** /**
* @brief Synchronizes the guest buffer with the host buffer * @brief Synchronizes the guest buffer with the host buffer
* @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty * @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty
* @param skipFence If true, waiting on the currently attached fence will be skipped * @param nonBlocking If true, the call will return immediately if the fence is not signalled, skipping the sync
* @note The buffer **must** be locked prior to calling this * @note The buffer **must** be locked prior to calling this
*/ */
void SynchronizeGuest(bool skipTrap = false, bool skipFence = false); void SynchronizeGuest(bool skipTrap = false, bool nonBlocking = false);
/** /**
* @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled * @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled
@ -176,19 +200,40 @@ namespace skyline::gpu {
/** /**
* @brief Reads data at the specified offset in the buffer * @brief Reads data at the specified offset in the buffer
* @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
*/ */
void Read(span<u8> data, vk::DeviceSize offset); void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);
/** /**
* @brief Writes data at the specified offset in the buffer * @brief Writes data at the specified offset in the buffer
* @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
* @param gpuCopyCallback Callback to perform a GPU-side copy for this Write
*/ */
void Write(span<u8> data, vk::DeviceSize offset); void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset);
/** /**
* @return A cached or newly created view into this buffer with the supplied attributes * @return A cached or newly created view into this buffer with the supplied attributes
* @note The buffer **must** be locked prior to calling this * @note The buffer **must** be locked prior to calling this
*/ */
BufferView GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format = {}); BufferView GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format = {});
/**
* @brief Pushes the current buffer contents into the megabuffer (if necessary)
* @return The offset of the pushed buffer contents in the megabuffer
* @note The buffer **must** be locked prior to calling this
* @note This will only push into the megabuffer when there have been modifications after the previous acquire, otherwise the previous offset will be reused
* @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty in the hope that megabuffering can be reenabled
*/
vk::DeviceSize AcquireMegaBuffer();
/**
* @brief Forces the buffer contents to be pushed into the megabuffer on the next AcquireMegaBuffer call
* @note The buffer **must** be locked prior to calling this
* @note This **must** be called after any modifications of the backing buffer data
*/
void InvalidateMegaBuffer();
}; };
/** /**
@ -254,13 +299,23 @@ namespace skyline::gpu {
/** /**
* @brief Reads data at the specified offset in the view * @brief Reads data at the specified offset in the view
* @note The view **must** be locked prior to calling this * @note The view **must** be locked prior to calling this
* @note See Buffer::Read
*/ */
void Read(span<u8> data, vk::DeviceSize offset) const; void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const;
/** /**
* @brief Writes data at the specified offset in the view * @brief Writes data at the specified offset in the view
* @note The view **must** be locked prior to calling this * @note The view **must** be locked prior to calling this
* @note See Buffer::Write
*/ */
void Write(span<u8> data, vk::DeviceSize offset) const; void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const;
/**
* @brief Pushes the current buffer contents into the megabuffer (if necessary)
* @return The offset of the pushed buffer contents in the megabuffer
* @note The view **must** be locked prior to calling this
* @note See Buffer::AcquireMegaBuffer
*/
vk::DeviceSize AcquireMegaBuffer() const;
}; };
} }

View File

@ -6,7 +6,39 @@
#include "buffer_manager.h" #include "buffer_manager.h"
namespace skyline::gpu { namespace skyline::gpu {
BufferManager::BufferManager(GPU &gpu) : gpu(gpu) {} MegaBuffer::MegaBuffer(GPU &gpu) : backing(gpu.memory.AllocateBuffer(Size)), freeRegion(backing.subspan(PAGE_SIZE)) {}
void MegaBuffer::Reset() {
std::scoped_lock lock{mutex};
freeRegion = backing.subspan(PAGE_SIZE);
}
vk::Buffer MegaBuffer::GetBacking() const {
return backing.vkBuffer;
}
vk::DeviceSize MegaBuffer::Push(span<u8> data, bool pageAlign) {
std::scoped_lock lock{mutex};
if (data.size() > freeRegion.size())
throw exception("Ran out of megabuffer space! Alloc size: 0x{:X}", data.size());
if (pageAlign) {
// If page aligned data was requested then align the start of the free region up to the next page boundary
auto alignedFreeBase{util::AlignUp(static_cast<size_t>(freeRegion.data() - backing.data()), PAGE_SIZE)};
freeRegion = backing.subspan(alignedFreeBase);
}
// Allocate space for data from the free region
auto resultSpan{freeRegion.subspan(0, data.size())};
resultSpan.copy_from(data);
// Move the free region along
freeRegion = freeRegion.subspan(data.size());
return static_cast<vk::DeviceSize>(resultSpan.data() - backing.data());
}
BufferManager::BufferManager(GPU &gpu) : gpu(gpu), megaBuffer(gpu) {}
bool BufferManager::BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer) { bool BufferManager::BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer) {
return it->guest->begin().base() < pointer; return it->guest->begin().base() < pointer;
@ -49,14 +81,10 @@ namespace skyline::gpu {
highestAddress = mapping.end().base(); highestAddress = mapping.end().base();
} }
auto newBuffer{std::make_shared<Buffer>(gpu, span<u8>(lowestAddress, highestAddress))}; auto newBuffer{std::make_shared<Buffer>(gpu, cycle, span<u8>(lowestAddress, highestAddress), overlaps)};
for (auto &overlap : overlaps) { for (auto &overlap : overlaps) {
std::scoped_lock overlapLock{*overlap}; std::scoped_lock overlapLock{*overlap};
if (!overlap->cycle.owner_before(cycle))
overlap->WaitOnFence(); // We want to only wait on the fence cycle if it's not the current fence cycle
overlap->SynchronizeGuest(true, true); // Sync back the buffer before we destroy it
buffers.erase(std::find(buffers.begin(), buffers.end(), overlap)); buffers.erase(std::find(buffers.begin(), buffers.end(), overlap));
// Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset // Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset

View File

@ -6,6 +6,37 @@
#include "buffer.h" #include "buffer.h"
namespace skyline::gpu { namespace skyline::gpu {
/**
* @brief A simple linearly allocated GPU-side buffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
*/
class MegaBuffer {
private:
constexpr static vk::DeviceSize Size{0x6'400'000}; //!< Size in bytes of the megabuffer (100MiB)
memory::Buffer backing; //!< The backing GPU buffer
std::mutex mutex; //!< Synchronizes access to freeRegion
span<u8> freeRegion; //!< Span of unallocated space in the megabuffer
public:
MegaBuffer(GPU &gpu);
/**
* @brief Resets the free region of the megabuffer to its initial state, data is left intact but may be overwritten
*/
void Reset();
/**
* @brief Returns the underlying Vulkan buffer for the megabuffer
*/
vk::Buffer GetBacking() const;
/**
* @brief Pushes data to the megabuffer and returns the offset at which it was written
* @param pageAlign Whether the pushed data should be page aligned in the megabuffer
*/
vk::DeviceSize Push(span<u8> data, bool pageAlign = false);
};
/** /**
* @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffer from equivalent guest buffer alongside reconciliation of any overlaps with existing textures * @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffer from equivalent guest buffer alongside reconciliation of any overlaps with existing textures
*/ */
@ -21,6 +52,8 @@ namespace skyline::gpu {
static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer); static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer);
public: public:
MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
BufferManager(GPU &gpu); BufferManager(GPU &gpu);
/** /**

View File

@ -30,18 +30,20 @@ namespace skyline::gpu::interconnect {
void CommandExecutor::AttachTexture(TextureView *view) { void CommandExecutor::AttachTexture(TextureView *view) {
auto texture{view->texture.get()}; auto texture{view->texture.get()};
if (!syncTextures.contains(texture)) { if (!attachedTextures.contains(texture)) {
texture->WaitOnFence(); texture->WaitOnFence();
texture->cycle = cycle; texture->cycle = cycle;
syncTextures.emplace(texture); attachedTextures.emplace(texture);
} }
cycle->AttachObject(view->shared_from_this()); cycle->AttachObject(view->shared_from_this());
} }
void CommandExecutor::AttachBuffer(BufferView &view) { void CommandExecutor::AttachBuffer(BufferView &view) {
if (!syncBuffers.contains(view.bufferDelegate)) { view->buffer->SynchronizeHost();
if (!attachedBuffers.contains(view.bufferDelegate)) {
view.AttachCycle(cycle); view.AttachCycle(cycle);
syncBuffers.emplace(view.bufferDelegate); attachedBuffers.emplace(view.bufferDelegate);
} }
} }
@ -142,16 +144,13 @@ namespace skyline::gpu::interconnect {
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
}); });
for (auto texture : syncTextures) { for (auto texture : attachedTextures) {
texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true); texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
texture->MarkGpuDirty(); texture->MarkGpuDirty();
} }
for (const auto& delegate : syncBuffers) { for (const auto& delegate : attachedBuffers)
delegate->buffer->SynchronizeHostWithCycle(cycle, true);
delegate->buffer->MarkGpuDirty();
delegate->usageCallback = nullptr; delegate->usageCallback = nullptr;
}
vk::RenderPass lRenderPass; vk::RenderPass lRenderPass;
u32 subpassIndex; u32 subpassIndex;
@ -182,11 +181,16 @@ namespace skyline::gpu::interconnect {
commandBuffer.end(); commandBuffer.end();
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence()); gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
for (const auto& delegate : attachedBuffers)
delegate->buffer->InvalidateMegaBuffer();
nodes.clear(); nodes.clear();
syncTextures.clear(); attachedTextures.clear();
syncBuffers.clear(); attachedBuffers.clear();
cycle = activeCommandBuffer.Reset(); cycle = activeCommandBuffer.Reset();
gpu.buffer.megaBuffer.Reset();
} }
} }
} }

View File

@ -19,10 +19,10 @@ namespace skyline::gpu::interconnect {
boost::container::stable_vector<node::NodeVariant> nodes; boost::container::stable_vector<node::NodeVariant> nodes;
node::RenderPassNode *renderPass{}; node::RenderPassNode *renderPass{};
size_t subpassCount{}; //!< The number of subpasses in the current render pass size_t subpassCount{}; //!< The number of subpasses in the current render pass
std::unordered_set<Texture *> syncTextures; //!< All textures that need to be synced prior to and after execution std::unordered_set<Texture *> attachedTextures; //!< All textures that need to be synced prior to and after execution
using SharedBufferDelegate = std::shared_ptr<Buffer::BufferDelegate>; using SharedBufferDelegate = std::shared_ptr<Buffer::BufferDelegate>;
std::unordered_set<SharedBufferDelegate> syncBuffers; //!< All buffers that need to be synced prior to and after execution std::unordered_set<SharedBufferDelegate> attachedBuffers; //!< All buffers that are attached to the current execution
/** /**
* @return If a new render pass was created by the function or the current one was reused as it was compatible * @return If a new render pass was created by the function or the current one was reused as it was compatible

View File

@ -618,10 +618,13 @@ namespace skyline::gpu::interconnect {
* @note This must only be called when the GuestBuffer is resolved correctly * @note This must only be called when the GuestBuffer is resolved correctly
*/ */
template<typename T> template<typename T>
T Read(size_t offset) const { T Read(CommandExecutor &pExecutor, size_t dstOffset) const {
T object; T object;
std::scoped_lock lock{view}; std::scoped_lock lock{view};
view.Read(span<T>(object).template cast<u8>(), offset); view.Read(pExecutor.cycle, []() {
// TODO: here we should trigger an execute, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
}, span<T>(object).template cast<u8>(), dstOffset);
return object; return object;
} }
@ -630,9 +633,26 @@ namespace skyline::gpu::interconnect {
* @note This must only be called when the GuestBuffer is resolved correctly * @note This must only be called when the GuestBuffer is resolved correctly
*/ */
template<typename T> template<typename T>
void Write(span<T> buf, size_t offset) { void Write(CommandExecutor &pExecutor, MegaBuffer &megaBuffer, span<T> buf, size_t dstOffset) {
auto srcCpuBuf{buf.template cast<u8>()};
std::scoped_lock lock{view}; std::scoped_lock lock{view};
view.Write(buf.template cast<u8>(), offset); view.Write(pExecutor.cycle, []() {
// TODO: see Read()
Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
}, [&megaBuffer, &pExecutor, srcCpuBuf, dstOffset, view = this->view]() {
auto srcGpuOffset{megaBuffer.Push(srcCpuBuf)};
auto srcGpuBuf{megaBuffer.GetBacking()};
pExecutor.AddOutsideRpCommand([=](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
std::scoped_lock lock{view};
vk::BufferCopy copyRegion{
.size = srcCpuBuf.size_bytes(),
.srcOffset = srcGpuOffset,
.dstOffset = view->view->offset + dstOffset
};
commandBuffer.copyBuffer(srcGpuBuf, view->buffer->GetBacking(), copyRegion);
});
}, srcCpuBuf, dstOffset);
} }
}; };
ConstantBuffer constantBufferSelector; //!< The constant buffer selector is used to bind a constant buffer to a stage or update data in it ConstantBuffer constantBufferSelector; //!< The constant buffer selector is used to bind a constant buffer to a stage or update data in it
@ -710,7 +730,7 @@ namespace skyline::gpu::interconnect {
void ConstantBufferUpdate(std::vector<u32> data, u32 offset) { void ConstantBufferUpdate(std::vector<u32> data, u32 offset) {
auto constantBuffer{GetConstantBufferSelector().value()}; auto constantBuffer{GetConstantBufferSelector().value()};
constantBuffer.Write<u32>(data, offset); constantBuffer.Write<u32>(executor, gpu.buffer.megaBuffer, data, offset);
} }
/* Shader Program */ /* Shader Program */
@ -869,7 +889,7 @@ namespace skyline::gpu::interconnect {
}; };
auto &cbuf{constantBuffers[descriptor.cbuf_index]}; auto &cbuf{constantBuffers[descriptor.cbuf_index]};
auto ssbo{cbuf.Read<SsboDescriptor>(descriptor.cbuf_offset)}; auto ssbo{cbuf.Read<SsboDescriptor>(executor, descriptor.cbuf_offset)};
auto mappings{channelCtx.asCtx->gmmu.TranslateRange(ssbo.iova, ssbo.size)}; auto mappings{channelCtx.asCtx->gmmu.TranslateRange(ssbo.iova, ssbo.size)};
if (mappings.size() != 1) if (mappings.size() != 1)
@ -1024,15 +1044,27 @@ namespace skyline::gpu::interconnect {
}); });
auto view{pipelineStage.constantBuffers[constantBuffer.index].view}; auto view{pipelineStage.constantBuffers[constantBuffer.index].view};
std::scoped_lock lock(view); std::scoped_lock lock(view);
view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) { if (auto megaBufferOffset{view.AcquireMegaBuffer()}) {
// If the buffer is megabuffered then we don't get our data from the underlying buffer, but rather from the megabuffer, which stays consistent throughout a single execution, so we can skip registering usage
bufferDescriptors[bufferIndex] = vk::DescriptorBufferInfo{
.buffer = gpu.buffer.megaBuffer.GetBacking(),
.offset = megaBufferOffset,
.range = view->view->size
};
} else {
view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
*descriptor = vk::DescriptorBufferInfo{ *descriptor = vk::DescriptorBufferInfo{
.buffer = buffer->GetBacking(), .buffer = buffer->GetBacking(),
.offset = view.offset, .offset = view.offset,
.range = view.size, .range = view.size,
}; };
}); });
}
executor.AttachBuffer(view); executor.AttachBuffer(view);
bufferIndex++;
} }
} }
@ -1053,7 +1085,9 @@ namespace skyline::gpu::interconnect {
}); });
auto view{GetSsboViewFromDescriptor(storageBuffer, pipelineStage.constantBuffers)}; auto view{GetSsboViewFromDescriptor(storageBuffer, pipelineStage.constantBuffers)};
std::scoped_lock lock{view}; std::scoped_lock lock{view};
view->buffer->MarkGpuDirty(); // SSBOs may be written to by the GPU so mark as dirty (will also disable megabuffering)
view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) { view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
*descriptor = vk::DescriptorBufferInfo{ *descriptor = vk::DescriptorBufferInfo{
.buffer = buffer->GetBacking(), .buffer = buffer->GetBacking(),
@ -1105,7 +1139,7 @@ namespace skyline::gpu::interconnect {
u32 textureIndex : 20; u32 textureIndex : 20;
u32 samplerIndex : 12; u32 samplerIndex : 12;
}; };
} handle{constantBuffer.Read<u32>(texture.cbuf_offset)}; } handle{constantBuffer.Read<u32>(executor, texture.cbuf_offset)};
auto sampler{GetSampler(handle.samplerIndex)}; auto sampler{GetSampler(handle.samplerIndex)};
auto textureView{GetPoolTextureView(handle.textureIndex)}; auto textureView{GetPoolTextureView(handle.textureIndex)};
@ -2634,10 +2668,16 @@ namespace skyline::gpu::interconnect {
std::scoped_lock lock(indexBufferView); std::scoped_lock lock(indexBufferView);
boundIndexBuffer->type = indexBuffer.type; boundIndexBuffer->type = indexBuffer.type;
if (auto megaBufferOffset{indexBufferView.AcquireMegaBuffer()}) {
// If the buffer is megabuffered then we don't get our data from the underlying buffer, but rather from the megabuffer, which stays consistent throughout a single execution, so we can skip registering usage
boundIndexBuffer->handle = gpu.buffer.megaBuffer.GetBacking();
boundIndexBuffer->offset = megaBufferOffset;
} else {
indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) { indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
boundIndexBuffer->handle = buffer->GetBacking(); boundIndexBuffer->handle = buffer->GetBacking();
boundIndexBuffer->offset = view.offset; boundIndexBuffer->offset = view.offset;
}); });
}
executor.AttachBuffer(indexBufferView); executor.AttachBuffer(indexBufferView);
} }
@ -2662,11 +2702,17 @@ namespace skyline::gpu::interconnect {
vertexBindingDivisorsDescriptions.push_back(vertexBuffer->bindingDivisorDescription); vertexBindingDivisorsDescriptions.push_back(vertexBuffer->bindingDivisorDescription);
std::scoped_lock vertexBufferLock(vertexBufferView); std::scoped_lock vertexBufferLock(vertexBufferView);
if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer()}) {
// If the buffer is megabuffered then we don't get our data from the underlying buffer, but rather from the megabuffer, which stays consistent throughout a single execution, so we can skip registering usage
boundVertexBuffers->handles[index] = gpu.buffer.megaBuffer.GetBacking();
boundVertexBuffers->offsets[index] = megaBufferOffset;
} else {
vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) { vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
*handle = buffer->GetBacking(); *handle = buffer->GetBacking();
*offset = view.offset; *offset = view.offset;
}); });
}
executor.AttachBuffer(vertexBufferView); executor.AttachBuffer(vertexBufferView);
} }
} }