Implement access-driven Buffer synchronization

Similar to the constant redundant synchronization of textures, there is a lot of redundant synchronization of buffers. Although buffer synchronization is far cheaper than texture synchronization, it still has associated costs, which have now been reduced by only synchronizing on access.
This commit is contained in:
PixelyIon 2022-03-06 21:07:37 +05:30
parent 7532eaf050
commit 881bb969c4
3 changed files with 79 additions and 28 deletions

View File

@ -45,20 +45,39 @@ namespace skyline::gpu {
alignedMirror = gpu.state.process->memory.CreateMirrors(alignedMappings); alignedMirror = gpu.state.process->memory.CreateMirrors(alignedMappings);
mirror = alignedMirror.subspan(static_cast<size_t>(frontMapping.data() - alignedData), totalSize); mirror = alignedMirror.subspan(static_cast<size_t>(frontMapping.data() - alignedData), totalSize);
} }
trapHandle = gpu.state.nce->TrapRegions(mappings, true, [this] {
std::lock_guard lock(*this);
SynchronizeGuest(true); // We can skip trapping since the caller will do it
WaitOnFence();
}, [this] {
std::lock_guard lock(*this);
SynchronizeGuest(true);
dirtyState = DirtyState::CpuDirty; // We need to assume the buffer is dirty since we don't know what the guest is writing
WaitOnFence();
});
} }
Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), size(guest.BufferSize()), backing(gpu.memory.AllocateBuffer(size)), guest(std::move(guest)) { Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), size(guest.BufferSize()), backing(gpu.memory.AllocateBuffer(size)), guest(std::move(guest)) {
SetupGuestMappings(); SetupGuestMappings();
SynchronizeHost();
} }
Buffer::~Buffer() { Buffer::~Buffer() {
std::lock_guard lock(*this); std::lock_guard lock(*this);
if (trapHandle)
gpu.state.nce->DeleteTrap(*trapHandle);
SynchronizeGuest(true); SynchronizeGuest(true);
if (alignedMirror.valid()) if (alignedMirror.valid())
munmap(alignedMirror.data(), alignedMirror.size()); munmap(alignedMirror.data(), alignedMirror.size());
} }
// Transitions the buffer into the GpuDirty state so that a later SynchronizeGuest copies the backing into the guest mirror
// Does nothing when the buffer is already GpuDirty, which avoids re-issuing the retrap
void Buffer::MarkGpuDirty() {
    if (dirtyState != DirtyState::GpuDirty) {
        // `false` is the same argument SynchronizeHost passes on its rwTrap path
        // NOTE(review): presumably this installs a read/write trap rather than a write-only one — confirm against NCE::RetrapRegions
        gpu.state.nce->RetrapRegions(*trapHandle, false);
        dirtyState = DirtyState::GpuDirty;
    }
}
void Buffer::WaitOnFence() { void Buffer::WaitOnFence() {
TRACE_EVENT("gpu", "Buffer::WaitOnFence"); TRACE_EVENT("gpu", "Buffer::WaitOnFence");
@ -69,44 +88,58 @@ namespace skyline::gpu {
} }
} }
void Buffer::SynchronizeHost() { void Buffer::SynchronizeHost(bool rwTrap) {
if (dirtyState != DirtyState::CpuDirty)
return; // If the buffer has not been modified on the CPU, there is no need to synchronize it
WaitOnFence(); WaitOnFence();
TRACE_EVENT("gpu", "Buffer::SynchronizeHost"); TRACE_EVENT("gpu", "Buffer::SynchronizeHost");
auto host{backing.data()}; std::memcpy(backing.data(), mirror.data(), mirror.size());
for (auto &mapping : guest.mappings) {
auto mappingSize{mapping.size_bytes()}; if (rwTrap) {
std::memcpy(host, mapping.data(), mappingSize); gpu.state.nce->RetrapRegions(*trapHandle, false);
host += mappingSize; dirtyState = DirtyState::GpuDirty;
} else {
gpu.state.nce->RetrapRegions(*trapHandle, true);
dirtyState = DirtyState::Clean;
} }
} }
void Buffer::SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &pCycle) { void Buffer::SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
if (dirtyState != DirtyState::CpuDirty)
return;
if (pCycle != cycle.lock()) if (pCycle != cycle.lock())
WaitOnFence(); WaitOnFence();
TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle"); TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");
auto host{backing.data()}; std::memcpy(backing.data(), mirror.data(), mirror.size());
for (auto &mapping : guest.mappings) {
auto mappingSize{mapping.size_bytes()}; if (rwTrap) {
std::memcpy(host, mapping.data(), mappingSize); gpu.state.nce->RetrapRegions(*trapHandle, false);
host += mappingSize; dirtyState = DirtyState::GpuDirty;
} else {
gpu.state.nce->RetrapRegions(*trapHandle, true);
dirtyState = DirtyState::Clean;
} }
} }
void Buffer::SynchronizeGuest() { void Buffer::SynchronizeGuest(bool skipTrap) {
if (dirtyState != DirtyState::GpuDirty)
return; // If the buffer has not been used on the GPU, there is no need to synchronize it
WaitOnFence(); WaitOnFence();
TRACE_EVENT("gpu", "Buffer::SynchronizeGuest"); TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");
auto host{backing.data()}; std::memcpy(mirror.data(), backing.data(), mirror.size());
for (auto &mapping : guest.mappings) {
auto mappingSize{mapping.size_bytes()}; if (!skipTrap)
std::memcpy(mapping.data(), host, mappingSize); gpu.state.nce->RetrapRegions(*trapHandle, true);
host += mappingSize; dirtyState = DirtyState::Clean;
}
} }
/** /**
@ -132,7 +165,10 @@ namespace skyline::gpu {
} }
void Buffer::Write(span<u8> data, vk::DeviceSize offset) { void Buffer::Write(span<u8> data, vk::DeviceSize offset) {
if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean)
std::memcpy(mirror.data() + offset, data.data(), data.size()); std::memcpy(mirror.data() + offset, data.data(), data.size());
if (dirtyState == DirtyState::GpuDirty || dirtyState == DirtyState::Clean)
std::memcpy(backing.data() + offset, data.data(), data.size());
} }
std::shared_ptr<BufferView> Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize range, vk::Format format) { std::shared_ptr<BufferView> Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize range, vk::Format format) {

View File

@ -3,6 +3,7 @@
#pragma once #pragma once
#include <nce.h>
#include "memory_manager.h" #include "memory_manager.h"
namespace skyline::gpu { namespace skyline::gpu {
@ -36,6 +37,13 @@ namespace skyline::gpu {
span<u8> mirror{}; //!< A contiguous mirror of all the guest mappings to allow linear access on the CPU span<u8> mirror{}; //!< A contiguous mirror of all the guest mappings to allow linear access on the CPU
span<u8> alignedMirror{}; //!< The mirror mapping aligned to page size to reflect the full mapping span<u8> alignedMirror{}; //!< The mirror mapping aligned to page size to reflect the full mapping
std::optional<nce::NCE::TrapHandle> trapHandle{}; //!< The handle of the traps for the guest mappings
// Tracks which side (guest CPU mappings or host GPU backing) holds the most recent data
// - SynchronizeHost/SynchronizeHostWithCycle only copy when the state is CpuDirty
// - SynchronizeGuest only copies when the state is GpuDirty
// - Write updates the mirror when CpuDirty/Clean and the backing when GpuDirty/Clean
// The member is initialized to CpuDirty
// NOTE(review): presumably so that the first host synchronization always performs a copy — confirm
enum class DirtyState {
Clean, //!< The CPU mappings are in sync with the GPU buffer
CpuDirty, //!< The CPU mappings have been modified but the GPU buffer is not up to date
GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
} dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer
std::vector<std::weak_ptr<BufferView>> views; //!< BufferView(s) that are backed by this Buffer, used for repointing to a new Buffer on deletion std::vector<std::weak_ptr<BufferView>> views; //!< BufferView(s) that are backed by this Buffer, used for repointing to a new Buffer on deletion
friend BufferView; friend BufferView;
@ -81,6 +89,13 @@ namespace skyline::gpu {
return mutex.try_lock(); return mutex.try_lock();
} }
/**
* @brief Marks the buffer as dirty on the GPU; it will be synced back to the guest on the next call to SynchronizeGuest
* @note This is a no-op if the buffer is already marked as GPU dirty
* @note This **must** be called after syncing the buffer to the GPU, not before
* @note The buffer **must** be locked prior to calling this
*/
void MarkGpuDirty();
/** /**
* @brief Waits on a fence cycle if it exists till it's signalled and resets it after * @brief Waits on a fence cycle if it exists till it's signalled and resets it after
* @note The buffer **must** be locked prior to calling this * @note The buffer **must** be locked prior to calling this
@ -89,22 +104,25 @@ namespace skyline::gpu {
/** /**
* @brief Synchronizes the host buffer with the guest * @brief Synchronizes the host buffer with the guest
* @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
* @note The buffer **must** be locked prior to calling this * @note The buffer **must** be locked prior to calling this
*/ */
void SynchronizeHost(); void SynchronizeHost(bool rwTrap = false);
/** /**
* @brief Synchronizes the host buffer with the guest * @brief Synchronizes the host buffer with the guest
* @param cycle A FenceCycle that is checked against the held one to skip waiting on it when equal * @param cycle A FenceCycle that is checked against the held one to skip waiting on it when equal
* @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
* @note The buffer **must** be locked prior to calling this * @note The buffer **must** be locked prior to calling this
*/ */
void SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &cycle); void SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &cycle, bool rwTrap = false);
/** /**
* @brief Synchronizes the guest buffer with the host buffer * @brief Synchronizes the guest buffer with the host buffer
* @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty
* @note The buffer **must** be locked prior to calling this * @note The buffer **must** be locked prior to calling this
*/ */
void SynchronizeGuest(); void SynchronizeGuest(bool skipTrap = false);
/** /**
* @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled * @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled

View File

@ -134,7 +134,7 @@ namespace skyline::gpu::interconnect {
texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true); texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
for (auto buffer : syncBuffers) for (auto buffer : syncBuffers)
buffer->SynchronizeHostWithCycle(cycle); buffer->SynchronizeHostWithCycle(cycle, true);
vk::RenderPass lRenderPass; vk::RenderPass lRenderPass;
u32 subpassIndex; u32 subpassIndex;
@ -162,9 +162,6 @@ namespace skyline::gpu::interconnect {
#undef NODE #undef NODE
} }
for (auto buffer : syncBuffers)
buffer->SynchronizeGuestWithCycle(cycle);
commandBuffer.end(); commandBuffer.end();
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence()); gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());