Impl preserve attached buffers/textures to avoid GPFIFO lock thrashing

When profiling SMO it became obvious that the constant locking of textures and buffers in SyncDescriptors took up a large amount of CPU time (3-5%), a precious resource in intensive areas like Metro. This commit implements a workaround to avoid that constant relocking: if a buffer is frequently attached on the GPU and almost never used on the CPU, we can keep its lock held between executions.

It's not quite that simple though: if the guest tries to lock, for the first time, a texture that is already locked as a preserve attachment on the GPFIFO thread, we need to avoid deadlocking. This is achieved through a combination of two things. First, we periodically clear the preserve attachments every 2 * SlotCount submissions; this prevents a complete deadlock on the CPU (turning it into just a long wait) and ensures that the next time the resource is attached on the GPU it will not be marked for preservation, as it has now been locked on the guest before. Second, we always unlock everything when the GPU thread runs out of work, since the periodic clearing will not execute in that case and the resources would otherwise stay locked on the GPFIFO thread forever (e.g. if the guest was waiting on one of those locks before it could submit more work). Note that in the latter scenario we don't clear the preserve attachments, we only unlock them, relocking them once more work is available.
Billy Laws 2022-10-09 13:23:42 +01:00
parent 0e8ccf1e99
commit 98c0cc3e7f
5 changed files with 94 additions and 23 deletions
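The diffs below gate the new behaviour on Buffer::FrequentlyLocked()/Texture::FrequentlyLocked(), which is not itself part of this diff. A minimal sketch of the idea described above, using a hypothetical counter and threshold (the real implementation may differ):

    // Hypothetical sketch only: a resource the guest has locked often enough is
    // "frequently locked" and takes the normal attach path (unlocked after every
    // execution); one the guest rarely touches is preserve attached instead
    static constexpr size_t FrequentLockThreshold{2}; //!< Hypothetical threshold
    size_t accumulatedGuestLockCounter{}; //!< Hypothetical counter, incremented on every guest-side lock

    bool FrequentlyLocked() {
        return accumulatedGuestLockCounter >= FrequentLockThreshold;
    }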

app/src/main/cpp/skyline/gpu/buffer.cpp

@@ -345,7 +345,7 @@ namespace skyline::gpu {
     void Buffer::unlock() {
         tag = ContextTag{};
-        backingImmutability = BackingImmutability::None;
+        AllowAllBackingWrites();
         mutex.unlock();
     }

app/src/main/cpp/skyline/gpu/buffer.h

@@ -43,7 +43,7 @@ namespace skyline::gpu {
     class Buffer : public std::enable_shared_from_this<Buffer> {
       private:
         GPU &gpu;
-        SpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing
+        RecursiveSpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing
         std::atomic<ContextTag> tag{}; //!< The tag associated with the last lock call
         memory::Buffer backing;
         std::optional<GuestBuffer> guest;
@@ -190,6 +190,11 @@ namespace skyline::gpu {
             backingImmutability = BackingImmutability::AllWrites;
         }
 
+        void AllowAllBackingWrites() {
+            std::scoped_lock lock{stateMutex};
+            backingImmutability = BackingImmutability::None;
+        }
+
        /**
         * @return If sequenced writes to the backing must not occur on the CPU
         * @note The buffer **must** be locked prior to calling this
app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp

@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 
+#include <range/v3/view.hpp>
 #include <loader/loader.h>
 #include <gpu.h>
 #include "command_executor.h"
@@ -234,8 +235,12 @@ namespace skyline::gpu::interconnect {
             textureManagerLock.emplace(gpu.texture);
 
         bool didLock{view->LockWithTag(tag)};
-        if (didLock)
-            attachedTextures.emplace_back(view->texture);
+        if (didLock) {
+            if (view->texture->FrequentlyLocked())
+                attachedTextures.emplace_back(view->texture);
+            else
+                preserveAttachedTextures.emplace_back(view->texture);
+        }
 
         return didLock;
     }
@@ -259,8 +264,12 @@ namespace skyline::gpu::interconnect {
             bufferManagerLock.emplace(gpu.buffer);
 
         bool didLock{view.LockWithTag(tag)};
-        if (didLock)
-            attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+        if (didLock) {
+            if (view.GetBuffer()->FrequentlyLocked())
+                attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+            else
+                preserveAttachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+        }
 
         return didLock;
     }
@@ -271,14 +280,20 @@ namespace skyline::gpu::interconnect {
     void CommandExecutor::AttachLockedBufferView(BufferView view, ContextLock<BufferView> &&lock) {
         if (lock.OwnsLock()) {
             // Transfer ownership to executor so that the resource will stay locked for the period it is used on the GPU
-            attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+            if (view.GetBuffer()->FrequentlyLocked())
+                attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+            else
+                preserveAttachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
             lock.Release(); // The executor will handle unlocking the lock so it doesn't need to be handled here
         }
     }
 
     void CommandExecutor::AttachLockedBuffer(std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
         if (lock.OwnsLock()) {
-            attachedBuffers.emplace_back(std::move(buffer));
+            if (buffer->FrequentlyLocked())
+                attachedBuffers.emplace_back(std::move(buffer));
+            else
+                preserveAttachedBuffers.emplace_back(std::move(buffer));
             lock.Release(); // See AttachLockedBufferView(...)
         }
     }
@@ -381,24 +396,25 @@ namespace skyline::gpu::interconnect {
             }, {}, {}
         );
 
-        for (const auto &texture : attachedTextures)
+        boost::container::small_vector<FenceCycle *, 8> chainedCycles;
+        for (const auto &texture : ranges::views::concat(attachedTextures, preserveAttachedTextures)) {
             texture->SynchronizeHostInline(slot->commandBuffer, cycle, true);
+            // We don't need to attach the Texture to the cycle as a TextureView will already be attached
+            if (ranges::find(chainedCycles, texture->cycle.get()) == chainedCycles.end()) {
+                cycle->ChainCycle(texture->cycle);
+                chainedCycles.emplace_back(texture->cycle.get());
+            }
+            texture->cycle = cycle;
+        }
 
-        for (const auto &attachedBuffer : attachedBuffers)
-            if (attachedBuffer->SequencedCpuBackingWritesBlocked())
+        for (const auto &attachedBuffer : ranges::views::concat(attachedBuffers, preserveAttachedBuffers)) {
+            if (attachedBuffer->RequiresCycleAttach()) {
                 attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer
-
-        for (const auto &attachedTexture : attachedTextures) {
-            // We don't need to attach the Texture to the cycle as a TextureView will already be attached
-            cycle->ChainCycle(attachedTexture->cycle);
-            attachedTexture->cycle = cycle;
-        }
-
-        for (const auto &attachedBuffer : attachedBuffers) {
-            if (attachedBuffer->RequiresCycleAttach()) {
                 cycle->AttachObject(attachedBuffer.buffer);
                 attachedBuffer->UpdateCycle(cycle);
+                attachedBuffer->AllowAllBackingWrites();
             }
         }
@@ -412,6 +428,12 @@ namespace skyline::gpu::interconnect {
         bufferManagerLock.reset();
         megaBufferAllocatorLock.reset();
         allocator->Reset();
+
+        // Periodically clear preserve attachments just in case there are new waiters which would otherwise end up waiting forever
+        if ((submissionNumber % (CommandRecordThread::ActiveRecordSlots * 2)) == 0) {
+            preserveAttachedBuffers.clear();
+            preserveAttachedTextures.clear();
+        }
     }
 
     void CommandExecutor::Submit() {
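To make the clearing cadence concrete — a quick check, assuming ActiveRecordSlots keeps the value of 6 it is given in command_executor.h below:

    // With ActiveRecordSlots{6}, preserve attachments are dropped on submissions
    // 12, 24, 36, ..., so a guest thread blocked on a preserve-locked resource
    // waits at most 2 * ActiveRecordSlots submissions before it can proceed
    static_assert((24 % (6 * 2)) == 0 && (18 % (6 * 2)) != 0);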
@@ -423,7 +445,33 @@ namespace skyline::gpu::interconnect {
         if (!slot->nodes.empty()) {
             TRACE_EVENT("gpu", "CommandExecutor::Submit");
             SubmitInternal();
+            submissionNumber++;
         }
 
         ResetInternal();
     }
+
+    void CommandExecutor::LockPreserve() {
+        if (!preserveLocked) {
+            preserveLocked = true;
+
+            for (auto &buffer : preserveAttachedBuffers)
+                buffer->LockWithTag(tag);
+
+            for (auto &texture : preserveAttachedTextures)
+                texture->LockWithTag(tag);
+        }
+    }
+
+    void CommandExecutor::UnlockPreserve() {
+        if (preserveLocked) {
+            for (auto &buffer : preserveAttachedBuffers)
+                buffer->unlock();
+
+            for (auto &texture : preserveAttachedTextures)
+                texture->unlock();
+
+            preserveLocked = false;
+        }
+    }
 }
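Neither LockPreserve() nor UnlockPreserve() is called anywhere in this diff; a hypothetical sketch of how the GPFIFO thread could drive them (GpfifoRunLoop, WorkQueue, and Process are illustrative names, not skyline APIs):

    // Hypothetical driver: keep preserve locks held while work is flowing and
    // release them before sleeping so a waiting guest thread can make progress
    void GpfifoRunLoop(CommandExecutor &executor, WorkQueue &queue) {
        while (true) {
            if (queue.Empty())
                executor.UnlockPreserve(); // Out of work: don't deadlock guest lockers

            auto work{queue.BlockingPop()};
            executor.LockPreserve(); // Relock before attaching resources to the execution
            Process(work, executor);
        }
    }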

app/src/main/cpp/skyline/gpu/interconnect/command_executor.h

@@ -15,6 +15,8 @@ namespace skyline::gpu::interconnect {
      */
    class CommandRecordThread {
      public:
+        static constexpr size_t ActiveRecordSlots{6}; //!< Maximum number of simultaneously active slots
+
        /**
         * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
         */
@@ -40,7 +42,6 @@ namespace skyline::gpu::interconnect {
        const DeviceState &state;
        std::thread thread;
 
-        static constexpr size_t ActiveRecordSlots{6}; //!< Maximum number of simultaneously active slots
        CircularQueue<Slot *> incoming{ActiveRecordSlots}; //!< Slots pending recording
        CircularQueue<Slot *> outgoing{ActiveRecordSlots}; //!< Slots that have been submitted, may still be active on the GPU
@@ -76,6 +77,7 @@ namespace skyline::gpu::interconnect {
        std::optional<std::scoped_lock<TextureManager>> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission
        std::optional<std::scoped_lock<BufferManager>> bufferManagerLock; //!< The lock on the buffer manager, see above for details
        std::optional<std::scoped_lock<MegaBufferAllocator>> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
+        bool preserveLocked{}; //!< If the preserve attached resources are currently locked by the executor
 
        /**
         * @brief A wrapper of a Texture object that has been locked beforehand and must be unlocked afterwards
@@ -94,6 +96,7 @@ namespace skyline::gpu::interconnect {
            ~LockedTexture();
        };
 
+        std::vector<LockedTexture> preserveAttachedTextures; //!< Textures that stay locked across executions since the guest rarely locks them
        std::vector<LockedTexture> attachedTextures; //!< All textures that are attached to the current execution
 
        /**
@@ -113,6 +116,7 @@ namespace skyline::gpu::interconnect {
            ~LockedBuffer();
        };
 
+        std::vector<LockedBuffer> preserveAttachedBuffers; //!< Buffers that stay locked across executions since the guest rarely locks them
        std::vector<LockedBuffer> attachedBuffers; //!< All buffers that are attached to the current execution
@@ -154,6 +158,7 @@ namespace skyline::gpu::interconnect {
        std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
        LinearAllocatorState<> *allocator;
        ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
+        size_t submissionNumber{}; //!< Counts submissions, used to periodically clear preserve attachments
        u32 executionNumber{};
 
        CommandExecutor(const DeviceState &state);
@@ -258,5 +263,17 @@ namespace skyline::gpu::interconnect {
         * @brief Execute all the nodes and submit the resulting command buffer to the GPU
         */
        void Submit();
+
+        /**
+         * @brief Locks all preserve attached buffers/textures
+         * @note This **MUST** be called before attaching any buffers/textures to an execution
+         */
+        void LockPreserve();
+
+        /**
+         * @brief Unlocks all preserve attached buffers/textures
+         * @note This **MUST** be called when there is no GPU work left to be done, to avoid deadlocks where the guest is waiting to lock a buffer/texture but the GPFIFO thread has no work and would thus never periodically unlock it
+         */
+        void UnlockPreserve();
    };
 }

app/src/main/cpp/skyline/gpu/texture/texture.h

@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <common/spin_lock.h>
 #include <common/lockable_shared_ptr.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
@@ -356,9 +357,9 @@ namespace skyline::gpu {
    class Texture : public std::enable_shared_from_this<Texture> {
      private:
        GPU &gpu;
-        std::mutex mutex; //!< Synchronizes any mutations to the texture or its backing
+        RecursiveSpinLock mutex; //!< Synchronizes any mutations to the texture or its backing
        std::atomic<ContextTag> tag{}; //!< The tag associated with the last lock call
-        std::condition_variable backingCondition; //!< Signalled when a valid backing has been swapped in
+        std::condition_variable_any backingCondition; //!< Signalled when a valid backing has been swapped in
 
        using BackingType = std::variant<vk::Image, vk::raii::Image, memory::Image>;
        BackingType backing; //!< The Vulkan image that backs this texture, it is nullable