mirror of
https://github.com/skyline-emu/skyline.git
synced 2025-01-01 10:05:29 +03:00
Introduce FenceCycle Waiter Thread
A substantial amount of time is spent destroying dependencies on any thread that is waiting on or polling a `FenceCycle`. This is not optimal, as it blocks those threads from moving on to other tasks, while destruction is a fundamentally asynchronous task that can be delayed. This commit solves this by introducing a thread dedicated to waiting on every `FenceCycle`, then signalling it and destroying all of its dependencies, which entirely fixes the issue of destruction blocking more important threads.
This commit is contained in:
parent
5f8619f791
commit
e1a4325137
@ -368,7 +368,7 @@ namespace skyline::gpu {
|
|||||||
vkDevice(CreateDevice(vkContext, vkPhysicalDevice, vkQueueFamilyIndex, traits)),
|
vkDevice(CreateDevice(vkContext, vkPhysicalDevice, vkQueueFamilyIndex, traits)),
|
||||||
vkQueue(vkDevice, vkQueueFamilyIndex, 0),
|
vkQueue(vkDevice, vkQueueFamilyIndex, 0),
|
||||||
memory(*this),
|
memory(*this),
|
||||||
scheduler(*this),
|
scheduler(state, *this),
|
||||||
presentation(state, *this),
|
presentation(state, *this),
|
||||||
texture(*this),
|
texture(*this),
|
||||||
buffer(*this),
|
buffer(*this),
|
||||||
|
@ -2,19 +2,53 @@
|
|||||||
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||||
|
|
||||||
#include <gpu.h>
|
#include <gpu.h>
|
||||||
|
#include <loader/loader.h>
|
||||||
#include "command_scheduler.h"
|
#include "command_scheduler.h"
|
||||||
|
|
||||||
namespace skyline::gpu {
|
namespace skyline::gpu {
|
||||||
|
void CommandScheduler::WaiterThread() {
|
||||||
|
if (int result{pthread_setname_np(pthread_self(), "Sky-CycleWaiter")})
|
||||||
|
Logger::Warn("Failed to set the thread name: {}", strerror(result));
|
||||||
|
|
||||||
|
try {
|
||||||
|
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
|
||||||
|
|
||||||
|
cycleQueue.Process([](const std::shared_ptr<FenceCycle> &cycle) {
|
||||||
|
cycle->Wait(true);
|
||||||
|
}, [] {});
|
||||||
|
} catch (const signal::SignalException &e) {
|
||||||
|
Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
|
||||||
|
if (state.process)
|
||||||
|
state.process->Kill(false);
|
||||||
|
else
|
||||||
|
std::rethrow_exception(std::current_exception());
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
Logger::Error(e.what());
|
||||||
|
if (state.process)
|
||||||
|
state.process->Kill(false);
|
||||||
|
else
|
||||||
|
std::rethrow_exception(std::current_exception());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
|
CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
|
||||||
: device(device),
|
: device(device),
|
||||||
commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)),
|
commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)),
|
||||||
fence(device, vk::FenceCreateInfo{}),
|
fence(device, vk::FenceCreateInfo{}),
|
||||||
cycle(std::make_shared<FenceCycle>(device, *fence)) {}
|
cycle(std::make_shared<FenceCycle>(device, *fence)) {}
|
||||||
|
|
||||||
CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), pool(std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
|
CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
|
||||||
.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
|
: state{state},
|
||||||
.queueFamilyIndex = pGpu.vkQueueFamilyIndex,
|
gpu{pGpu},
|
||||||
}) {}
|
waiterThread{&CommandScheduler::WaiterThread, this},
|
||||||
|
pool{std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
|
||||||
|
.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
|
||||||
|
.queueFamilyIndex = pGpu.vkQueueFamilyIndex,
|
||||||
|
}} {}
|
||||||
|
|
||||||
|
CommandScheduler::~CommandScheduler() {
|
||||||
|
waiterThread.join();
|
||||||
|
}
|
||||||
|
|
||||||
CommandScheduler::ActiveCommandBuffer CommandScheduler::AllocateCommandBuffer() {
|
CommandScheduler::ActiveCommandBuffer CommandScheduler::AllocateCommandBuffer() {
|
||||||
for (auto &slot : pool->buffers) {
|
for (auto &slot : pool->buffers) {
|
||||||
@ -42,11 +76,15 @@ namespace skyline::gpu {
|
|||||||
return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
|
return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence) {
|
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle) {
|
||||||
std::scoped_lock lock(gpu.queueMutex);
|
{
|
||||||
gpu.vkQueue.submit(vk::SubmitInfo{
|
std::scoped_lock lock(gpu.queueMutex);
|
||||||
.commandBufferCount = 1,
|
gpu.vkQueue.submit(vk::SubmitInfo{
|
||||||
.pCommandBuffers = &*commandBuffer,
|
.commandBufferCount = 1,
|
||||||
}, fence);
|
.pCommandBuffers = &*commandBuffer,
|
||||||
|
}, cycle->fence);
|
||||||
|
}
|
||||||
|
|
||||||
|
cycleQueue.Push(cycle);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <common/thread_local.h>
|
#include <common/thread_local.h>
|
||||||
|
#include <common/circular_queue.h>
|
||||||
#include "fence_cycle.h"
|
#include "fence_cycle.h"
|
||||||
|
|
||||||
namespace skyline::gpu {
|
namespace skyline::gpu {
|
||||||
@ -25,6 +26,7 @@ namespace skyline::gpu {
|
|||||||
CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool);
|
CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const DeviceState &state;
|
||||||
GPU &gpu;
|
GPU &gpu;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -40,6 +42,12 @@ namespace skyline::gpu {
|
|||||||
};
|
};
|
||||||
ThreadLocal<CommandPool> pool;
|
ThreadLocal<CommandPool> pool;
|
||||||
|
|
||||||
|
std::thread waiterThread; //!< A thread that waits on and signals FenceCycle(s) then clears any associated resources
|
||||||
|
static constexpr size_t FenceCycleWaitCount{256}; //!< The amount of fence cycles the cycle queue can hold
|
||||||
|
CircularQueue<std::shared_ptr<FenceCycle>> cycleQueue{FenceCycleWaitCount}; //!< A circular queue containing all the active cycles that can be waited on
|
||||||
|
|
||||||
|
void WaiterThread();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* @brief An active command buffer occupies a slot and ensures that its status is updated correctly
|
* @brief An active command buffer occupies a slot and ensures that its status is updated correctly
|
||||||
@ -92,7 +100,9 @@ namespace skyline::gpu {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
CommandScheduler(GPU &gpu);
|
CommandScheduler(const DeviceState &state, GPU &gpu);
|
||||||
|
|
||||||
|
~CommandScheduler();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Allocates an existing or new primary command buffer from the pool
|
* @brief Allocates an existing or new primary command buffer from the pool
|
||||||
@ -100,9 +110,11 @@ namespace skyline::gpu {
|
|||||||
ActiveCommandBuffer AllocateCommandBuffer();
|
ActiveCommandBuffer AllocateCommandBuffer();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Submits a single command buffer to the GPU queue with an optional fence
|
* @brief Submits a single command buffer to the GPU queue while queuing it up to be waited on
|
||||||
|
* @note The supplied command buffer and cycle **must** be from AllocateCommandBuffer()
|
||||||
|
* @note Any cycle submitted via this method does not need to destroy dependencies manually, the waiter thread will handle this
|
||||||
*/
|
*/
|
||||||
void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence = {});
|
void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Submits a command buffer recorded with the supplied function synchronously
|
* @brief Submits a command buffer recorded with the supplied function synchronously
|
||||||
@ -116,8 +128,10 @@ namespace skyline::gpu {
|
|||||||
});
|
});
|
||||||
recordFunction(*commandBuffer);
|
recordFunction(*commandBuffer);
|
||||||
commandBuffer->end();
|
commandBuffer->end();
|
||||||
SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
|
|
||||||
return commandBuffer.GetFenceCycle();
|
auto cycle{commandBuffer.GetFenceCycle()};
|
||||||
|
SubmitCommandBuffer(*commandBuffer, cycle);
|
||||||
|
return cycle;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
commandBuffer.GetFenceCycle()->Cancel();
|
commandBuffer.GetFenceCycle()->Cancel();
|
||||||
std::rethrow_exception(std::current_exception());
|
std::rethrow_exception(std::current_exception());
|
||||||
@ -130,14 +144,16 @@ namespace skyline::gpu {
|
|||||||
template<typename RecordFunction>
|
template<typename RecordFunction>
|
||||||
std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
|
std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
|
||||||
auto commandBuffer{AllocateCommandBuffer()};
|
auto commandBuffer{AllocateCommandBuffer()};
|
||||||
|
auto cycle{commandBuffer.GetFenceCycle()};
|
||||||
try {
|
try {
|
||||||
commandBuffer->begin(vk::CommandBufferBeginInfo{
|
commandBuffer->begin(vk::CommandBufferBeginInfo{
|
||||||
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
|
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
|
||||||
});
|
});
|
||||||
recordFunction(*commandBuffer, commandBuffer.GetFenceCycle());
|
recordFunction(*commandBuffer, cycle);
|
||||||
commandBuffer->end();
|
commandBuffer->end();
|
||||||
SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
|
|
||||||
return commandBuffer.GetFenceCycle();
|
SubmitCommandBuffer(*commandBuffer, cycle);
|
||||||
|
return cycle;
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
commandBuffer.GetFenceCycle()->Cancel();
|
commandBuffer.GetFenceCycle()->Cancel();
|
||||||
std::rethrow_exception(std::current_exception());
|
std::rethrow_exception(std::current_exception());
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
#include <common/atomic_forward_list.h>
|
#include <common/atomic_forward_list.h>
|
||||||
|
|
||||||
namespace skyline::gpu {
|
namespace skyline::gpu {
|
||||||
struct FenceCycle;
|
class CommandScheduler;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief A wrapper around a Vulkan Fence which only tracks a single reset -> signal cycle with the ability to attach lifetimes of objects to it
|
* @brief A wrapper around a Vulkan Fence which only tracks a single reset -> signal cycle with the ability to attach lifetimes of objects to it
|
||||||
@ -17,19 +17,23 @@ namespace skyline::gpu {
|
|||||||
*/
|
*/
|
||||||
struct FenceCycle {
|
struct FenceCycle {
|
||||||
private:
|
private:
|
||||||
std::atomic_flag signalled;
|
std::atomic_flag signalled{}; //!< If the underlying fence has been signalled since the creation of this FenceCycle, this doesn't necessarily mean the dependencies have been destroyed
|
||||||
|
std::atomic_flag alreadyDestroyed{}; //!< If the cycle's dependencies are already destroyed, this prevents multiple destructions
|
||||||
const vk::raii::Device &device;
|
const vk::raii::Device &device;
|
||||||
vk::Fence fence;
|
vk::Fence fence;
|
||||||
|
|
||||||
|
friend CommandScheduler;
|
||||||
|
|
||||||
AtomicForwardList<std::shared_ptr<void>> dependencies; //!< A list of all dependencies on this fence cycle
|
AtomicForwardList<std::shared_ptr<void>> dependencies; //!< A list of all dependencies on this fence cycle
|
||||||
AtomicForwardList<std::shared_ptr<FenceCycle>> chainedCycles; //!< A list of all chained FenceCycles, this is used to express multi-fence dependencies
|
AtomicForwardList<std::shared_ptr<FenceCycle>> chainedCycles; //!< A list of all chained FenceCycles, this is used to express multi-fence dependencies
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Sequentially iterate through the shared_ptr linked list of dependencies and reset all pointers in a thread-safe atomic manner
|
* @brief Destroy all the dependencies of this cycle
|
||||||
* @note We cannot simply nullify the base pointer of the list as a false dependency chain is maintained between the objects when retained externally
|
* @note We cannot delete the chained cycles associated with this fence as they may be iterated over during the deletion, it is only safe to delete them during the destruction of the cycle
|
||||||
*/
|
*/
|
||||||
void DestroyDependencies() {
|
void DestroyDependencies() {
|
||||||
dependencies.Clear();
|
if (!alreadyDestroyed.test_and_set(std::memory_order_release))
|
||||||
|
dependencies.Clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -45,19 +49,23 @@ namespace skyline::gpu {
|
|||||||
* @brief Signals this fence regardless of if the underlying fence has been signalled or not
|
* @brief Signals this fence regardless of if the underlying fence has been signalled or not
|
||||||
*/
|
*/
|
||||||
void Cancel() {
|
void Cancel() {
|
||||||
if (!signalled.test_and_set(std::memory_order_release))
|
signalled.test_and_set(std::memory_order_release);
|
||||||
DestroyDependencies();
|
DestroyDependencies();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Wait on a fence cycle till it has been signalled
|
* @brief Wait on a fence cycle till it has been signalled
|
||||||
|
* @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
|
||||||
*/
|
*/
|
||||||
void Wait() {
|
void Wait(bool shouldDestroy = false) {
|
||||||
if (signalled.test(std::memory_order_consume))
|
if (signalled.test(std::memory_order_consume)) {
|
||||||
|
if (shouldDestroy)
|
||||||
|
DestroyDependencies();
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
chainedCycles.Iterate([](auto &cycle) {
|
chainedCycles.Iterate([shouldDestroy](auto &cycle) {
|
||||||
cycle->Wait();
|
cycle->Wait(shouldDestroy);
|
||||||
});
|
});
|
||||||
|
|
||||||
vk::Result waitResult;
|
vk::Result waitResult;
|
||||||
@ -73,21 +81,26 @@ namespace skyline::gpu {
|
|||||||
throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
|
throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!signalled.test_and_set(std::memory_order_release))
|
signalled.test_and_set(std::memory_order_release);
|
||||||
|
if (shouldDestroy)
|
||||||
DestroyDependencies();
|
DestroyDependencies();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Wait on a fence cycle with a timeout in nanoseconds
|
* @brief Wait on a fence cycle with a timeout in nanoseconds
|
||||||
|
* @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
|
||||||
* @return If the wait was successful or timed out
|
* @return If the wait was successful or timed out
|
||||||
*/
|
*/
|
||||||
bool Wait(i64 timeoutNs) {
|
bool Wait(i64 timeoutNs, bool shouldDestroy = false) {
|
||||||
if (signalled.test(std::memory_order_consume))
|
if (signalled.test(std::memory_order_consume)) {
|
||||||
|
if (shouldDestroy)
|
||||||
|
DestroyDependencies();
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
|
i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
|
||||||
if (!chainedCycles.AllOf([&](auto &cycle) {
|
if (!chainedCycles.AllOf([&](auto &cycle) {
|
||||||
if (!cycle->Wait(timeoutNs))
|
if (!cycle->Wait(timeoutNs, shouldDestroy))
|
||||||
return false;
|
return false;
|
||||||
timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
|
timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
|
||||||
return true;
|
return true;
|
||||||
@ -108,7 +121,8 @@ namespace skyline::gpu {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (waitResult == vk::Result::eSuccess) {
|
if (waitResult == vk::Result::eSuccess) {
|
||||||
if (!signalled.test_and_set(std::memory_order_release))
|
signalled.test_and_set(std::memory_order_release);
|
||||||
|
if (shouldDestroy)
|
||||||
DestroyDependencies();
|
DestroyDependencies();
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
@ -116,23 +130,31 @@ namespace skyline::gpu {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Wait(std::chrono::duration<i64, std::nano> timeout) {
|
bool Wait(std::chrono::duration<i64, std::nano> timeout, bool shouldDestroy = false) {
|
||||||
return Wait(timeout.count());
|
return Wait(timeout.count(), shouldDestroy);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param quick Skips the call to check the fence's status, just checking the signalled flag
|
||||||
* @return If the fence is signalled currently or not
|
* @return If the fence is signalled currently or not
|
||||||
*/
|
*/
|
||||||
bool Poll() {
|
bool Poll(bool quick = true, bool shouldDestroy = false) {
|
||||||
if (signalled.test(std::memory_order_consume))
|
if (signalled.test(std::memory_order_consume)) {
|
||||||
|
if (shouldDestroy)
|
||||||
|
DestroyDependencies();
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if (!chainedCycles.AllOf([](auto &cycle) { return cycle->Poll(); }))
|
if (quick)
|
||||||
|
return false; // We need to return early if we're not waiting on the fence
|
||||||
|
|
||||||
|
if (!chainedCycles.AllOf([=](auto &cycle) { return cycle->Poll(quick, shouldDestroy); }))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
|
auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
|
||||||
if (status == vk::Result::eSuccess) {
|
if (status == vk::Result::eSuccess) {
|
||||||
if (!signalled.test_and_set(std::memory_order_release))
|
signalled.test_and_set(std::memory_order_release);
|
||||||
|
if (shouldDestroy)
|
||||||
DestroyDependencies();
|
DestroyDependencies();
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -289,7 +289,7 @@ namespace skyline::gpu::interconnect {
|
|||||||
for (const auto &attachedBuffer : attachedBuffers)
|
for (const auto &attachedBuffer : attachedBuffers)
|
||||||
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls
|
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls
|
||||||
|
|
||||||
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
|
gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle);
|
||||||
|
|
||||||
nodes.clear();
|
nodes.clear();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user