mirror of
https://github.com/skyline-emu/skyline.git
synced 2025-01-16 17:07:55 +03:00
Implement accelerated uploads/copies through buffer manager
Previously, both I2M uploads and DMA copies would force GPU serialisation if they happened to hit a trap or were used to copy GPU dirty buffers. By using the buffer manager to implement them on the host GPU we can avoid such slowdowns entirely.
This commit is contained in:
parent
c5ec484d9a
commit
cac287d9fd
@ -186,6 +186,8 @@ add_library(skyline SHARED
|
||||
${source_DIR}/skyline/gpu/cache/renderpass_cache.cpp
|
||||
${source_DIR}/skyline/gpu/cache/framebuffer_cache.cpp
|
||||
${source_DIR}/skyline/gpu/interconnect/fermi_2d.cpp
|
||||
${source_DIR}/skyline/gpu/interconnect/maxwell_dma.cpp
|
||||
${source_DIR}/skyline/gpu/interconnect/inline2memory.cpp
|
||||
${source_DIR}/skyline/gpu/interconnect/maxwell_3d/common.cpp
|
||||
${source_DIR}/skyline/gpu/interconnect/maxwell_3d/active_state.cpp
|
||||
${source_DIR}/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
|
||||
|
50
app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
Normal file
50
app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
Normal file
@ -0,0 +1,50 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
|
||||
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||
|
||||
#include <gpu/buffer_manager.h>
|
||||
#include <soc/gm20b/gmmu.h>
|
||||
#include <soc/gm20b/channel.h>
|
||||
#include "inline2memory.h"
|
||||
|
||||
namespace skyline::gpu::interconnect {
|
||||
using IOVA = soc::gm20b::IOVA;
|
||||
|
||||
Inline2Memory::Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
|
||||
: gpu{gpu},
|
||||
channelCtx{channelCtx},
|
||||
executor{channelCtx.executor} {}
|
||||
|
||||
void Inline2Memory::Upload(IOVA dst, span<u32> src) {
|
||||
auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, src.size_bytes())};
|
||||
|
||||
if (dstMappings.size() > 1)
|
||||
Logger::Warn("Split mapping are unsupported for DMA copies");
|
||||
|
||||
auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
|
||||
executor.AttachLockedBuffer(buffer, std::move(lock));
|
||||
})};
|
||||
ContextLock dstBufLock{executor.tag, dstBuf};
|
||||
|
||||
|
||||
dstBuf.Write(src.cast<u8>(), 0, [&]() {
|
||||
executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
|
||||
// This will prevent any CPU accesses to backing for the duration of the usage
|
||||
dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
|
||||
|
||||
auto srcGpuAllocation{gpu.megaBufferAllocator.Push(executor.cycle, src.cast<u8>())};
|
||||
executor.AddOutsideRpCommand([srcGpuAllocation, dstBuf, src](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
|
||||
vk::BufferCopy copyRegion{
|
||||
.size = src.size_bytes(),
|
||||
.srcOffset = srcGpuAllocation.offset,
|
||||
.dstOffset = dstBuf.GetOffset()
|
||||
};
|
||||
commandBuffer.copyBuffer(srcGpuAllocation.buffer, dstBuf.GetBuffer()->GetBacking(), copyRegion);
|
||||
commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
|
||||
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
|
||||
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
|
||||
}, {}, {});
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
36
app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h
Normal file
36
app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h
Normal file
@ -0,0 +1,36 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
|
||||
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <soc/gm20b/gmmu.h>
|
||||
|
||||
namespace skyline::gpu {
|
||||
class GPU;
|
||||
}
|
||||
|
||||
namespace skyline::soc::gm20b {
|
||||
struct ChannelContext;
|
||||
}
|
||||
|
||||
namespace skyline::gpu::interconnect {
|
||||
class CommandExecutor;
|
||||
|
||||
/**
|
||||
* @brief Handles translating I2M operations to Vulkan
|
||||
*/
|
||||
class Inline2Memory {
|
||||
private:
|
||||
using IOVA = soc::gm20b::IOVA;
|
||||
|
||||
GPU &gpu;
|
||||
soc::gm20b::ChannelContext &channelCtx;
|
||||
gpu::interconnect::CommandExecutor &executor;
|
||||
|
||||
public:
|
||||
Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
|
||||
|
||||
void Upload(IOVA dst, span<u32> src);
|
||||
};
|
||||
}
|
61
app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
Normal file
61
app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
Normal file
@ -0,0 +1,61 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
|
||||
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||
|
||||
#include <gpu/buffer_manager.h>
|
||||
#include <soc/gm20b/gmmu.h>
|
||||
#include <soc/gm20b/channel.h>
|
||||
#include "maxwell_dma.h"
|
||||
|
||||
namespace skyline::gpu::interconnect {
|
||||
using IOVA = soc::gm20b::IOVA;
|
||||
|
||||
MaxwellDma::MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
|
||||
: gpu{gpu},
|
||||
channelCtx{channelCtx},
|
||||
executor{channelCtx.executor} {}
|
||||
|
||||
void MaxwellDma::Copy(IOVA dst, IOVA src, size_t size) {
|
||||
auto srcMappings{channelCtx.asCtx->gmmu.TranslateRange(src, size)};
|
||||
auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, size)};
|
||||
|
||||
if (srcMappings.size() > 1 || dstMappings.size() > 1)
|
||||
Logger::Warn("Split mapping are unsupported for DMA copies");
|
||||
|
||||
auto srcBuf{gpu.buffer.FindOrCreate(srcMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
|
||||
executor.AttachLockedBuffer(buffer, std::move(lock));
|
||||
})};
|
||||
ContextLock srcBufLock{executor.tag, srcBuf};
|
||||
|
||||
auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
|
||||
executor.AttachLockedBuffer(buffer, std::move(lock));
|
||||
})};
|
||||
ContextLock dstBufLock{executor.tag, dstBuf};
|
||||
|
||||
dstBuf.CopyFrom(srcBuf, [&]() {
|
||||
executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
|
||||
executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
|
||||
// This will prevent any CPU accesses to backing for the duration of the usage
|
||||
// GPU dirtiness will be handled on the CopyFrom end as it's not always necessary
|
||||
srcBuf.GetBuffer()->BlockAllCpuBackingWrites();
|
||||
dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
|
||||
|
||||
executor.AddOutsideRpCommand([srcBuf, dstBuf](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
|
||||
commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer, {}, vk::MemoryBarrier{
|
||||
.srcAccessMask = vk::AccessFlagBits::eMemoryRead,
|
||||
.dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite
|
||||
}, {}, {});
|
||||
vk::BufferCopy copyRegion{
|
||||
.size = srcBuf.size,
|
||||
.srcOffset = srcBuf.GetOffset(),
|
||||
.dstOffset = dstBuf.GetOffset()
|
||||
};
|
||||
commandBuffer.copyBuffer(srcBuf.GetBuffer()->GetBacking(), dstBuf.GetBuffer()->GetBacking(), copyRegion);
|
||||
commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
|
||||
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
|
||||
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
|
||||
}, {}, {});
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
36
app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h
Normal file
36
app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h
Normal file
@ -0,0 +1,36 @@
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
|
||||
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <soc/gm20b/gmmu.h>
|
||||
|
||||
namespace skyline::gpu {
|
||||
class GPU;
|
||||
}
|
||||
|
||||
namespace skyline::soc::gm20b {
|
||||
struct ChannelContext;
|
||||
}
|
||||
|
||||
namespace skyline::gpu::interconnect {
|
||||
class CommandExecutor;
|
||||
|
||||
/**
|
||||
* @brief Handles translating Maxwell DMA operations to Vulkan
|
||||
*/
|
||||
class MaxwellDma {
|
||||
private:
|
||||
using IOVA = soc::gm20b::IOVA;
|
||||
|
||||
GPU &gpu;
|
||||
soc::gm20b::ChannelContext &channelCtx;
|
||||
gpu::interconnect::CommandExecutor &executor;
|
||||
|
||||
public:
|
||||
MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
|
||||
|
||||
void Copy(IOVA dst, IOVA src, size_t size);
|
||||
};
|
||||
}
|
@ -5,7 +5,9 @@
|
||||
#include "inline2memory.h"
|
||||
|
||||
namespace skyline::soc::gm20b::engine {
|
||||
Inline2MemoryBackend::Inline2MemoryBackend(ChannelContext &channelCtx) : channelCtx(channelCtx) {}
|
||||
Inline2MemoryBackend::Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx)
|
||||
: interconnect{*state.gpu, channelCtx},
|
||||
channelCtx{channelCtx} {}
|
||||
|
||||
void Inline2MemoryBackend::LaunchDma(Inline2MemoryBackend::RegisterState &state) {
|
||||
writeOffset = 0;
|
||||
@ -17,13 +19,11 @@ namespace skyline::soc::gm20b::engine {
|
||||
if (state.launchDma.completion == RegisterState::DmaCompletionType::ReleaseSemaphore)
|
||||
throw exception("Semaphore release on I2M completion is not supported!");
|
||||
|
||||
channelCtx.executor.Submit();
|
||||
|
||||
if (state.launchDma.layout == RegisterState::DmaDstMemoryLayout::Pitch && state.lineCount == 1) {
|
||||
// TODO: we can do this with the buffer manager to avoid some overhead in the future
|
||||
Logger::Debug("range: 0x{:X} -> 0x{:X}", u64{state.offsetOut}, u64{state.offsetOut} + buffer.size() * 0x4);
|
||||
channelCtx.asCtx->gmmu.Write(state.offsetOut, span(buffer));
|
||||
interconnect.Upload(u64{state.offsetOut}, span{buffer});
|
||||
} else {
|
||||
channelCtx.executor.Submit();
|
||||
Logger::Warn("Non-linear I2M uploads are not supported!");
|
||||
}
|
||||
}
|
||||
@ -49,7 +49,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
CompleteDma(state);
|
||||
}
|
||||
|
||||
Inline2Memory::Inline2Memory(ChannelContext &channelCtx) : backend(channelCtx) {}
|
||||
Inline2Memory::Inline2Memory(const DeviceState &state, ChannelContext &channelCtx) : backend{state, channelCtx} {}
|
||||
|
||||
__attribute__((always_inline)) void Inline2Memory::CallMethod(u32 method, u32 argument) {
|
||||
Logger::Verbose("Called method in I2M: 0x{:X} args: 0x{:X}", method, argument);
|
||||
|
@ -4,6 +4,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <common.h>
|
||||
#include <gpu/interconnect/inline2memory.h>
|
||||
#include "engine.h"
|
||||
|
||||
namespace skyline::soc::gm20b {
|
||||
@ -18,6 +19,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
private:
|
||||
std::vector<u32> buffer; //!< Temporary buffer to hold data being currently uploaded
|
||||
u32 writeOffset{}; //!< Current write offset in words into `buffer`
|
||||
gpu::interconnect::Inline2Memory interconnect;
|
||||
ChannelContext &channelCtx;
|
||||
|
||||
public:
|
||||
@ -124,7 +126,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
void CompleteDma(RegisterState &state);
|
||||
|
||||
public:
|
||||
Inline2MemoryBackend(ChannelContext &channelCtx);
|
||||
Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx);
|
||||
|
||||
/**
|
||||
* @brief Should be called when launchDma in `state` is written to
|
||||
@ -164,7 +166,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
} registers{};
|
||||
|
||||
public:
|
||||
Inline2Memory(ChannelContext &channelCtx);
|
||||
Inline2Memory(const DeviceState &state, ChannelContext &channelCtx);
|
||||
|
||||
void CallMethod(u32 method, u32 argument);
|
||||
|
||||
|
@ -8,7 +8,7 @@
|
||||
|
||||
namespace skyline::soc::gm20b::engine {
|
||||
KeplerCompute::KeplerCompute(const DeviceState &state, ChannelContext &channelCtx)
|
||||
: syncpoints(state.soc->host1x.syncpoints), i2m(channelCtx) {}
|
||||
: syncpoints{state.soc->host1x.syncpoints}, i2m{state, channelCtx} {}
|
||||
|
||||
__attribute__((always_inline)) void KeplerCompute::CallMethod(u32 method, u32 argument) {
|
||||
Logger::Verbose("Called method in Kepler compute: 0x{:X} args: 0x{:X}", method, argument);
|
||||
|
@ -11,8 +11,10 @@
|
||||
#include "maxwell_dma.h"
|
||||
|
||||
namespace skyline::soc::gm20b::engine {
|
||||
MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor)
|
||||
: channelCtx(channelCtx), syncpoints(state.soc->host1x.syncpoints), executor(executor) {}
|
||||
MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx)
|
||||
: channelCtx{channelCtx},
|
||||
syncpoints{state.soc->host1x.syncpoints},
|
||||
interconnect{*state.gpu, channelCtx} {}
|
||||
|
||||
__attribute__((always_inline)) void MaxwellDma::CallMethod(u32 method, u32 argument) {
|
||||
Logger::Verbose("Called method in Maxwell DMA: 0x{:X} args: 0x{:X}", method, argument);
|
||||
@ -36,8 +38,8 @@ namespace skyline::soc::gm20b::engine {
|
||||
return;
|
||||
}
|
||||
|
||||
executor.Submit();
|
||||
if (registers.launchDma->multiLineEnable) {
|
||||
channelCtx.executor.Submit();
|
||||
if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch &&
|
||||
registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear)
|
||||
CopyPitchToBlockLinear();
|
||||
@ -51,7 +53,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
// 1D buffer copy
|
||||
// TODO: implement swizzled 1D copies based on VMM 'kind'
|
||||
Logger::Debug("src: 0x{:X} dst: 0x{:X} size: 0x{:X}", u64{*registers.offsetIn}, u64{*registers.offsetOut}, *registers.lineLengthIn);
|
||||
channelCtx.asCtx->gmmu.Copy(*registers.offsetOut, *registers.offsetIn, *registers.lineLengthIn);
|
||||
interconnect.Copy(u64{*registers.offsetOut}, u64{*registers.offsetIn}, u64{*registers.lineLengthIn});
|
||||
}
|
||||
|
||||
ReleaseSemaphore();
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <gpu/interconnect/maxwell_dma.h>
|
||||
#include "engine.h"
|
||||
|
||||
namespace skyline::gpu::interconnect {
|
||||
@ -21,7 +22,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
private:
|
||||
host1x::SyncpointSet &syncpoints;
|
||||
ChannelContext &channelCtx;
|
||||
gpu::interconnect::CommandExecutor &executor;
|
||||
gpu::interconnect::MaxwellDma interconnect;
|
||||
|
||||
void HandleMethod(u32 method, u32 argument);
|
||||
|
||||
@ -253,7 +254,7 @@ namespace skyline::soc::gm20b::engine {
|
||||
static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4));
|
||||
#pragma pack(pop)
|
||||
|
||||
MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor);
|
||||
MaxwellDma(const DeviceState &state, ChannelContext &channelCtx);
|
||||
|
||||
void CallMethod(u32 method, u32 argument);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user