Handle OOB blits by adding to the texture base offset

The previous method caused out-of-bounds (OOB) reads on the last row to clamp, and adding an extra row could run into unmapped memory. Instead, offset the texture base address by the start of the read, a technique based on how Ryu does it.
This commit is contained in:
Billy Laws 2023-04-05 14:57:52 +01:00
parent 6aef7fdd1e
commit 7d0b7f0b71
4 changed files with 23 additions and 17 deletions

View File

@ -13,7 +13,7 @@ namespace skyline::gpu::interconnect {
using IOVA = soc::gm20b::IOVA; using IOVA = soc::gm20b::IOVA;
using MemoryLayout = skyline::soc::gm20b::engine::fermi2d::type::MemoryLayout; using MemoryLayout = skyline::soc::gm20b::engine::fermi2d::type::MemoryLayout;
gpu::GuestTexture Fermi2D::GetGuestTexture(const Surface &surface) { std::pair<gpu::GuestTexture, bool> Fermi2D::GetGuestTexture(const Surface &surface, u32 oobReadStart, u32 oobReadWidth) {
auto determineFormat = [&](Surface::SurfaceFormat format) -> skyline::gpu::texture::Format { auto determineFormat = [&](Surface::SurfaceFormat format) -> skyline::gpu::texture::Format {
#define FORMAT_CASE(fermiFmt, skFmt, fmtType) \ #define FORMAT_CASE(fermiFmt, skFmt, fmtType) \
case Surface::SurfaceFormat::fermiFmt ## fmtType: \ case Surface::SurfaceFormat::fermiFmt ## fmtType: \
@ -84,8 +84,15 @@ namespace skyline::gpu::interconnect {
texture.layerCount = 1; texture.layerCount = 1;
texture.viewType = vk::ImageViewType::e2D; texture.viewType = vk::ImageViewType::e2D;
u64 addressOffset{};
if (surface.memoryLayout == MemoryLayout::Pitch) { if (surface.memoryLayout == MemoryLayout::Pitch) {
texture.dimensions = gpu::texture::Dimensions{surface.stride / texture.format->bpb, surface.height, 1}; texture.dimensions = gpu::texture::Dimensions{surface.stride / texture.format->bpb, surface.height, 1};
// OpenGL games rely on reads wrapping around to the next line when reading out of bounds, emulate this behaviour by offsetting the address
if (oobReadStart && surface.width == (oobReadWidth + oobReadStart) && (oobReadWidth + oobReadStart) > texture.dimensions.width)
addressOffset += oobReadStart * texture.format->bpb;
texture.tileConfig = gpu::texture::TileConfig{ texture.tileConfig = gpu::texture::TileConfig{
.mode = gpu::texture::TileMode::Pitch, .mode = gpu::texture::TileMode::Pitch,
.pitch = surface.stride .pitch = surface.stride
@ -99,11 +106,11 @@ namespace skyline::gpu::interconnect {
}; };
} }
IOVA iova{surface.address}; u64 iova{u64{surface.address} + addressOffset};
auto mappings{channelCtx.asCtx->gmmu.TranslateRange(iova, texture.GetSize())}; auto mappings{channelCtx.asCtx->gmmu.TranslateRange(iova, texture.GetSize())};
texture.mappings.assign(mappings.begin(), mappings.end()); texture.mappings.assign(mappings.begin(), mappings.end());
return texture; return {texture, addressOffset != 0};
} }
Fermi2D::Fermi2D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx) Fermi2D::Fermi2D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
@ -114,10 +121,19 @@ namespace skyline::gpu::interconnect {
void Fermi2D::Blit(const Surface &srcSurface, const Surface &dstSurface, float srcRectX, float srcRectY, u32 dstRectWidth, u32 dstRectHeight, u32 dstRectX, u32 dstRectY, float duDx, float dvDy, SampleModeOrigin sampleOrigin, bool resolve, SampleModeFilter filter) { void Fermi2D::Blit(const Surface &srcSurface, const Surface &dstSurface, float srcRectX, float srcRectY, u32 dstRectWidth, u32 dstRectHeight, u32 dstRectX, u32 dstRectY, float duDx, float dvDy, SampleModeOrigin sampleOrigin, bool resolve, SampleModeFilter filter) {
TRACE_EVENT("gpu", "Fermi2D::Blit"); TRACE_EVENT("gpu", "Fermi2D::Blit");
// TODO: When we support MSAA perform a resolve operation rather than blit when the `resolve` flag is set. // Blit shader always samples from centre so adjust if necessary
auto srcGuestTexture{GetGuestTexture(srcSurface)}; float centredSrcRectX{sampleOrigin == SampleModeOrigin::Corner ? srcRectX - 0.5f : srcRectX};
auto dstGuestTexture{GetGuestTexture(dstSurface)}; float centredSrcRectY{sampleOrigin == SampleModeOrigin::Corner ? srcRectY - 0.5f : srcRectY};
u32 oobReadStart{static_cast<u32>(centredSrcRectX)};
u32 oobReadWidth{static_cast<u32>(duDx * static_cast<float>(dstRectWidth))};
// TODO: When we support MSAA perform a resolve operation rather than blit when the `resolve` flag is set.
auto [srcGuestTexture, srcWentOob]{GetGuestTexture(srcSurface, oobReadStart, oobReadWidth)};
if (srcWentOob)
centredSrcRectX = 0.0f;
auto [dstGuestTexture, dstWentOob]{GetGuestTexture(dstSurface)};
auto srcTextureView{gpu.texture.FindOrCreate(srcGuestTexture, executor.tag)}; auto srcTextureView{gpu.texture.FindOrCreate(srcGuestTexture, executor.tag)};
executor.AttachDependency(srcTextureView); executor.AttachDependency(srcTextureView);
executor.AttachTexture(srcTextureView.get()); executor.AttachTexture(srcTextureView.get());
@ -127,10 +143,6 @@ namespace skyline::gpu::interconnect {
executor.AttachTexture(dstTextureView.get()); executor.AttachTexture(dstTextureView.get());
dstTextureView->texture->MarkGpuDirty(executor.usageTracker); dstTextureView->texture->MarkGpuDirty(executor.usageTracker);
// Blit shader always samples from centre so adjust if necessary
float centredSrcRectX{sampleOrigin == SampleModeOrigin::Corner ? srcRectX - 0.5f : srcRectX};
float centredSrcRectY{sampleOrigin == SampleModeOrigin::Corner ? srcRectY - 0.5f : srcRectY};
executor.AddCheckpoint("Before blit"); executor.AddCheckpoint("Before blit");
gpu.helperShaders.blitHelperShader.Blit( gpu.helperShaders.blitHelperShader.Blit(
gpu, gpu,

View File

@ -33,7 +33,7 @@ namespace skyline::gpu::interconnect {
soc::gm20b::ChannelContext &channelCtx; soc::gm20b::ChannelContext &channelCtx;
gpu::interconnect::CommandExecutor &executor; gpu::interconnect::CommandExecutor &executor;
gpu::GuestTexture GetGuestTexture(const Surface &surface); std::pair<gpu::GuestTexture, bool> GetGuestTexture(const Surface &surface, u32 oobReadStart = 0, u32 oobReadWidth = 0);
public: public:
Fermi2D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx); Fermi2D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);

View File

@ -175,7 +175,6 @@ namespace skyline::gpu {
struct FragmentPushConstantLayout { struct FragmentPushConstantLayout {
glsl::Vec2 srcOriginUV; glsl::Vec2 srcOriginUV;
glsl::Vec2 dstSrcScaleFactor; glsl::Vec2 dstSrcScaleFactor;
float srcHeightRecip;
}; };
constexpr static std::array<vk::PushConstantRange, 2> PushConstantRanges{ constexpr static std::array<vk::PushConstantRange, 2> PushConstantRanges{
@ -254,7 +253,6 @@ namespace skyline::gpu {
}, blit::FragmentPushConstantLayout{ }, blit::FragmentPushConstantLayout{
.srcOriginUV = {srcRect.x / srcImageDimensions.width, srcRect.y / srcImageDimensions.height}, .srcOriginUV = {srcRect.x / srcImageDimensions.width, srcRect.y / srcImageDimensions.height},
.dstSrcScaleFactor = {dstSrcScaleFactorX * (srcRect.width / srcImageDimensions.width), dstSrcScaleFactorY * (srcRect.height / srcImageDimensions.height)}, .dstSrcScaleFactor = {dstSrcScaleFactorX * (srcRect.width / srcImageDimensions.width), dstSrcScaleFactorY * (srcRect.height / srcImageDimensions.height)},
.srcHeightRecip = 1.0f / srcImageDimensions.height
}, },
GetPipeline(gpu, GetPipeline(gpu,
{dstImageView->format->vkFormat, {dstImageView->format->vkFormat,

View File

@ -8,14 +8,10 @@ layout (push_constant) uniform constants {
layout (offset = 16) layout (offset = 16)
vec2 srcOriginUV; vec2 srcOriginUV;
vec2 dstSrcScaleFactor; vec2 dstSrcScaleFactor;
float srcHeightRecip;
} PC; } PC;
void main() void main()
{ {
vec2 srcUV = dstUV * PC.dstSrcScaleFactor + PC.srcOriginUV; vec2 srcUV = dstUV * PC.dstSrcScaleFactor + PC.srcOriginUV;
// Account for out of bounds blits by moving to the next line of the source texture for the copy
srcUV.y += floor(srcUV.x) * PC.srcHeightRecip;
srcUV.x = srcUV.x - floor(srcUV.x);
colour.rgba = texture(src, srcUV); colour.rgba = texture(src, srcUV);
} }