Rewrite Fermi 2D engine to use the blit helper shader

Entirely rewrites the engine and interconnect code to take advantage of the subpixel and OOB blit support offered by the blit helper shader. The interconnect code is also cleaned up significantly, with the 'context' naming being dropped due to potential conflicts with the 'context' from the context lock.
This commit is contained in:
Billy Laws 2022-07-31 15:05:51 +01:00
parent 395f665a13
commit 06053d3caf
8 changed files with 221 additions and 192 deletions

View File

@ -1,155 +0,0 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <gpu/texture/format.h>
#include <gpu/texture/texture.h>
#include <gpu/texture_manager.h>
#include <gpu/buffer.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include <soc/gm20b/engines/fermi/types.h>
namespace skyline::gpu::interconnect {
using IOVA = soc::gm20b::IOVA;
namespace fermi2d = skyline::soc::gm20b::engine::fermi2d::type;
/**
* @brief Handles translating Fermi 2D engine blit operations to Vulkan
*/
class BlitContext {
  private:
    GPU &gpu;
    soc::gm20b::ChannelContext &channelCtx; //!< Used to translate surface addresses through the channel's GMMU
    gpu::interconnect::CommandExecutor &executor; //!< Used to look up textures and record the blit command

    /**
     * @brief Builds the guest texture description (format, dimensions, tiling and memory mappings) for a Fermi 2D surface
     * @param surface The Fermi 2D surface register state to describe
     * @return A single-layer 2D guest texture covering the surface
     * @throws exception If the surface format has no known translation
     */
    gpu::GuestTexture GetGuestTexture(const fermi2d::Surface &surface) {
        // Translates a Fermi surface format into the corresponding Skyline texture format, X-component formats are treated as their alpha equivalents
        auto determineFormat = [&](fermi2d::Surface::SurfaceFormat format) -> skyline::gpu::texture::Format {
            #define FORMAT_CASE(fermiFmt, skFmt, fmtType) \
                case fermi2d::Surface::SurfaceFormat::fermiFmt ## fmtType: \
                    return skyline::gpu::format::skFmt ## fmtType

            #define FORMAT_SAME_CASE(fmt, type) FORMAT_CASE(fmt, fmt, type)

            #define FORMAT_NORM_CASE(fermiFmt, skFmt) \
                FORMAT_CASE(fermiFmt, skFmt, Snorm); \
                FORMAT_CASE(fermiFmt, skFmt, Unorm)

            #define FORMAT_SAME_NORM_CASE(fmt) FORMAT_NORM_CASE(fmt, fmt)

            #define FORMAT_NORM_FLOAT_CASE(fermiFmt, skFmt) \
                FORMAT_NORM_CASE(fermiFmt, skFmt); \
                FORMAT_CASE(fermiFmt, skFmt, Float)

            #define FORMAT_SAME_NORM_FLOAT_CASE(fmt) FORMAT_NORM_FLOAT_CASE(fmt, fmt)

            switch (format) {
                FORMAT_SAME_NORM_CASE(R8);
                FORMAT_SAME_NORM_FLOAT_CASE(R16);
                FORMAT_SAME_NORM_CASE(R8G8);
                FORMAT_SAME_CASE(B5G6R5, Unorm);
                FORMAT_SAME_CASE(B5G5R5A1, Unorm);
                FORMAT_SAME_CASE(R32, Float);
                FORMAT_SAME_CASE(B10G11R11, Float);
                FORMAT_SAME_NORM_FLOAT_CASE(R16G16);
                FORMAT_SAME_CASE(R8G8B8A8, Unorm);
                FORMAT_SAME_CASE(R8G8B8A8, Srgb);
                FORMAT_NORM_CASE(R8G8B8X8, R8G8B8A8);
                FORMAT_CASE(R8G8B8X8, R8G8B8A8, Srgb);
                FORMAT_SAME_CASE(B8G8R8A8, Unorm);
                FORMAT_SAME_CASE(B8G8R8A8, Srgb);
                FORMAT_SAME_CASE(A2B10G10R10, Unorm);
                FORMAT_SAME_CASE(R32G32, Float);
                FORMAT_SAME_CASE(R16G16B16A16, Float);
                FORMAT_NORM_FLOAT_CASE(R16G16B16X16, R16G16B16A16);
                FORMAT_SAME_CASE(R32G32B32A32, Float);
                FORMAT_CASE(R32G32B32X32, R32G32B32A32, Float);

                default:
                    throw exception("Cannot translate the supplied surface format: 0x{:X}", static_cast<u32>(format));
            }

            #undef FORMAT_CASE
            #undef FORMAT_SAME_CASE
            #undef FORMAT_NORM_CASE
            #undef FORMAT_SAME_NORM_CASE
            #undef FORMAT_NORM_FLOAT_CASE
            #undef FORMAT_SAME_NORM_FLOAT_CASE
        };

        GuestTexture texture{};
        texture.format = determineFormat(surface.format);
        texture.aspect = texture.format->vkAspect;
        texture.baseArrayLayer = 0;
        texture.layerCount = 1;
        texture.viewType = vk::ImageViewType::e2D;

        if (surface.memoryLayout == fermi2d::MemoryLayout::Pitch) {
            // Pitch surfaces derive their width from the stride rather than from the width register
            texture.dimensions = gpu::texture::Dimensions{surface.stride / texture.format->bpb, surface.height, 1};
            texture.tileConfig = gpu::texture::TileConfig{
                .mode = gpu::texture::TileMode::Pitch,
                .pitch = surface.stride
            };
        } else {
            texture.dimensions = gpu::texture::Dimensions{surface.width, surface.height, surface.depth};
            texture.tileConfig = gpu::texture::TileConfig{
                .mode = gpu::texture::TileMode::Block,
                .blockHeight = surface.blockSize.Height(),
                .blockDepth = surface.blockSize.Depth(),
            };
        }

        // Resolve the guest GPU virtual address range backing the surface into its mappings
        IOVA iova{surface.address};
        auto mappings{channelCtx.asCtx->gmmu.TranslateRange(iova, texture.GetSize())};
        texture.mappings.assign(mappings.begin(), mappings.end());

        return texture;
    }

  public:
    BlitContext(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor) : gpu(gpu), channelCtx(channelCtx), executor(executor) {}

    /**
     * @brief Records a Vulkan image blit from a region of the source surface to a region of the destination surface
     * @param srcX/srcY The origin of the source region in pixels
     * @param srcWidth/srcHeight The extent of the source region in pixels
     * @param dstX/dstY The origin of the destination region in pixels
     * @param dstWidth/dstHeight The extent of the destination region in pixels
     * @param resolve Currently unused, see the MSAA TODO below
     * @param linearFilter If linear filtering should be used for the blit rather than nearest filtering
     */
    void Blit(const fermi2d::Surface &srcSurface, const fermi2d::Surface &dstSurface, i32 srcX, i32 srcY, i32 srcWidth, i32 srcHeight, i32 dstX, i32 dstY, i32 dstWidth, i32 dstHeight, bool resolve, bool linearFilter) {
        // TODO: OOB blit: https://github.com/Ryujinx/Ryujinx/blob/master/Ryujinx.Graphics.Gpu/Engine/Twod/TwodClass.cs#L287
        // TODO: When we support MSAA perform a resolve operation rather than blit when the `resolve` flag is set.

        auto srcGuestTexture{GetGuestTexture(srcSurface)};
        auto dstGuestTexture{GetGuestTexture(dstSurface)};

        auto &textureManager{executor.AcquireTextureManager()};
        auto srcTextureView{textureManager.FindOrCreate(srcGuestTexture, executor.tag)};
        executor.AttachTexture(srcTextureView.get());

        auto dstTextureView{textureManager.FindOrCreate(dstGuestTexture, executor.tag)};
        executor.AttachTexture(dstTextureView.get());

        // Designated initializers are listed in the declaration order of vk::ImageSubresourceLayers (aspectMask, mipLevel, baseArrayLayer, layerCount) as required by standard C++20
        auto getSubresourceLayers{[](const vk::ImageSubresourceRange &range, vk::ImageAspectFlags aspect) {
            return vk::ImageSubresourceLayers{
                .aspectMask = aspect,
                .mipLevel = 0, // Blit engine only does one layer/mip level at a time
                .baseArrayLayer = range.baseArrayLayer,
                .layerCount = 1
            };
        }};

        // NOTE(review): the destination subresource reuses the *source* view's aspect mask; vkCmdBlitImage requires both aspect masks to match, so this may be intentional - confirm
        vk::ImageBlit region{
            .srcSubresource = getSubresourceLayers(srcTextureView->range, srcTextureView->format->vkAspect),
            .dstSubresource = getSubresourceLayers(dstTextureView->range, srcTextureView->range.aspectMask),
            .srcOffsets = {{vk::Offset3D{srcX, srcY, 0}, vk::Offset3D{srcX + srcWidth, srcY + srcHeight, 1}}},
            .dstOffsets = {{vk::Offset3D{dstX, dstY, 0}, vk::Offset3D{dstX + dstWidth, dstY + dstHeight, 1}}}
        };

        // The texture views are captured by value so that they are still alive when the recorded command runs
        executor.AddOutsideRpCommand([region, srcTextureView, dstTextureView, linearFilter](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &) {
            auto blitSrcImage{srcTextureView->texture->GetBacking()};
            auto blitDstImage{dstTextureView->texture->GetBacking()};

            commandBuffer.blitImage(blitSrcImage, vk::ImageLayout::eGeneral,
                                    blitDstImage, vk::ImageLayout::eGeneral,
                                    region,
                                    linearFilter ? vk::Filter::eLinear : vk::Filter::eNearest);
        });
    }
};
}

View File

@ -0,0 +1,148 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/texture/format.h>
#include <gpu/texture/texture.h>
#include <gpu/texture_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "fermi_2d.h"
namespace skyline::gpu::interconnect {
using IOVA = soc::gm20b::IOVA;
using MemoryLayout = skyline::soc::gm20b::engine::fermi2d::type::MemoryLayout;
gpu::GuestTexture Fermi2D::GetGuestTexture(const Surface &surface) {
    // Looks up the Skyline texture format matching a Fermi surface format, formats with an unused X component map onto their alpha-carrying equivalents
    auto toTextureFormat = [&](Surface::SurfaceFormat fermiFormat) -> skyline::gpu::texture::Format {
        #define FORMAT_CASE(fermiFmt, skFmt, fmtType) \
            case Surface::SurfaceFormat::fermiFmt ## fmtType: \
                return skyline::gpu::format::skFmt ## fmtType

        #define FORMAT_SAME_CASE(fmt, type) FORMAT_CASE(fmt, fmt, type)

        switch (fermiFormat) {
            // 8-bit and 16-bit single/dual component formats
            FORMAT_SAME_CASE(R8, Unorm);
            FORMAT_SAME_CASE(R8, Snorm);

            FORMAT_SAME_CASE(R16, Unorm);
            FORMAT_SAME_CASE(R16, Snorm);
            FORMAT_SAME_CASE(R16, Float);

            FORMAT_SAME_CASE(R8G8, Unorm);
            FORMAT_SAME_CASE(R8G8, Snorm);

            FORMAT_SAME_CASE(B5G6R5, Unorm);

            FORMAT_SAME_CASE(B5G5R5A1, Unorm);

            // 32-bit formats
            FORMAT_SAME_CASE(R32, Float);

            FORMAT_SAME_CASE(B10G11R11, Float);

            FORMAT_SAME_CASE(R16G16, Unorm);
            FORMAT_SAME_CASE(R16G16, Snorm);
            FORMAT_SAME_CASE(R16G16, Float);

            FORMAT_SAME_CASE(R8G8B8A8, Unorm);
            FORMAT_SAME_CASE(R8G8B8A8, Srgb);

            FORMAT_CASE(R8G8B8X8, R8G8B8A8, Unorm);
            FORMAT_CASE(R8G8B8X8, R8G8B8A8, Snorm);
            FORMAT_CASE(R8G8B8X8, R8G8B8A8, Srgb);

            FORMAT_SAME_CASE(B8G8R8A8, Unorm);
            FORMAT_SAME_CASE(B8G8R8A8, Srgb);

            FORMAT_SAME_CASE(A2B10G10R10, Unorm);

            // 64-bit and 128-bit formats
            FORMAT_SAME_CASE(R32G32, Float);

            FORMAT_SAME_CASE(R16G16B16A16, Float);

            FORMAT_CASE(R16G16B16X16, R16G16B16A16, Unorm);
            FORMAT_CASE(R16G16B16X16, R16G16B16A16, Snorm);
            FORMAT_CASE(R16G16B16X16, R16G16B16A16, Float);

            FORMAT_SAME_CASE(R32G32B32A32, Float);

            FORMAT_CASE(R32G32B32X32, R32G32B32A32, Float);

            default:
                throw exception("Cannot translate the supplied surface format: 0x{:X}", static_cast<u32>(fermiFormat));
        }

        #undef FORMAT_CASE
        #undef FORMAT_SAME_CASE
    };

    // Every surface is exposed as a single-layer 2D texture
    GuestTexture guest{};
    guest.format = toTextureFormat(surface.format);
    guest.aspect = guest.format->vkAspect;
    guest.viewType = vk::ImageViewType::e2D;
    guest.baseArrayLayer = 0;
    guest.layerCount = 1;

    if (surface.memoryLayout != MemoryLayout::Pitch) {
        // Block layouts take their extent directly from the surface registers
        guest.dimensions = gpu::texture::Dimensions{surface.width, surface.height, surface.depth};
        guest.tileConfig = gpu::texture::TileConfig{
            .mode = gpu::texture::TileMode::Block,
            .blockHeight = surface.blockSize.Height(),
            .blockDepth = surface.blockSize.Depth(),
        };
    } else {
        // Pitch layouts derive their width from the stride and are always one slice deep
        guest.dimensions = gpu::texture::Dimensions{surface.stride / guest.format->bpb, surface.height, 1};
        guest.tileConfig = gpu::texture::TileConfig{
            .mode = gpu::texture::TileMode::Pitch,
            .pitch = surface.stride
        };
    }

    // Translate the surface's GPU virtual address range into the mappings backing the texture
    IOVA baseAddress{surface.address};
    auto translated{channelCtx.asCtx->gmmu.TranslateRange(baseAddress, guest.GetSize())};
    guest.mappings.assign(translated.begin(), translated.end());

    return guest;
}
// Stores references to the GPU, channel context and command executor; none of them are owned by this object
Fermi2D::Fermi2D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor)
    : gpu(gpu),
      channelCtx(channelCtx),
      executor(executor) {}
void Fermi2D::Blit(const Surface &srcSurface, const Surface &dstSurface, float srcRectX, float srcRectY, u32 dstRectWidth, u32 dstRectHeight, u32 dstRectX, u32 dstRectY, float duDx, float dvDy, bool resolve, bool bilinearFilter) {
// TODO: When we support MSAA perform a resolve operation rather than blit when the `resolve` flag is set.
auto srcGuestTexture{GetGuestTexture(srcSurface)};
auto dstGuestTexture{GetGuestTexture(dstSurface)};
auto &textureManager{executor.AcquireTextureManager()};
auto srcTextureView{textureManager.FindOrCreate(srcGuestTexture, executor.tag)};
executor.AttachTexture(srcTextureView.get());
auto dstTextureView{textureManager.FindOrCreate(dstGuestTexture, executor.tag)};
executor.AttachTexture(dstTextureView.get());
gpu.helperShaders.blitHelperShader.Blit(
gpu,
{
.width = duDx * dstRectWidth,
.height = dvDy * dstRectHeight,
.x = srcRectX,
.y = srcRectY,
},
{
.width = static_cast<float>(dstRectWidth),
.height = static_cast<float>(dstRectHeight),
.x = static_cast<float>(dstRectX),
.y = static_cast<float>(dstRectY),
},
srcGuestTexture.dimensions, dstGuestTexture.dimensions,
duDx, dvDy,
bilinearFilter,
srcTextureView.get(), dstTextureView.get(),
[=](auto &&executionCallback) {
auto dst{dstTextureView.get()};
executor.AddSubpass(std::move(executionCallback), {{static_cast<i32>(dstRectX), static_cast<i32>(dstRectY)}, {dstRectWidth, dstRectHeight} }, {}, {dst});
}
);
}
}

View File

@ -0,0 +1,41 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <gpu/texture/texture.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/engines/fermi/types.h>
// Forward declarations: this header only holds references to these types so their full definitions aren't required here
namespace skyline::gpu {
class GPU;
}
namespace skyline::soc::gm20b {
struct ChannelContext;
}
namespace skyline::gpu::interconnect {
class CommandExecutor; // Forward declaration, only a reference to the executor is stored below
/**
* @brief Handles translating Fermi 2D engine blit operations to Vulkan
* @note Blits are performed via the GPU's blit helper shader and recorded as a subpass on the command executor
*/
class Fermi2D {
private:
using IOVA = soc::gm20b::IOVA;
using Surface = skyline::soc::gm20b::engine::fermi2d::type::Surface;
GPU &gpu; //!< Provides access to the blit helper shader
soc::gm20b::ChannelContext &channelCtx; //!< Used to translate surface addresses through the channel's GMMU
gpu::interconnect::CommandExecutor &executor; //!< Used to look up and attach textures and to add the blit subpass
/**
* @brief Builds the guest texture description (format, dimensions, tiling and memory mappings) for a Fermi 2D surface
* @return A single-layer 2D guest texture covering the surface
* @throws exception If the surface format cannot be translated
*/
gpu::GuestTexture GetGuestTexture(const Surface &surface);
public:
Fermi2D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor);
/**
* @brief Blits from the source surface to a rectangle of the destination surface
* @param srcRectX/srcRectY The (potentially fractional) origin of the source rectangle
* @param dstRectWidth/dstRectHeight The extent of the destination rectangle
* @param dstRectX/dstRectY The origin of the destination rectangle
* @param duDx/dvDy The horizontal/vertical scale factors, the source rectangle extent is the destination extent multiplied by these
* @param resolve Currently unused by the implementation (MSAA resolves are a TODO there)
* @param bilinearFilter Forwarded to the blit helper shader to select bilinear filtering
*/
void Blit(const Surface &srcSurface, const Surface &dstSurface, float srcRectX, float srcRectY, u32 dstRectWidth, u32 dstRectHeight, u32 dstRectX, u32 dstRectY, float duDx, float dvDy, bool resolve, bool bilinearFilter);
};
}

View File

@ -10,7 +10,7 @@ namespace skyline::soc::gm20b {
: asCtx(std::move(pAsCtx)),
executor(state),
maxwell3D(std::make_unique<engine::maxwell3d::Maxwell3D>(state, *this, macroState, executor)),
fermi2D(std::make_unique<engine::fermi2d::Fermi2D>(state, *this, macroState, executor)),
fermi2D(state, *this, macroState, executor),
maxwellDma(state, *this, executor),
keplerCompute(state, *this),
inline2Memory(*this),

View File

@ -6,6 +6,7 @@
#include <gpu/interconnect/command_executor.h>
#include "macro/macro_state.h"
#include "engines/engine.h"
#include "engines/fermi_2d.h"
#include "engines/maxwell_dma.h"
#include "engines/kepler_compute.h"
#include "engines/inline2memory.h"
@ -16,10 +17,6 @@ namespace skyline::soc::gm20b {
class Maxwell3D;
}
namespace engine::fermi2d {
class Fermi2D;
}
struct AddressSpaceContext;
/**
@ -31,7 +28,7 @@ namespace skyline::soc::gm20b {
gpu::interconnect::CommandExecutor executor;
MacroState macroState;
std::unique_ptr<engine::maxwell3d::Maxwell3D> maxwell3D; //!< TODO: fix this once graphics context is moved into a cpp file
std::unique_ptr<engine::fermi2d::Fermi2D> fermi2D;
engine::fermi2d::Fermi2D fermi2D;
engine::MaxwellDma maxwellDma;
engine::KeplerCompute keplerCompute;
engine::Inline2Memory inline2Memory;

View File

@ -9,7 +9,7 @@ namespace skyline::soc::gm20b::engine::fermi2d {
Fermi2D::Fermi2D(const DeviceState &state, ChannelContext &channelCtx, MacroState &macroState, gpu::interconnect::CommandExecutor &executor)
: MacroEngineBase(macroState),
syncpoints(state.soc->host1x.syncpoints),
context(*state.gpu, channelCtx, executor),
interconnect(*state.gpu, channelCtx, executor),
channelCtx(channelCtx) {}
void Fermi2D::HandleMethod(u32 method, u32 argument) {
@ -28,14 +28,18 @@ namespace skyline::soc::gm20b::engine::fermi2d {
if (pixelsFromMemory.safeOverlap)
Logger::Warn("Safe overlap is unimplemented!");
auto fixedToFloating{[](i64 value) {
constexpr u32 FractionalComponentSize{32};
// The 2D engine supports subpixel blit precision in the lower 32 bits of the src{X,Y}0 registers for filtering, we can safely ignore this in most cases though since the host driver will handle this in its own way
i32 srcX{static_cast<i32>(pixelsFromMemory.srcX0 >> FractionalComponentSize)};
i32 srcY{static_cast<i32>(pixelsFromMemory.srcY0 >> FractionalComponentSize)};
return static_cast<float>(value) / (1ULL << FractionalComponentSize);
}};
i32 srcWidth{static_cast<i32>((pixelsFromMemory.duDx * pixelsFromMemory.dstWidth) >> FractionalComponentSize)};
i32 srcHeight{static_cast<i32>((pixelsFromMemory.dvDy * pixelsFromMemory.dstHeight) >> FractionalComponentSize)};
// The 2D engine supports subpixel blit precision in the lower 32 bits of the src{X,Y}0 registers for filtering, we can safely ignore this in most cases though since the host driver will handle this in its own way
float srcX{fixedToFloating(pixelsFromMemory.srcX)};
float srcY{fixedToFloating(pixelsFromMemory.srcY)};
float duDx{fixedToFloating(pixelsFromMemory.duDx)};
float dvDy{fixedToFloating(pixelsFromMemory.dvDy)};
if (registers.pixelsFromMemory->sampleMode.origin == Registers::PixelsFromMemory::SampleModeOrigin::Center) {
// This is an MSAA resolve operation, sampling from the center of each pixel in order to resolve the final image from the MSAA samples
@ -48,21 +52,15 @@ namespace skyline::soc::gm20b::engine::fermi2d {
/* 123
456 */
// Since we don't implement MSAA, any image that is supposed to have MSAA applied when drawing is just stored in the corner without any pixel scaling, so adjust width/height appropriately
srcWidth = pixelsFromMemory.dstWidth;
srcHeight = pixelsFromMemory.dstHeight;
} else {
// This is a regular blit operation, scaling from one image to another
// https://github.com/Ryujinx/Ryujinx/blob/c9c65af59edea05e7206a076cb818128c004384e/Ryujinx.Graphics.Gpu/Engine/Twod/TwodClass.cs#L253
srcX -= (pixelsFromMemory.duDx >> FractionalComponentSize) >> 1;
srcY -= (pixelsFromMemory.dvDy >> FractionalComponentSize) >> 1;
// Since we don't implement MSAA, we can avoid any scaling at all by setting using a scale factor of 1
duDx = dvDy = 1.0f;
}
context.Blit(src, dst,
interconnect.Blit(src, dst,
srcX, srcY,
srcWidth, srcHeight,
pixelsFromMemory.dstX0, pixelsFromMemory.dstY0,
pixelsFromMemory.dstWidth, pixelsFromMemory.dstHeight,
pixelsFromMemory.dstX0, pixelsFromMemory.dstY0,
duDx, dvDy,
registers.pixelsFromMemory->sampleMode.origin == Registers::PixelsFromMemory::SampleModeOrigin::Center,
pixelsFromMemory.sampleMode.filter == Registers::PixelsFromMemory::SampleModeFilter::Bilinear);
}

View File

@ -4,7 +4,7 @@
#pragma once
#include <gpu/interconnect/blit_context.h>
#include <gpu/interconnect/fermi_2d.h>
#include "engine.h"
namespace skyline::soc::gm20b {
@ -18,7 +18,7 @@ namespace skyline::soc::gm20b::engine::fermi2d {
class Fermi2D : public MacroEngineBase {
private:
host1x::SyncpointSet &syncpoints;
gpu::interconnect::BlitContext context;
gpu::interconnect::Fermi2D interconnect;
ChannelContext &channelCtx;
/**
@ -71,15 +71,15 @@ namespace skyline::soc::gm20b::engine::fermi2d {
u32 _pad3_[8];
i32 dstX0;
i32 dstY0;
i32 dstWidth;
i32 dstHeight;
u32 dstX0;
u32 dstY0;
u32 dstWidth;
u32 dstHeight;
i64 duDx;
i64 dvDy;
i64 srcX0;
i64 srcX;
union {
i64 srcY0;
i64 srcY;
struct {
u32 _pad4_;
u32 trigger;

View File

@ -98,7 +98,7 @@ namespace skyline::soc::gm20b {
channelCtx.maxwell3D->HandleMacroCall(method - engine::EngineMethodsEnd, argument, lastCall);
break;
case SubchannelId::TwoD:
channelCtx.fermi2D->HandleMacroCall(method - engine::EngineMethodsEnd, argument, lastCall);
channelCtx.fermi2D.HandleMacroCall(method - engine::EngineMethodsEnd, argument, lastCall);
break;
default:
Logger::Warn("Called method 0x{:X} out of bounds for engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);
@ -121,7 +121,7 @@ namespace skyline::soc::gm20b {
case SubchannelId::Copy:
channelCtx.maxwellDma.CallMethod(method, argument);
case SubchannelId::TwoD:
channelCtx.fermi2D->CallMethod(method, argument);
channelCtx.fermi2D.CallMethod(method, argument);
break;
default:
Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);