Implement the Fermi 2D blitting engine

The Fermi 2D engine implements both image blit and resolve operations, supporting subpixel sampling with both linear and point filtering.

Resolve operations are performed by sampling from the center of each pixel in order to resolve the final image from the MSAA samples.
MSAA images are stored in memory like regular images, but each pixel's dimensions are scaled, e.g. for 2x2 MSAA:
```
112233
112233
445566
445566
```
These would be sampled with both duDx and dvDy as 2 (integer part), resolving to the following:
```
123
456
```
Blit operations are performed by sampling from the corner of each pixel, scaling the image as one would expect.

This implementation isn't fully complete as Vulkan blit doesn't support some combinations which Fermi does, most notably between colour and depth stencil. These will be implemented properly at a later date, likely after the texture manager rework.
Out-of-bounds blit, used by some OpenGL games, is also missing since supporting it requires texture aliasing; this will also be supported after the texture manager rework.

Co-authored-by: Billy Laws <blaws05@gmail.com>
This commit is contained in:
Robin Kertels 2022-02-07 14:15:55 +01:00 committed by Billy Laws
parent be2546138d
commit 0a3cf25823
10 changed files with 455 additions and 3 deletions

View File

@ -192,6 +192,7 @@ add_library(skyline SHARED
${source_DIR}/skyline/soc/gm20b/engines/kepler_compute.cpp
${source_DIR}/skyline/soc/gm20b/engines/maxwell_dma.cpp
${source_DIR}/skyline/soc/gm20b/engines/maxwell/initialization.cpp
${source_DIR}/skyline/soc/gm20b/engines/fermi_2d.cpp
${source_DIR}/skyline/input/npad.cpp
${source_DIR}/skyline/input/npad_device.cpp
${source_DIR}/skyline/input/touch.cpp

View File

@ -0,0 +1,157 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <gpu/texture/format.h>
#include <gpu/texture/texture.h>
#include <gpu/texture_manager.h>
#include <gpu/buffer.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include <soc/gm20b/engines/fermi/types.h>
namespace skyline::gpu::interconnect {
using IOVA = soc::gm20b::IOVA;
namespace fermi2d = skyline::soc::gm20b::engine::fermi2d::type;
/**
 * @brief Handles translating Fermi 2D engine blit operations to Vulkan
 */
class BlitContext {
  private:
    GPU &gpu;
    soc::gm20b::ChannelContext &channelCtx;
    gpu::interconnect::CommandExecutor &executor;

    /**
     * @brief Builds the guest texture description corresponding to a Fermi 2D surface
     * @param surface The surface register state (format, layout, dimensions and address) to describe
     * @return A single-layer 2D guest texture with its format, dimensions, tile config and GMMU mappings filled in
     * @throws exception If the surface format has no translation to a Skyline format
     */
    gpu::GuestTexture GetGuestTexture(const fermi2d::Surface &surface) {
        // Maps a Fermi 2D surface format onto the matching Skyline format, the X (unused) channel variants are mapped onto their A (alpha) equivalents
        auto determineFormat = [&](fermi2d::Surface::SurfaceFormat format) -> skyline::gpu::texture::Format {
            #define FORMAT_CASE(fermiFmt, skFmt, fmtType) \
                case fermi2d::Surface::SurfaceFormat::fermiFmt ## fmtType: \
                    return skyline::gpu::format::skFmt ## fmtType

            #define FORMAT_SAME_CASE(fmt, type) FORMAT_CASE(fmt, fmt, type)

            #define FORMAT_NORM_CASE(fermiFmt, skFmt) \
                FORMAT_CASE(fermiFmt, skFmt, Snorm); \
                FORMAT_CASE(fermiFmt, skFmt, Unorm)

            #define FORMAT_SAME_NORM_CASE(fmt) FORMAT_NORM_CASE(fmt, fmt)

            #define FORMAT_NORM_FLOAT_CASE(fermiFmt, skFmt) \
                FORMAT_NORM_CASE(fermiFmt, skFmt); \
                FORMAT_CASE(fermiFmt, skFmt, Float)

            #define FORMAT_SAME_NORM_FLOAT_CASE(fmt) FORMAT_NORM_FLOAT_CASE(fmt, fmt)

            switch (format) {
                FORMAT_SAME_NORM_CASE(R8);
                FORMAT_SAME_NORM_FLOAT_CASE(R16);
                FORMAT_SAME_NORM_CASE(R8G8);
                FORMAT_SAME_CASE(B5G6R5, Unorm);
                FORMAT_SAME_CASE(B5G5R5A1, Unorm);
                FORMAT_SAME_CASE(R32, Float);
                FORMAT_SAME_CASE(B10G11R11, Float);
                FORMAT_SAME_NORM_FLOAT_CASE(R16G16);
                FORMAT_SAME_CASE(R8G8B8A8, Unorm);
                FORMAT_SAME_CASE(R8G8B8A8, Srgb);
                FORMAT_NORM_CASE(R8G8B8X8, R8G8B8A8);
                FORMAT_CASE(R8G8B8X8, R8G8B8A8, Srgb);
                FORMAT_SAME_CASE(B8G8R8A8, Unorm);
                FORMAT_SAME_CASE(B8G8R8A8, Srgb);
                FORMAT_SAME_CASE(A2B10G10R10, Unorm);
                FORMAT_SAME_CASE(R32G32, Float);
                FORMAT_SAME_CASE(R16G16B16A16, Float);
                FORMAT_NORM_FLOAT_CASE(R16G16B16X16, R16G16B16A16);
                FORMAT_SAME_CASE(R32G32B32A32, Float);
                FORMAT_CASE(R32G32B32X32, R32G32B32A32, Float);

                default:
                    throw exception("Cannot translate the supplied surface format: 0x{:X}", static_cast<u32>(format));
            }

            #undef FORMAT_CASE
            #undef FORMAT_SAME_CASE
            #undef FORMAT_NORM_CASE
            #undef FORMAT_SAME_NORM_CASE
            #undef FORMAT_NORM_FLOAT_CASE
            #undef FORMAT_SAME_NORM_FLOAT_CASE
        };

        GuestTexture texture{};

        texture.format = determineFormat(surface.format);
        texture.aspect = texture.format->vkAspect;
        texture.baseArrayLayer = 0;
        texture.layerCount = 1;

        if (surface.memoryLayout == fermi2d::MemoryLayout::Pitch) {
            texture.type = gpu::texture::TextureType::e2D;
            // Pitch surfaces derive their width from the stride rather than from the width register
            texture.dimensions = gpu::texture::Dimensions{surface.stride / texture.format->bpb, surface.height, 1};
            texture.tileConfig = gpu::texture::TileConfig{
                .mode = gpu::texture::TileMode::Pitch,
                .pitch = surface.stride
            };
        } else {
            texture.type = gpu::texture::TextureType::e2D;
            texture.dimensions = gpu::texture::Dimensions{surface.width, surface.height, surface.depth};
            texture.tileConfig = gpu::texture::TileConfig{
                .mode = gpu::texture::TileMode::Block,
                .blockHeight = surface.blockSize.Height(),
                .blockDepth = surface.blockSize.Depth(),
            };
        }

        // Translate the guest GPU virtual address range backing the surface into host mappings
        IOVA iova{surface.address};
        size_t size{texture.GetLayerStride() * (texture.layerCount - texture.baseArrayLayer)};
        auto mappings{channelCtx.asCtx->gmmu.TranslateRange(iova, size)};
        texture.mappings.assign(mappings.begin(), mappings.end());

        return texture;
    }

  public:
    BlitContext(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor) : gpu(gpu), channelCtx(channelCtx), executor(executor) {}

    /**
     * @brief Records a Vulkan image blit from a region of the source surface to a region of the destination surface
     * @param srcSurface The surface to read from
     * @param dstSurface The surface to write to
     * @param srcX The X coordinate of the source region's origin
     * @param srcY The Y coordinate of the source region's origin
     * @param srcWidth The width of the source region
     * @param srcHeight The height of the source region
     * @param dstX The X coordinate of the destination region's origin
     * @param dstY The Y coordinate of the destination region's origin
     * @param dstWidth The width of the destination region
     * @param dstHeight The height of the destination region
     * @param resolve If the operation is an MSAA resolve, this is currently unused and a regular blit is always performed
     * @param linearFilter If linear filtering should be used rather than nearest filtering
     */
    void Blit(const fermi2d::Surface &srcSurface, const fermi2d::Surface &dstSurface, i32 srcX, i32 srcY, i32 srcWidth, i32 srcHeight, i32 dstX, i32 dstY, i32 dstWidth, i32 dstHeight, bool resolve, bool linearFilter) {
        // TODO: OOB blit: https://github.com/Ryujinx/Ryujinx/blob/master/Ryujinx.Graphics.Gpu/Engine/Twod/TwodClass.cs#L287
        // TODO: When we support MSAA perform a resolve operation rather than blit when the `resolve` flag is set.
        auto srcGuestTexture{GetGuestTexture(srcSurface)};
        auto dstGuestTexture{GetGuestTexture(dstSurface)};

        auto srcTextureView{gpu.texture.FindOrCreate(srcGuestTexture)};
        auto dstTextureView{gpu.texture.FindOrCreate(dstGuestTexture)};

        executor.AttachTexture(&*srcTextureView);
        executor.AttachTexture(&*dstTextureView);

        // Designators are listed in the declaration order of VkImageSubresourceLayers as ISO C++20 requires
        auto getSubresourceLayers{[](const vk::ImageSubresourceRange &range, vk::ImageAspectFlags aspect) {
            return vk::ImageSubresourceLayers{
                .aspectMask = aspect,
                .mipLevel = 0, // Blit engine only does one layer/mip level at a time
                .baseArrayLayer = range.baseArrayLayer,
                .layerCount = 1
            };
        }};

        vk::ImageBlit region{
            .srcSubresource = getSubresourceLayers(srcTextureView->range, srcTextureView->format->vkAspect),
            // Each subresource uses the aspect of its own view's format, Vulkan additionally requires both aspect masks to match for a blit
            .dstSubresource = getSubresourceLayers(dstTextureView->range, dstTextureView->format->vkAspect),
            .srcOffsets = {{vk::Offset3D{srcX, srcY, 0}, vk::Offset3D{srcX + srcWidth, srcY + srcHeight, 1}}},
            .dstOffsets = {{vk::Offset3D{dstX, dstY, 0}, vk::Offset3D{dstX + dstWidth, dstY + dstHeight, 1}}}
        };

        // The views are captured by value so that they are kept alive until the command has been recorded
        executor.AddOutsideRpCommand([region, srcTextureView, dstTextureView, linearFilter](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &) {
            // NOTE(review): if both views can resolve to the same underlying lockable this would lock it twice - confirm that source and destination never alias
            std::scoped_lock lock{*srcTextureView, *dstTextureView};
            auto blitSrcImage{srcTextureView->texture->GetBacking()};
            auto blitDstImage{dstTextureView->texture->GetBacking()};

            commandBuffer.blitImage(blitSrcImage, vk::ImageLayout::eGeneral,
                                    blitDstImage, vk::ImageLayout::eGeneral,
                                    region,
                                    linearFilter ? vk::Filter::eLinear : vk::Filter::eNearest);
        });
    }
};
}

View File

@ -5,6 +5,7 @@
#include <services/common/fence.h>
#include <soc/gm20b/engines/maxwell_3d.h> // TODO: remove
#include <soc/gm20b/engines/fermi_2d.h> // TODO: remove
#include <soc/gm20b/channel.h>
#include <services/nvdrv/devices/nvdevice.h>
#include "as_gpu.h"

View File

@ -2,6 +2,7 @@
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include "engines/maxwell_3d.h" //TODO: remove
#include "engines/fermi_2d.h" //TODO: remove
#include "channel.h"
namespace skyline::soc::gm20b {
@ -9,6 +10,7 @@ namespace skyline::soc::gm20b {
: asCtx(std::move(pAsCtx)),
executor(state),
maxwell3D(std::make_unique<engine::maxwell3d::Maxwell3D>(state, *this, macroState, executor)),
fermi2D(std::make_unique<engine::fermi2d::Fermi2D>(state, *this, macroState, executor)),
maxwellDma(state, *this, executor),
keplerCompute(state, *this),
inline2Memory(asCtx),

View File

@ -16,6 +16,10 @@ namespace skyline::soc::gm20b {
class Maxwell3D;
}
namespace engine::fermi2d {
class Fermi2D;
}
struct AddressSpaceContext;
/**
@ -27,6 +31,7 @@ namespace skyline::soc::gm20b {
gpu::interconnect::CommandExecutor executor;
MacroState macroState;
std::unique_ptr<engine::maxwell3d::Maxwell3D> maxwell3D; //!< TODO: fix this once graphics context is moved into a cpp file
std::unique_ptr<engine::fermi2d::Fermi2D> fermi2D;
engine::MaxwellDma maxwellDma;
engine::KeplerCompute keplerCompute;
engine::Inline2Memory inline2Memory;

View File

@ -41,7 +41,7 @@ namespace skyline::soc::gm20b::engine {
u32 high;
u32 low;
operator u64() {
operator u64() const {
return (static_cast<u64>(high) << 32) | low;
}
};

View File

@ -0,0 +1,93 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
// Copyright © 2018-2020 fincs (https://github.com/devkitPro/deko3d)
#pragma once
#include <common.h>
#include <soc/gm20b/engines/engine.h>
namespace skyline::soc::gm20b::engine::fermi2d::type {
#pragma pack(push, 1)
/**
 * @brief The memory layout selector stored in a 2D surface's register state
 */
enum class MemoryLayout {
BlockLinear = 0,
// Surfaces with this layout are described by their stride rather than by block dimensions when translated for blits
Pitch = 1
};
/**
 * @brief The register-level description of a Fermi 2D surface: its format, memory layout, block size, dimensions and address
 * @note This is declared inside a `#pragma pack(push, 1)` region so that it can be overlaid directly onto the engine's register file, the order and size of the fields must not be changed
 */
struct Surface {
// The surface format identifiers as they are written to the format register
enum class SurfaceFormat {
Y1_8X8 = 0x1C,
AY8 = 0x1D,
R32G32B32A32Float = 0xC0,
R32G32B32X32Float = 0xC3,
R16G16B16X16Unorm = 0xC6,
R16G16B16X16Snorm = 0xC7,
R16G16B16A16Float = 0xCA,
R32G32Float = 0xCB,
R16G16B16X16Float = 0xCE,
B8G8R8A8Unorm = 0xCF,
B8G8R8A8Srgb = 0xD0,
A2B10G10R10Unorm = 0xD1,
R8G8B8A8Unorm = 0xD5,
R8G8B8A8Srgb = 0xD6,
R8G8B8X8Snorm = 0xD7,
R16G16Unorm = 0xDA,
R16G16Snorm = 0xDB,
R16G16Float = 0xDE,
A2R10G10B10 = 0xDF,
B10G11R11Float = 0xE0,
R32Float = 0xE5,
B8G8R8X8Unorm = 0xE6,
B8G8R8X8Srgb = 0xE7,
B5G6R5Unorm = 0xE8,
B5G5R5A1Unorm = 0xE9,
R8G8Unorm = 0xEA,
R8G8Snorm = 0xEB,
R16Unorm = 0xEE,
R16Snorm = 0xEF,
R16Float = 0xF2,
R8Unorm = 0xF3,
R8Snorm = 0xF4,
A8 = 0xF7,
B5G5R5X1Unorm = 0xF8,
R8G8B8X8Unorm = 0xF9,
R8G8B8X8Srgb = 0xFA,
Z1R5G5B5 = 0xFB,
O1R5G5B5 = 0xFC,
Z8R8G8B8 = 0xFD,
O8R8G8B8 = 0xFE,
Y32 = 0xFF
} format;
MemoryLayout memoryLayout;
// The block dimensions, each stored as a log2 value in a 4-bit field with the remaining 20 bits being padding
struct {
u8 widthLog2 : 4;
u8 heightLog2 : 4;
u8 depthLog2 : 4;
u32 _pad_ : 20;
// Each accessor returns 1 << log2 truncated to a u8
// NOTE(review): a log2 value of 8 or above does not fit in a u8 and would truncate to 0 - confirm such values never occur
u8 Width() const {
return static_cast<u8>(1 << widthLog2);
}
u8 Height() const {
return static_cast<u8>(1 << heightLog2);
}
u8 Depth() const {
return static_cast<u8>(1 << depthLog2);
}
} blockSize;
u32 depth;
u32 layer;
u32 stride; // NOTE(review): presumably the row pitch in bytes - confirm against the users of this field
u32 width;
u32 height;
Address address;
};
#pragma pack(pop)
}

View File

@ -0,0 +1,84 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <soc.h>
#include "fermi_2d.h"
namespace skyline::soc::gm20b::engine::fermi2d {
/**
 * @brief Constructs the engine, forwarding the macro state to MacroEngineBase and creating the blit context from the device's GPU, the channel context and the command executor
 */
Fermi2D::Fermi2D(const DeviceState &state, ChannelContext &channelCtx, MacroState &macroState, gpu::interconnect::CommandExecutor &executor)
: MacroEngineBase(macroState),
syncpoints(state.soc->host1x.syncpoints),
context(*state.gpu, channelCtx, executor),
channelCtx(channelCtx) {}
/**
 * @brief Writes the argument into the register file and submits a blit when the written register is the PixelsFromMemory trigger
 * @param method The index of the register being written
 * @param argument The value to write into the register
 */
void Fermi2D::HandleMethod(u32 method, u32 argument) {
    // NOTE(review): `method` indexes the register array without a bounds check - confirm that all callers guarantee it is below RegisterCount
    registers.raw[method] = argument;

    // Only a write to the trigger register kicks off an operation, every other write just updates state
    if (method != ENGINE_STRUCT_OFFSET(pixelsFromMemory, trigger))
        return;

    // Example user code for this method: https://github.com/devkitPro/deko3d/blob/8ee30005cf6d24d081800ee3820810290fffbb09/source/dk_image.cpp#L513
    auto &src{*registers.src};
    auto &dst{*registers.dst};
    auto &pixelsFromMemory{*registers.pixelsFromMemory};

    if (src.layer != 0 || dst.layer != 0)
        Logger::Warn("Blits between layers are unimplemented!");

    if (pixelsFromMemory.safeOverlap)
        Logger::Warn("Safe overlap is unimplemented!");

    constexpr u32 FractionalComponentSize{32};

    const bool isResolve{pixelsFromMemory.sampleMode.origin == Registers::PixelsFromMemory::SampleModeOrigin::Center};
    const bool isBilinear{pixelsFromMemory.sampleMode.filter == Registers::PixelsFromMemory::SampleModeFilter::Bilinear};

    // The 2D engine supports subpixel blit precision in the lower 32 bits of the src{X,Y}0 registers for filtering, we can safely ignore this in most cases though since the host driver will handle this in its own way
    i32 srcX{static_cast<i32>(pixelsFromMemory.srcX0 >> FractionalComponentSize)};
    i32 srcY{static_cast<i32>(pixelsFromMemory.srcY0 >> FractionalComponentSize)};

    i32 srcWidth{};
    i32 srcHeight{};

    if (isResolve) {
        // This is an MSAA resolve operation, sampling from the center of each pixel in order to resolve the final image from the MSAA samples
        // MSAA images are stored in memory like regular images but each pixel's dimensions are scaled: e.g. for 2x2 MSAA
        /* 112233
           112233
           445566
           445566 */
        // These would be sampled with both duDx and dvDy as 2 (integer part), resolving to the following:
        /* 123
           456 */

        // Since we don't implement MSAA, any image that is supposed to have MSAA applied when drawing is just stored in the corner without any pixel scaling, so the source region has the same extent as the destination
        srcWidth = pixelsFromMemory.dstWidth;
        srcHeight = pixelsFromMemory.dstHeight;
    } else {
        // This is a regular blit operation, scaling from one image to another, so the source extent is the destination extent multiplied by the per-pixel step
        srcWidth = static_cast<i32>((pixelsFromMemory.duDx * pixelsFromMemory.dstWidth) >> FractionalComponentSize);
        srcHeight = static_cast<i32>((pixelsFromMemory.dvDy * pixelsFromMemory.dstHeight) >> FractionalComponentSize);

        // https://github.com/Ryujinx/Ryujinx/blob/c9c65af59edea05e7206a076cb818128c004384e/Ryujinx.Graphics.Gpu/Engine/Twod/TwodClass.cs#L253
        srcX -= (pixelsFromMemory.duDx >> FractionalComponentSize) >> 1;
        srcY -= (pixelsFromMemory.dvDy >> FractionalComponentSize) >> 1;
    }

    context.Blit(src, dst,
                 srcX, srcY,
                 srcWidth, srcHeight,
                 pixelsFromMemory.dstX0, pixelsFromMemory.dstY0,
                 pixelsFromMemory.dstWidth, pixelsFromMemory.dstHeight,
                 isResolve,
                 isBilinear);
}
/**
 * @brief Handles a method call issued by a macro by forwarding it to HandleMethod unchanged
 */
void Fermi2D::CallMethodFromMacro(u32 method, u32 argument) {
HandleMethod(method, argument);
}
/**
 * @brief Reads back a register on behalf of a macro
 * @return The raw value currently stored in the register at the supplied index
 */
u32 Fermi2D::ReadMethodFromMacro(u32 method) {
// NOTE(review): the index is not bounds checked here - confirm the caller guarantees it is below RegisterCount
return registers.raw[method];
}
/**
 * @brief Logs the method and its argument at verbose level and then handles the method
 */
__attribute__((always_inline)) void Fermi2D::CallMethod(u32 method, u32 argument) {
Logger::Verbose("Called method in Fermi 2D: 0x{:X} args: 0x{:X}", method, argument);
HandleMethod(method, argument);
}
}

View File

@ -0,0 +1,107 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
// Copyright © 2018-2020 fincs (https://github.com/devkitPro/deko3d)
#pragma once
#include <gpu/interconnect/blit_context.h>
#include "engine.h"
namespace skyline::soc::gm20b {
struct ChannelContext;
}
namespace skyline::soc::gm20b::engine::fermi2d {
/**
 * @brief The Fermi 2D engine handles performing blit and resolve operations
 */
class Fermi2D : public MacroEngineBase {
private:
host1x::SyncpointSet &syncpoints;
gpu::interconnect::BlitContext context; //!< Performs the blits that are triggered through this engine's registers
ChannelContext &channelCtx;
/**
* @brief Calls the appropriate function corresponding to a certain method with the supplied argument
*/
void HandleMethod(u32 method, u32 argument);
public:
static constexpr u32 RegisterCount{0xE00}; //!< The number of Fermi 2D registers
/**
* @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_2d.def
*/
#pragma pack(push, 1)
// The register file, accessible either as a raw array of 32-bit words or through typed members placed at fixed word offsets
union Registers {
std::array<u32, RegisterCount> raw;
template<size_t Offset, typename Type>
using Register = util::OffsetMember<Offset, Type, u32>;
// The state controlling a memory to memory pixel transfer, the order and size of the fields must not be changed as they are overlaid onto the raw registers
struct PixelsFromMemory {
enum class BlockShapeV : u8 {
Auto = 0,
Shape8x8 = 1,
Shape16x4 = 2
};
enum class SampleModeOrigin : u8 {
Center = 0,
Corner = 1
};
enum class SampleModeFilter : u8 {
Point = 0,
Bilinear = 1
};
BlockShapeV blockShape : 3;
u32 _pad0_ : 29;
u16 corralSize : 10;
u32 _pad1_ : 22;
bool safeOverlap : 1;
u32 _pad2_ : 31;
// The sampling origin and filter, the engine implementation treats a Center origin as an MSAA resolve and a Corner origin as a regular blit
struct {
SampleModeOrigin origin : 1;
u8 _pad0_ : 3;
SampleModeFilter filter : 1;
u32 _pad1_ : 27;
} sampleMode;
u32 _pad3_[8];
i32 dstX0;
i32 dstY0;
i32 dstWidth;
i32 dstHeight;
// The scale factors and source origin are 64-bit values of which the engine implementation uses the upper 32 bits as the integer part
i64 duDx;
i64 dvDy;
i64 srcX0;
// The trigger register aliases the second 32-bit word of srcY0, the engine implementation starts the transfer when it is written
union {
i64 srcY0;
struct {
u32 _pad4_;
u32 trigger;
};
};
};
Register<0x80, type::Surface> dst;
Register<0x8C, type::Surface> src;
Register<0x220, PixelsFromMemory> pixelsFromMemory;
};
// Ensures that the typed members have not grown the union beyond the raw register array
static_assert(sizeof(Registers) == (RegisterCount * sizeof(u32)));
#pragma pack(pop)
Registers registers{};
Fermi2D(const DeviceState &state, ChannelContext &channelCtx, MacroState &macroState, gpu::interconnect::CommandExecutor &executor);
void CallMethodFromMacro(u32 method, u32 argument) override;
u32 ReadMethodFromMacro(u32 method) override;
/**
* @brief Handles a method call that was submitted directly to the engine rather than through a macro
*/
void CallMethod(u32 method, u32 argument);
};
}

View File

@ -7,6 +7,7 @@
#include <soc.h>
#include <os.h>
#include "engines/maxwell_3d.h"
#include "engines/fermi_2d.h"
namespace skyline::soc::gm20b {
/**
@ -97,8 +98,7 @@ namespace skyline::soc::gm20b {
channelCtx.maxwell3D->HandleMacroCall(method - engine::EngineMethodsEnd, argument, lastCall);
break;
case SubchannelId::TwoD:
// TODO: Fix this when we implement the 2D Engine
Logger::Warn("Calling macros in the 2D engine is unimplemented!");
channelCtx.fermi2D->HandleMacroCall(method - engine::EngineMethodsEnd, argument, lastCall);
break;
default:
Logger::Warn("Called method 0x{:X} out of bounds for engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);
@ -120,6 +120,8 @@ namespace skyline::soc::gm20b {
break;
case SubchannelId::Copy:
    channelCtx.maxwellDma.CallMethod(method, argument);
    break; // Without this break every Copy engine method would fall through and also be written to the 2D engine
case SubchannelId::TwoD:
    channelCtx.fermi2D->CallMethod(method, argument);
    break;
default:
Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);