mirror of
https://github.com/skyline-emu/skyline.git
synced 2025-01-28 20:27:55 +03:00
Commonise maxwell3d guest shader caching code
This commit is contained in:
parent
6f6a312692
commit
23a7f70a8e
@ -0,0 +1,119 @@
|
|||||||
|
// SPDX-License-Identifier: MPL-2.0
|
||||||
|
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/Ryujinx/)
|
||||||
|
// Copyright © 2022 yuzu Team and Contributors (https://github.com/yuzu-emu/)
|
||||||
|
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||||
|
|
||||||
|
#include <nce.h>
|
||||||
|
#include <kernel/memory.h>
|
||||||
|
#include <soc/gm20b/channel.h>
|
||||||
|
#include <soc/gm20b/gmmu.h>
|
||||||
|
#include <gpu.h>
|
||||||
|
#include "shader_cache.h"
|
||||||
|
|
||||||
|
namespace skyline::gpu::interconnect {
|
||||||
|
/* Shader Cache */
|
||||||
|
/**
 * @brief Returns the shader binary located at `programBase + programOffset`, parsing and caching it if it isn't already cached
 * @note Records the supplied base/offset as the last looked up program and takes `trapExecutionLock` if it isn't already held
 */
ShaderBinary ShaderCache::Lookup(InterconnectContext &ctx, u64 programBase, u32 programOffset) {
    lastProgramBase = programBase;
    lastProgramOffset = programOffset;
    auto[blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(programBase + programOffset)};

    if (!trapExecutionLock)
        trapExecutionLock.emplace(trapMutex);

    // The mirror only has to be resolved again when this block isn't covered by the one resolved on the previous call
    bool mirrorStillApplies{mirrorBlock.valid() && mirrorBlock.contains(blockMapping)};
    if (!mirrorStillApplies) {
        if (auto existingIt{mirrorMap.find(blockMapping.data())}; existingIt != mirrorMap.end()) {
            entry = existingIt->second.get();
        } else {
            // No mirror exists for this mapping yet: create one on the host and trap the guest region
            auto insertion{mirrorMap.emplace(blockMapping.data(), std::make_unique<MirrorEntry>(ctx.memory.CreateMirror(blockMapping)))};
            MirrorEntry *createdEntry{insertion.first->second.get()};

            // The entry is allocated before the trap so the last callback can capture a pointer to it
            // NOTE(review): the role of each of the three CreateTrap callbacks isn't visible here — confirm against nce.h
            auto createdTrap{ctx.nce.CreateTrap(blockMapping, [mutex = &trapMutex]() {
                // Only waits for the mutex to become available, nothing is done while holding it
                std::scoped_lock lock{*mutex};
            }, []() { return true; }, [entry = createdEntry, mutex = &trapMutex]() {
                std::unique_lock lock{*mutex, std::try_to_lock};
                if (!lock.owns_lock())
                    return false;

                // Once the threshold has been exceeded the entry is no longer flagged dirty from here
                if (++entry->trapCount <= MirrorEntry::SkipTrapThreshold)
                    entry->dirty = true;

                return true;
            })};

            // Write only trap
            ctx.nce.TrapRegions(createdTrap, true);

            entry = createdEntry;
            entry->trap = createdTrap;
        }

        mirrorBlock = blockMapping;
    }

    // Entries that have trapped too often are instead invalidated whenever the channel sequence number changes
    if (entry->trapCount > MirrorEntry::SkipTrapThreshold) {
        if (entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) {
            entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber;
            entry->dirty = true;
        }
    }

    auto cacheKey{blockMapping.data() + blockOffset};

    if (entry->dirty) {
        // Anything cached for a written-to entry is discarded and the region is retrapped to catch later writes
        entry->cache.clear();
        entry->dirty = false;

        if (entry->trapCount <= MirrorEntry::SkipTrapThreshold)
            ctx.nce.TrapRegions(*entry->trap, true);
    } else {
        auto cachedIt{entry->cache.find(cacheKey)};
        if (cachedIt != entry->cache.end())
            return cachedIt->second;
    }

    // entry->mirror might only contain blockMapping as a subregion rather than mirroring it directly, so the offset is computed explicitly
    span<u8> blockMappingMirror{entry->mirror.data() + (blockMapping.data() - mirrorBlock.data()), blockMapping.size()};

    // Cache miss: determine the extent of the shader by scanning for "BRA $" (Infinite Loop), which is used as padding at the end of the shader
    // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351
    auto extractShader{[](span<u8> mapping) {
        constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F};

        span<u64> instructions{mapping.cast<u64, std::dynamic_extent, true>()};
        auto cursor{instructions.begin()};
        while (cursor != instructions.end()) {
            auto word{*cursor};
            // Matching instructions are rare compared to everything else that gets scanned
            if (word == BraSelf1 || word == BraSelf2) [[unlikely]]
                return span{instructions.begin(), cursor}.cast<u8>();
            cursor++;
        }

        // No terminator was found in the mapping so an empty binary is reported
        return span<u8>{};
    }};

    ShaderBinary binary{};
    binary.binary = extractShader(blockMappingMirror.subspan(blockOffset));
    binary.baseOffset = programOffset;
    binary.hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0);

    entry->cache.insert({cacheKey, binary});

    return binary;
}
|
||||||
|
|
||||||
|
/**
 * @brief Reports whether the result of the last Lookup can no longer be reused for the supplied program
 * @return true if the program differs from the last looked up one, or the current mirror entry is dirty or has a stale channel sequence number
 * @note Takes `trapExecutionLock` if it isn't already held
 */
bool ShaderCache::Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset) {
    if (!trapExecutionLock)
        trapExecutionLock.emplace(trapMutex);

    // A different program address always requires a fresh lookup
    bool sameProgram{programBase == lastProgramBase && programOffset == lastProgramOffset};
    if (!sameProgram)
        return true;

    // Without a resolved entry there is nothing that could have been invalidated
    if (!entry)
        return false;

    // Entries past the trap threshold are invalidated by sequence number changes rather than by the dirty flag alone
    bool sequenceChanged{entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber};
    return sequenceChanged || entry->dirty;
}
|
||||||
|
|
||||||
|
/**
 * @brief Releases the persistently held trap lock, if any
 * @note Cached shader binaries are left untouched, only `trapExecutionLock` is dropped
 */
void ShaderCache::PurgeCaches() {
    // Emptying the optional destroys the contained scoped_lock
    trapExecutionLock = std::nullopt;
}
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
// SPDX-License-Identifier: MPL-2.0
|
||||||
|
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
namespace skyline::gpu::interconnect {
|
||||||
|
class ShaderCache {
|
||||||
|
private:
|
||||||
|
/**
|
||||||
|
* @brief Holds mirror state for a single GPU mapped block
|
||||||
|
*/
|
||||||
|
struct MirrorEntry {
|
||||||
|
span<u8> mirror;
|
||||||
|
tsl::robin_map<u8 *, ShaderBinary> cache;
|
||||||
|
std::optional<nce::NCE::TrapHandle> trap;
|
||||||
|
|
||||||
|
static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fallback to always hashing
|
||||||
|
u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance
|
||||||
|
size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number number used to clear the cache after every access
|
||||||
|
bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared
|
||||||
|
|
||||||
|
MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}
|
||||||
|
};
|
||||||
|
|
||||||
|
tsl::robin_map<u8 *, std::unique_ptr<MirrorEntry>> mirrorMap;
|
||||||
|
std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map
|
||||||
|
std::optional<std::scoped_lock<std::mutex>> trapExecutionLock; //!< Persistently held lock over an execution to avoid frequent relocking
|
||||||
|
MirrorEntry *entry{};
|
||||||
|
span<u8> mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry`
|
||||||
|
u64 lastProgramBase{};
|
||||||
|
u32 lastProgramOffset{};
|
||||||
|
|
||||||
|
public:
|
||||||
|
ShaderBinary Lookup(InterconnectContext &ctx, u64 programBase, u32 programOffset);
|
||||||
|
|
||||||
|
bool Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset);
|
||||||
|
|
||||||
|
void PurgeCaches();
|
||||||
|
};
|
||||||
|
}
|
@ -17,11 +17,6 @@ namespace skyline::gpu {
|
|||||||
}
|
}
|
||||||
|
|
||||||
namespace skyline::gpu::interconnect::maxwell3d {
|
namespace skyline::gpu::interconnect::maxwell3d {
|
||||||
struct ShaderBinary {
|
|
||||||
span<u8> binary;
|
|
||||||
u32 baseOffset;
|
|
||||||
};
|
|
||||||
|
|
||||||
class Pipeline {
|
class Pipeline {
|
||||||
public:
|
public:
|
||||||
struct ShaderStage {
|
struct ShaderStage {
|
||||||
|
@ -255,112 +255,19 @@ namespace skyline::gpu::interconnect::maxwell3d {
|
|||||||
throw exception("Shader type mismatch: {} != {}!", engine->pipeline.shader.type, static_cast<u8>(shaderType));
|
throw exception("Shader type mismatch: {} != {}!", engine->pipeline.shader.type, static_cast<u8>(shaderType));
|
||||||
|
|
||||||
if (!engine->pipeline.shader.enable && shaderType != engine::Pipeline::Shader::Type::Vertex) {
|
if (!engine->pipeline.shader.enable && shaderType != engine::Pipeline::Shader::Type::Vertex) {
|
||||||
hash = 0;
|
binary.hash = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto[blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(engine->programRegion + engine->pipeline.programOffset)};
|
binary = cache.Lookup(ctx, engine->programRegion, engine->pipeline.programOffset);
|
||||||
|
|
||||||
if (!trapExecutionLock)
|
|
||||||
trapExecutionLock.emplace(trapMutex);
|
|
||||||
|
|
||||||
// Skip looking up the mirror if it is the same as the one used for the previous update
|
|
||||||
if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) {
|
|
||||||
auto mirrorIt{mirrorMap.find(blockMapping.data())};
|
|
||||||
if (mirrorIt == mirrorMap.end()) {
|
|
||||||
// Allocate a host mirror for the mapping and trap the guest region
|
|
||||||
auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique<MirrorEntry>(ctx.memory.CreateMirror(blockMapping)))};
|
|
||||||
|
|
||||||
// We need to create the trap after allocating the entry so that we have an `invalid` pointer we can pass in
|
|
||||||
auto trapHandle{ctx.nce.CreateTrap(blockMapping, [mutex = &trapMutex]() {
|
|
||||||
std::scoped_lock lock{*mutex};
|
|
||||||
return;
|
|
||||||
}, []() { return true; }, [entry = newIt.first->second.get(), mutex = &trapMutex]() {
|
|
||||||
std::unique_lock lock{*mutex, std::try_to_lock};
|
|
||||||
if (!lock)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (++entry->trapCount <= MirrorEntry::SkipTrapThreshold)
|
|
||||||
entry->dirty = true;
|
|
||||||
return true;
|
|
||||||
})};
|
|
||||||
|
|
||||||
// Write only trap
|
|
||||||
ctx.nce.TrapRegions(trapHandle, true);
|
|
||||||
|
|
||||||
entry = newIt.first->second.get();
|
|
||||||
entry->trap = trapHandle;
|
|
||||||
} else {
|
|
||||||
entry = mirrorIt->second.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
mirrorBlock = blockMapping;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) {
|
|
||||||
entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber;
|
|
||||||
entry->dirty = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes
|
|
||||||
if (entry->dirty) {
|
|
||||||
entry->cache.clear();
|
|
||||||
entry->dirty = false;
|
|
||||||
|
|
||||||
if (entry->trapCount <= MirrorEntry::SkipTrapThreshold)
|
|
||||||
ctx.nce.TrapRegions(*entry->trap, true);
|
|
||||||
} else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) {
|
|
||||||
binary = it->second.binary;
|
|
||||||
hash = it->second.hash;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// entry->mirror may not be a direct mirror of blockMapping and may just contain it as a subregion, so we need to explicitly calculate the offset
|
|
||||||
span<u8> blockMappingMirror{blockMapping.data() - mirrorBlock.data() + entry->mirror.data(), blockMapping.size()};
|
|
||||||
|
|
||||||
// If nothing was in the cache then do a full shader parse
|
|
||||||
binary.binary = [](span<u8> mapping) {
|
|
||||||
// We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader
|
|
||||||
// UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351
|
|
||||||
constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F};
|
|
||||||
|
|
||||||
span<u64> shaderInstructions{mapping.cast<u64, std::dynamic_extent, true>()};
|
|
||||||
for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) {
|
|
||||||
auto instruction{*it};
|
|
||||||
if (instruction == BraSelf1 || instruction == BraSelf2) [[unlikely]]
|
|
||||||
// It is far more likely that the instruction doesn't match so this is an unlikely case
|
|
||||||
return span{shaderInstructions.begin(), it}.cast<u8>();
|
|
||||||
}
|
|
||||||
|
|
||||||
return span<u8>{};
|
|
||||||
}(blockMappingMirror.subspan(blockOffset));
|
|
||||||
|
|
||||||
binary.baseOffset = engine->pipeline.programOffset;
|
|
||||||
hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0);
|
|
||||||
|
|
||||||
entry->cache.insert({blockMapping.data() + blockOffset, CacheEntry{binary, hash}});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PipelineStageState::Refresh(InterconnectContext &ctx) {
|
bool PipelineStageState::Refresh(InterconnectContext &ctx) {
|
||||||
if (!trapExecutionLock)
|
return cache.Refresh(ctx, engine->programRegion, engine->pipeline.programOffset);
|
||||||
trapExecutionLock.emplace(trapMutex);
|
|
||||||
|
|
||||||
if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber)
|
|
||||||
return true;
|
|
||||||
else if (entry && entry->dirty)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void PipelineStageState::PurgeCaches() {
|
void PipelineStageState::PurgeCaches() {
|
||||||
trapExecutionLock.reset();
|
cache.PurgeCaches();
|
||||||
}
|
|
||||||
|
|
||||||
PipelineStageState::~PipelineStageState() {
|
|
||||||
std::scoped_lock lock{trapMutex};
|
|
||||||
//for (const auto &mirror : mirrorMap)
|
|
||||||
// ctx.nce.DestroyTrap(*mirror.second->trap);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Vertex Input State */
|
/* Vertex Input State */
|
||||||
@ -584,7 +491,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
|
|||||||
std::array<ShaderBinary, engine::PipelineCount> shaderBinaries;
|
std::array<ShaderBinary, engine::PipelineCount> shaderBinaries;
|
||||||
for (size_t i{}; i < engine::PipelineCount; i++) {
|
for (size_t i{}; i < engine::PipelineCount; i++) {
|
||||||
const auto &stage{pipelineStages[i].UpdateGet(ctx)};
|
const auto &stage{pipelineStages[i].UpdateGet(ctx)};
|
||||||
packedState.shaderHashes[i] = stage.hash;
|
packedState.shaderHashes[i] = stage.binary.hash;
|
||||||
shaderBinaries[i] = stage.binary;
|
shaderBinaries[i] = stage.binary;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#include <boost/container/static_vector.hpp>
|
#include <boost/container/static_vector.hpp>
|
||||||
#include <gpu/texture/texture.h>
|
#include <gpu/texture/texture.h>
|
||||||
|
#include <gpu/interconnect/common/shader_cache.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "packed_pipeline_state.h"
|
#include "packed_pipeline_state.h"
|
||||||
#include "pipeline_manager.h"
|
#include "pipeline_manager.h"
|
||||||
@ -67,46 +68,16 @@ namespace skyline::gpu::interconnect::maxwell3d {
|
|||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct CacheEntry {
|
|
||||||
ShaderBinary binary;
|
|
||||||
u64 hash;
|
|
||||||
|
|
||||||
CacheEntry(ShaderBinary binary, u64 hash) : binary{binary}, hash{hash} {}
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Holds mirror state for a single GPU mapped block
|
|
||||||
*/
|
|
||||||
struct MirrorEntry {
|
|
||||||
span<u8> mirror;
|
|
||||||
tsl::robin_map<u8 *, CacheEntry> cache;
|
|
||||||
std::optional<nce::NCE::TrapHandle> trap;
|
|
||||||
|
|
||||||
static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fallback to always hashing
|
|
||||||
u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance
|
|
||||||
size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number number used to clear the cache after every access
|
|
||||||
bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared
|
|
||||||
|
|
||||||
MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}
|
|
||||||
};
|
|
||||||
|
|
||||||
dirty::BoundSubresource<EngineRegisters> engine;
|
dirty::BoundSubresource<EngineRegisters> engine;
|
||||||
engine::Pipeline::Shader::Type shaderType;
|
engine::Pipeline::Shader::Type shaderType;
|
||||||
|
|
||||||
tsl::robin_map<u8 *, std::unique_ptr<MirrorEntry>> mirrorMap;
|
ShaderCache cache;
|
||||||
std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map
|
|
||||||
std::optional<std::scoped_lock<std::mutex>> trapExecutionLock;
|
|
||||||
MirrorEntry *entry{};
|
|
||||||
span<u8> mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry`
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
ShaderBinary binary;
|
ShaderBinary binary;
|
||||||
u64 hash;
|
|
||||||
|
|
||||||
PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u8 shaderType);
|
PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u8 shaderType);
|
||||||
|
|
||||||
~PipelineStageState();
|
|
||||||
|
|
||||||
void Flush(InterconnectContext &ctx);
|
void Flush(InterconnectContext &ctx);
|
||||||
|
|
||||||
bool Refresh(InterconnectContext &ctx);
|
bool Refresh(InterconnectContext &ctx);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user