NEEDS CLEANUP: Reimplement GPU VMM and rewrite nvdrv VM impl

2025-06-07 03:02:21 +03:00 · 2021-08-14 20:42:11 +01:00 · 2021-08-14 20:42:11 +01:00 · d03b288db6
commit d03b288db6
parent 020aa0e43a
23 changed files with 794 additions and 444 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -100,7 +100,7 @@ add_library(skyline SHARED
        ${source_DIR}/skyline/gpu/command_scheduler.cpp
        ${source_DIR}/skyline/gpu/texture/texture.cpp
        ${source_DIR}/skyline/gpu/presentation_engine.cpp
-        ${source_DIR}/skyline/soc/gmmu.cpp
+        ${source_DIR}/skyline/soc/gm20b.cpp
        ${source_DIR}/skyline/soc/host1x/syncpoint.cpp
        ${source_DIR}/skyline/soc/gm20b/gpfifo.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/maxwell_3d.cpp
--- a/app/src/main/cpp/skyline/common/address_space.h
+++ b/app/src/main/cpp/skyline/common/address_space.h
@ -0,0 +1,155 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #pragma once
 #include <concepts>
 #include <common.h>
 namespace skyline {
    /**
     * @brief Satisfied when VaType is an unsigned type whose storage (sizeof * 8 bits) can hold AddressSpaceBits bits
     */
    template<typename VaType, size_t AddressSpaceBits>
    concept AddressSpaceValid = std::is_unsigned_v<VaType> && (AddressSpaceBits <= sizeof(VaType) * 8);
    /**
     * @brief FlatAddressSpaceMap provides a generic VA->PA mapping implementation using a sorted vector
     * @tparam VaType Unsigned integer type holding virtual addresses
     * @tparam UnmappedVa VA value that marks a block as invalid
     * @tparam PaType Type of the physical address stored for each block
     * @tparam UnmappedPa PA value that marks a block as unmapped
     * @tparam PaContigSplit If true, the PA of the tail created when a mapped block is split is offset by the distance from the start of that block, otherwise the tail keeps the PA of the block unchanged
     * @tparam AddressSpaceBits Width of the address space in bits
     */
    template<typename VaType, VaType UnmappedVa, typename PaType, PaType UnmappedPa, bool PaContigSplit, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits>
    class FlatAddressSpaceMap {
      private:
        /**
         * @brief Represents a block of memory in the AS, a block extends from its VA up to the VA of the next block in the vector
         */
        struct Block {
            VaType virt{UnmappedVa}; //!< VA of the block
            PaType phys{UnmappedPa}; //!< PA of the block, will increase 1-1 with VA until a new block is encountered
            bool flag{}; //!< General purpose flag for use by derived classes

            Block() = default;

            Block(VaType virt, PaType phys, bool flag) : virt(virt), phys(phys), flag(flag) {}

            /**
             * @return If the block has a VA other than UnmappedVa
             */
            constexpr bool Valid() const {
                return virt != UnmappedVa;
            }

            /**
             * @return If the block has a PA other than UnmappedPa
             */
            constexpr bool Mapped() const {
                return phys != UnmappedPa;
            }

            /**
             * @return If the PA of the block is UnmappedPa
             */
            constexpr bool Unmapped() const {
                return phys == UnmappedPa;
            }

            /**
             * @brief Orders a block against a bare VA so that the block vector can be binary searched by address
             */
            bool operator<(const VaType &pVirt) const {
                return virt < pVirt;
            }
        };

      protected:
        std::mutex blockMutex; //!< Guards blocks, must be held around MapLocked/UnmapLocked
        std::vector<Block> blocks{Block{}}; //!< Blocks of the AS, binary searched by VA, initially holds a single default-constructed (unmapped) block

        /**
         * @brief Maps a PA range into the given AS region, optionally setting the flag
         * @note blockMutex MUST be locked when calling this
         */
        void MapLocked(VaType virt, PaType phys, VaType size, bool flag = {});

        /**
         * @brief Unmaps the given range and merges it with other unmapped regions
         * @note blockMutex MUST be locked when calling this
         */
        void UnmapLocked(VaType virt, VaType size);

      public:
        // Computed as two halves so that the shift doesn't overflow when AddressSpaceBits is 64
        static constexpr VaType VaMaximum{(1ULL << (AddressSpaceBits - 1)) + ((1ULL << (AddressSpaceBits - 1)) - 1)}; //!< The maximum VA that this AS can technically reach

        VaType vaLimit{VaMaximum}; //!< A soft limit on the maximum VA of the AS

        /**
         * @param pVaLimit The soft limit to use for the AS
         * @throws exception If pVaLimit is above VaMaximum
         */
        FlatAddressSpaceMap(VaType pVaLimit);

        FlatAddressSpaceMap() = default;

        /**
         * @brief Locks blockMutex and then performs MapLocked
         */
        void Map(VaType virt, PaType phys, VaType size, bool flag = {});

        /**
         * @brief Locks blockMutex and then performs UnmapLocked
         */
        void Unmap(VaType virt, VaType size);
    };
    /**
     * @brief FlatMemoryManager is a FlatAddressSpaceMap that uses host byte pointers as PAs (nullptr marks an unmapped block) and adds helpers for copying data out of and into the mapped VA space
     */
    template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits>
    class FlatMemoryManager : public FlatAddressSpaceMap<VaType, UnmappedVa, u8 *, nullptr, true, AddressSpaceBits> {
      public:
        /**
         * @return A placeholder pointer to pass as the PA of sparse mapped regions, the value itself is meaningless
         */
        static u8 *SparsePlaceholderAddress() {
            return reinterpret_cast<u8 *>(0xCAFEBABE);
        }

        /**
         * @brief Copies size bytes starting at the VA virt into destination
         */
        void Read(u8 *destination, VaType virt, VaType size);

        /**
         * @brief Fills the entirety of the supplied span with the bytes found at the VA virt
         */
        template<typename T>
        void Read(span<T> destination, VaType virt) {
            auto *bytes{reinterpret_cast<u8 *>(destination.data())};
            Read(bytes, virt, destination.size_bytes());
        }

        /**
         * @return An object of type T built from the sizeof(T) bytes found at the VA virt
         */
        template<typename T>
        T Read(VaType virt) {
            T value;
            Read(reinterpret_cast<u8 *>(&value), virt, sizeof(value));
            return value;
        }

        /**
         * @brief Copies size bytes from source to the VA virt
         */
        void Write(VaType virt, u8 *source, VaType size);

        /**
         * @brief Copies the entire contents of the supplied span to the VA virt
         */
        template<typename T>
        void Write(VaType virt, span<T> source) {
            auto *bytes{reinterpret_cast<u8 *>(source.data())};
            Write(virt, bytes, source.size_bytes());
        }

        /**
         * @brief Copies the bytes of the supplied object to the VA virt
         */
        template<typename T>
        void Write(VaType virt, T source) {
            Write(virt, reinterpret_cast<u8 *>(&source), sizeof(source));
        }
    };
    /**
     * @brief FlatAllocator specialises FlatAddressSpaceMap to work as an allocator, with an initial, fast linear pass and a subsequent slower pass that iterates until it finds a free block
     * @note The PA type of the underlying map is bool: true marks a region as allocated while false (the unmapped value) marks it as free
     */
    template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits>
    class FlatAllocator : public FlatAddressSpaceMap<VaType, UnmappedVa, bool, false, false, AddressSpaceBits> {
      private:
        using Base = FlatAddressSpaceMap<VaType, UnmappedVa, bool, false, false, AddressSpaceBits>;
        VaType currentLinearAllocEnd; //!< The end address for the initial linear allocation pass, once this reaches the AS limit the slower allocation path will be used
      public:
        VaType vaStart; //!< The base VA of the allocator, no allocations will be below this
        /**
         * @param vaStart The VA at which the linear allocation pass begins
         * @param vaLimit The soft limit on the maximum VA, forwarded to the underlying map
         */
        FlatAllocator(VaType vaStart, VaType vaLimit);
        /**
         * @brief Allocates a region in the AS of the given size and returns its address
         * @throws exception If no region of the requested size could be found
         */
        VaType Allocate(VaType size);
        /**
         * @brief Marks the given region in the AS as allocated
         */
        void AllocateFixed(VaType virt, VaType size);
        /**
         * @brief Frees an AS region so it can be used again
         */
        void Free(VaType virt, VaType size);
    };
 }
--- a/app/src/main/cpp/skyline/common/address_space.inc
+++ b/app/src/main/cpp/skyline/common/address_space.inc
@ -0,0 +1,354 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #include <common/trace.h>
 #include <kernel/types/KProcess.h>
 #include "address_space.h"
 // Helper macros that expand to the template header followed by the class-qualified return type needed to define a member of each class template out of line
 // Usage: MAP_MEMBER(returnType)::Member(args) { ... }, constructors pass an empty returnType
 // MAP_MEMBER: members of FlatAddressSpaceMap
 #define MAP_MEMBER(returnType) template<typename VaType, VaType UnmappedVa, typename PaType, PaType UnmappedPa, bool PaContigSplit, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits> returnType FlatAddressSpaceMap<VaType, UnmappedVa, PaType, UnmappedPa, PaContigSplit, AddressSpaceBits>
 // MM_MEMBER: members of FlatMemoryManager
 #define MM_MEMBER(returnType) template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits> returnType FlatMemoryManager<VaType, UnmappedVa, AddressSpaceBits>
 // ALLOC_MEMBER: members of FlatAllocator
 #define ALLOC_MEMBER(returnType) template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits> returnType FlatAllocator<VaType, UnmappedVa, AddressSpaceBits>
 namespace skyline {
    MAP_MEMBER()::FlatAddressSpaceMap(VaType pVaLimit) : vaLimit(pVaLimit) {
        if (pVaLimit > VaMaximum)
            throw exception("Invalid VA limit!");
    }
    /**
     * @brief Maps the VA range [virt, virt + size) to phys, replacing whatever the range previously held
     * @param virt The first VA of the range
     * @param phys The PA that backs the start of the range
     * @param size The length of the range, a size of zero is a no-op
     * @param flag The value stored in the flag of the block created for the range
     * @throws exception If the range wraps around or ends past vaLimit, or if the block vector is in an unexpected state
     * @note blockMutex MUST be locked when calling this
     */
    MAP_MEMBER(void)::MapLocked(VaType virt, PaType phys, VaType size, bool flag) {
        TRACE_EVENT("containers", "FlatAddressSpaceMap::Map");

        if (size == 0)
            return; // An empty mapping would only introduce blocks with duplicate VAs

        VaType virtEnd{virt + size};
        if (virtEnd < virt || virtEnd > vaLimit) // The first comparison catches the range wrapping around VaType
            throw exception("Trying to map a block past the VA limit!");

        // The first block that starts at or after the end of the range
        auto blockEndSuccessor{std::lower_bound(blocks.begin(), blocks.end(), virtEnd)};
        if (blockEndSuccessor == blocks.begin())
            throw exception("Unexpected Memory Manager state!");

        auto blockEndPredecessor{std::prev(blockEndSuccessor)};

        if (blockEndSuccessor != blocks.end()) {
            // We have blocks in front of us, if one is directly in front then we don't have to add a tail
            if (blockEndSuccessor->virt != virtEnd) {
                PaType tailPhys{[&]() -> PaType {
                    if constexpr (PaContigSplit) {
                        if (blockEndPredecessor->Mapped())
                            return blockEndPredecessor->phys + virtEnd - blockEndPredecessor->virt;
                    }

                    return blockEndPredecessor->phys; // Unmapped regions and non-contiguous PAs are propagated unchanged
                }()};

                if (blockEndPredecessor->virt >= virt) {
                    // This block's start would be overlapped by the map so reuse it as the tail block (its flag is retained)
                    blockEndPredecessor->virt = virtEnd;
                    blockEndPredecessor->phys = tailPhys;

                    // The reused block now starts exactly at the end of the range
                    blockEndSuccessor = blockEndPredecessor;
                } else {
                    // The range lies strictly inside a single block: split it into head | new mapping | tail and we're done
                    blocks.insert(blockEndSuccessor, {Block(virt, phys, flag), Block(virtEnd, tailPhys, blockEndPredecessor->flag)});
                    return;
                }
            }
        } else {
            // blockEndPredecessor will always be unmapped as blocks has to be terminated by an unmapped chunk
            if (blockEndPredecessor != blocks.begin() && blockEndPredecessor->virt >= virt) {
                // Move the start of the terminating unmapped block to the end of the range
                blockEndPredecessor->virt = virtEnd;

                // The moved block now starts exactly at the end of the range
                blockEndSuccessor = blockEndPredecessor;
            } else {
                // Else insert the mapping followed by a new terminating unmapped block and we're done
                blocks.insert(blockEndSuccessor, {Block(virt, phys, flag), Block(virtEnd, UnmappedPa, false)});
                return;
            }
        }

        // From here on blockEndSuccessor is the block that starts exactly at virtEnd, every block in [virt, virtEnd) has to be replaced

        // Walk the block vector to find the start successor as this is more efficient than another binary search in most scenarios
        auto blockStartSuccessor{blockEndSuccessor};
        while (blockStartSuccessor != blocks.begin() && std::prev(blockStartSuccessor)->virt >= virt)
            --blockStartSuccessor;

        if (blockStartSuccessor->virt > virtEnd)
            throw exception("Unexpected Memory Manager state!");

        if (blockStartSuccessor->virt == virtEnd) {
            // No block starts inside the range so there is nothing to reuse, insert a new block in front of the end block
            blocks.insert(blockStartSuccessor, Block(virt, phys, flag));
        } else {
            // Reuse the first overlapped block as the start of the mapping
            blockStartSuccessor->virt = virt;
            blockStartSuccessor->phys = phys;
            blockStartSuccessor->flag = flag;

            // Erase every other block that started inside the range, the range is empty when only one block was overlapped
            blocks.erase(std::next(blockStartSuccessor), blockEndSuccessor);
        }
    }
    /**
     * @brief Unmaps the VA range [virt, virt + size) and merges it with neighbouring unmapped regions
     * @param virt The first VA of the range
     * @param size The length of the range, a size of zero is a no-op
     * @throws exception If the range wraps around or ends past vaLimit, or if the block vector is in an unexpected state
     * @note blockMutex MUST be locked when calling this
     */
    MAP_MEMBER(void)::UnmapLocked(VaType virt, VaType size) {
        TRACE_EVENT("containers", "FlatAddressSpaceMap::Unmap");

        if (size == 0)
            return; // Nothing to unmap

        VaType virtEnd{virt + size};
        if (virtEnd < virt || virtEnd > vaLimit) // The first comparison catches the range wrapping around VaType
            throw exception("Trying to unmap a block past the VA limit!");

        // The first block that starts at or after the end of the range
        auto blockEndSuccessor{std::lower_bound(blocks.begin(), blocks.end(), virtEnd)};
        if (blockEndSuccessor == blocks.begin())
            throw exception("Unexpected Memory Manager state!");

        auto blockEndPredecessor{std::prev(blockEndSuccessor)};

        // Steps backwards from the supplied block to the last block that starts before the range, stopping at the first block of the vector at the latest
        auto walkBackToPredecessor{[&](auto iter) {
            while (iter != blocks.begin() && iter->virt >= virt)
                --iter;

            return iter;
        }};

        // Handles the case where the range ends in (or directly in front of) the unmapped block unmappedEnd, so no tail needs to be split off
        auto eraseBlocksWithEndUnmapped{[&](auto unmappedEnd) {
            auto blockStartPredecessor{walkBackToPredecessor(unmappedEnd)};
            auto blockStartSuccessor{std::next(blockStartPredecessor)};

            if (blockStartPredecessor->Unmapped()) {
                // The start predecessor is unmapped so everything in our region, including unmappedEnd, collapses into it
                auto eraseEnd{std::next(unmappedEnd)};

                // We can't have two unmapped regions after each other
                if (eraseEnd != blocks.end() && eraseEnd->Unmapped())
                    throw exception("Unexpected Memory Manager state!");

                blocks.erase(blockStartSuccessor, eraseEnd);
            } else {
                // Else reuse unmappedEnd as the start of our unmapped region then erase all up to it, nothing is erased when no block starts inside the region
                unmappedEnd->virt = virt;
                blocks.erase(blockStartSuccessor, unmappedEnd);
            }
        }};

        // We can avoid any splitting logic if the range ends in an unmapped block
        if (blockEndPredecessor->Unmapped()) {
            if (blockEndPredecessor->virt > virt)
                eraseBlocksWithEndUnmapped(blockEndPredecessor);

            return; // The region is unmapped, bail out early
        }

        // The end predecessor is mapped so it has to be followed by another block, this also guards the dereferences below
        if (blockEndSuccessor == blocks.end())
            throw exception("Unexpected Memory Manager state!");

        if (blockEndSuccessor->virt == virtEnd && blockEndSuccessor->Unmapped()) {
            eraseBlocksWithEndUnmapped(blockEndSuccessor);

            return; // The region is unmapped here and doesn't need splitting, bail out early
        }

        // If one block is directly in front then we don't have to add a tail
        if (blockEndSuccessor->virt != virtEnd) {
            // The previous block is mapped so we will need to add a tail with an offset
            PaType tailPhys{[&]() {
                if constexpr (PaContigSplit)
                    return blockEndPredecessor->phys + virtEnd - blockEndPredecessor->virt;
                else
                    return blockEndPredecessor->phys;
            }()};

            if (blockEndPredecessor->virt >= virt) {
                // If this block's start would be overlapped by the unmap then reuse it as a tail block
                blockEndPredecessor->virt = virtEnd;
                blockEndPredecessor->phys = tailPhys;

                // The reused block now starts exactly at the end of the range
                blockEndSuccessor = blockEndPredecessor;
            } else {
                blocks.insert(blockEndSuccessor, {Block(virt, UnmappedPa, false), Block(virtEnd, tailPhys, blockEndPredecessor->flag)});
                return; // The range lies strictly inside a single mapped block which has now been split around it
            }
        }

        // From here on blockEndSuccessor is the mapped block that starts exactly at virtEnd

        // Walk the block vector to find the start predecessor as this is more efficient than another binary search in most scenarios
        auto blockStartPredecessor{walkBackToPredecessor(blockEndSuccessor)};
        auto blockStartSuccessor{std::next(blockStartPredecessor)};

        if (blockStartSuccessor->virt > virtEnd)
            throw exception("Unexpected Memory Manager state!");

        if (blockStartSuccessor->virt == virtEnd) {
            // There are no blocks between the start and the end that would let us skip inserting a new one for head

            // The previous block may be unmapped, if so we don't need to insert any unmaps after it
            if (blockStartPredecessor->Mapped())
                blocks.insert(blockStartSuccessor, Block(virt, UnmappedPa, false));
        } else if (blockStartPredecessor->Unmapped()) {
            // The previous block is unmapped so every block that starts inside the range simply merges into it
            blocks.erase(blockStartSuccessor, blockEndSuccessor);
        } else {
            // Reuse the first overlapped block as the unmapped block header
            blockStartSuccessor->virt = virt;
            blockStartSuccessor->phys = UnmappedPa;
            blockStartSuccessor->flag = false; // Matches the flag used for every other unmapped block created here

            // Erase the remaining overwritten blocks, the range is empty when only one block was overlapped
            blocks.erase(std::next(blockStartSuccessor), blockEndSuccessor);
        }
    }
    /**
     * @brief Holds blockMutex for the duration of the call and forwards all arguments to MapLocked
     */
    MAP_MEMBER(void)::Map(VaType virt, PaType phys, VaType size, bool flag) {
        std::scoped_lock guard{blockMutex};
        MapLocked(virt, phys, size, flag);
    }
    /**
     * @brief Holds blockMutex for the duration of the call and forwards all arguments to UnmapLocked
     */
    MAP_MEMBER(void)::Unmap(VaType virt, VaType size) {
        std::scoped_lock guard{blockMutex};
        UnmapLocked(virt, size);
    }
    /**
     * @brief Copies size bytes starting at the VA virt into destination, walking across as many blocks as the range spans
     * @param destination The buffer to fill, it must be able to hold size bytes
     * @param virt The VA to start reading from
     * @param size The amount of bytes to read
     * @throws exception If any part of the range lies in a block that is neither mapped nor sparse
     * @note Sparse blocks (those with the flag set) read back as zeroes, their PA is a placeholder and is never dereferenced
     */
    MM_MEMBER(void)::Read(u8 *destination, VaType virt, VaType size) {
        std::scoped_lock lock(this->blockMutex);

        TRACE_EVENT("containers", "FlatMemoryManager::Read");

        if (!size)
            return;

        // The first block that starts after virt, the block in front of it is the one that contains virt
        auto successor{std::upper_bound(this->blocks.begin(), this->blocks.end(), virt, [](auto virt, const auto &block) {
            return virt < block.virt;
        })};
        if (successor == this->blocks.begin())
            throw exception("Page fault at: 0x{:X}", virt); // No block contains virt

        auto predecessor{std::prev(successor)};
        VaType blockOffset{static_cast<VaType>(virt - predecessor->virt)}; //!< Offset into the current block, only non-zero for the first one

        while (size) {
            // The final block has no successor and extends to the end of the AS so it can hold everything that is left
            VaType blockReadSize{successor == this->blocks.end() ? size : std::min(static_cast<VaType>(successor->virt - predecessor->virt - blockOffset), size)};

            if (predecessor->flag) // Sparse mapping
                std::memset(destination, 0, blockReadSize);
            else if (predecessor->phys == nullptr)
                throw exception("Page fault at: 0x{:X}", predecessor->virt);
            else
                std::memcpy(destination, predecessor->phys + blockOffset, blockReadSize);

            destination += blockReadSize;
            size -= blockReadSize;

            if (size) {
                // Bytes can only remain if the current block had a successor so advancing is safe
                predecessor = successor++;
                blockOffset = 0;
            }
        }
    }
    /**
     * @brief Copies size bytes from source to the VA virt, walking across as many blocks as the range spans
     * @param virt The VA to start writing to
     * @param source The buffer to copy from, it must hold at least size bytes
     * @param size The amount of bytes to write
     * @throws exception If any part of the range lies in a block that is neither mapped nor sparse
     * @note Writes to sparse blocks (those with the flag set) are discarded, their PA is a placeholder and is never dereferenced
     */
    MM_MEMBER(void)::Write(VaType virt, u8 *source, VaType size) {
        std::scoped_lock lock(this->blockMutex);

        TRACE_EVENT("containers", "FlatMemoryManager::Write");

        if (!size)
            return;

        // The first block that starts after virt, the block in front of it is the one that contains virt
        auto successor{std::upper_bound(this->blocks.begin(), this->blocks.end(), virt, [](auto virt, const auto &block) {
            return virt < block.virt;
        })};
        if (successor == this->blocks.begin())
            throw exception("Page fault at: 0x{:X}", virt); // No block contains virt

        auto predecessor{std::prev(successor)};
        VaType blockOffset{static_cast<VaType>(virt - predecessor->virt)}; //!< Offset into the current block, only non-zero for the first one

        while (size) {
            // The final block has no successor and extends to the end of the AS so it can hold everything that is left
            VaType blockWriteSize{successor == this->blocks.end() ? size : std::min(static_cast<VaType>(successor->virt - predecessor->virt - blockOffset), size)};

            if (!predecessor->flag) { // Sparse mappings allow writes but have nothing backing them so the data is dropped
                if (predecessor->phys == nullptr)
                    throw exception("Page fault at: 0x{:X}", predecessor->virt);

                std::memcpy(predecessor->phys + blockOffset, source, blockWriteSize);
            }

            source += blockWriteSize;
            size -= blockWriteSize;

            if (size) {
                // Bytes can only remain if the current block had a successor so advancing is safe
                predecessor = successor++;
                blockOffset = 0;
            }
        }
    }
    /**
     * @brief Creates an allocator that hands out addresses starting at vaStart and never past vaLimit
     * @note The initialisers are listed in member declaration order (currentLinearAllocEnd is declared before vaStart), both read the vaStart parameter rather than the member
     */
    ALLOC_MEMBER()::FlatAllocator(VaType vaStart, VaType vaLimit) : Base(vaLimit), currentLinearAllocEnd(vaStart), vaStart(vaStart) {}
    /**
     * @brief Allocates a free region of the given size and marks it as allocated
     * @param size The size of the region to allocate
     * @return The VA of the start of the allocated region
     * @throws exception If no free region of the requested size exists or the block vector is in an unexpected state
     * @note A fast linear pass that only ever moves forwards from the end of the previous allocation is attempted first, once it runs out of address space a first-fit search over all free gaps is used instead
     */
    ALLOC_MEMBER(VaType)::Allocate(VaType size) {
        std::scoped_lock lock(this->blockMutex);

        TRACE_EVENT("containers", "FlatAllocator::Allocate");

        VaType allocStart{UnmappedVa};
        VaType allocEnd{currentLinearAllocEnd + size};

        // Only attempt the linear pass if the candidate range neither wraps around nor exceeds the VA limit
        if (allocEnd >= currentLinearAllocEnd && allocEnd <= this->vaLimit) {
            auto allocEndSuccessor{std::lower_bound(this->blocks.begin(), this->blocks.end(), allocEnd)};
            if (allocEndSuccessor == this->blocks.begin())
                throw exception("Unexpected allocator state!");

            auto allocEndPredecessor{std::prev(allocEndSuccessor)};

            if (allocEndPredecessor->virt <= currentLinearAllocEnd && allocEndPredecessor->Unmapped()) {
                // The entire candidate range lies inside a single free block
                allocStart = currentLinearAllocEnd;
            } else {
                // Something (such as a fixed allocation) is in the way, skip forwards until a free block that is large enough is found
                while (allocEndSuccessor != this->blocks.end()) {
                    if (allocEndPredecessor->Unmapped() && allocEndSuccessor->virt - allocEndPredecessor->virt >= size) {
                        allocStart = allocEndPredecessor->virt;
                        break;
                    }

                    allocEndPredecessor = allocEndSuccessor++;

                    // Use the VA limit to calculate if we can fit in the final block since it has no successor
                    if (allocEndSuccessor == this->blocks.end()) {
                        allocEnd = allocEndPredecessor->virt + size;

                        if (allocEnd >= allocEndPredecessor->virt && allocEnd <= this->vaLimit)
                            allocStart = allocEndPredecessor->virt;
                    }
                }
            }
        }

        if (allocStart != UnmappedVa) {
            currentLinearAllocEnd = allocStart + size;
        } else { // If linear allocation overflows the AS then find a gap
            if (this->blocks.size() <= 2)
                throw exception("Unexpected allocator state!");

            auto searchPredecessor{this->blocks.begin()};
            auto searchSuccessor{std::next(searchPredecessor)};

            // The usable part of a gap begins at vaStart when the free block itself starts below it
            auto gapStart{[&]() -> VaType {
                return std::max(searchPredecessor->virt, vaStart);
            }};

            // A gap is usable when it is free and at least size large once clamped to vaStart
            auto gapFits{[&]() {
                if (searchPredecessor->Mapped())
                    return false;

                VaType start{gapStart()};
                return searchSuccessor->virt > start && searchSuccessor->virt - start >= size;
            }};

            while (searchSuccessor != this->blocks.end() && !gapFits())
                searchPredecessor = searchSuccessor++;

            if (searchSuccessor != this->blocks.end())
                allocStart = gapStart();
            else
                throw exception("Unexpected allocator state!");
        }

        this->MapLocked(allocStart, true, size);
        return allocStart;
    }
    /**
     * @brief Marks [virt, virt + size) as allocated regardless of its previous state
     * @note Goes through the locking Map wrapper as MapLocked requires blockMutex to be held
     */
    ALLOC_MEMBER(void)::AllocateFixed(VaType virt, VaType size) {
        this->Map(virt, true, size);
    }
    /**
     * @brief Marks [virt, virt + size) as free so that later allocations can reuse it
     * @note Goes through the locking Unmap wrapper as UnmapLocked requires blockMutex to be held
     */
    ALLOC_MEMBER(void)::Free(VaType virt, VaType size) {
        this->Unmap(virt, size);
    }
 }
--- a/app/src/main/cpp/skyline/common/circular_queue.h
+++ b/app/src/main/cpp/skyline/common/circular_queue.h
@ -3,6 +3,7 @@
 #pragma once
 #include <common/trace.h>
 #include <common.h>
 namespace skyline {
@ -51,10 +52,15 @@ namespace skyline {
         */
        template<typename F>
        [[noreturn]] void Process(F function) {
            TRACE_EVENT_BEGIN("containers", "CircularQueue::Process");
            while (true) {
                if (start == end) {
                    std::unique_lock lock(productionMutex);
                    TRACE_EVENT_END("containers");
                    produceCondition.wait(lock, [this]() { return start != end; });
                    TRACE_EVENT_BEGIN("containers", "CircularQueue::Process");
                }
                while (start != end) {
--- a/app/src/main/cpp/skyline/common/signal.cpp
+++ b/app/src/main/cpp/skyline/common/signal.cpp
@ -183,8 +183,6 @@ namespace skyline::signal {
            std::call_once(signalHandlerOnce[signal], [signal, &action]() {
                struct sigaction oldAction;
                Sigaction(signal, &action, &oldAction);
                if (oldAction.sa_flags && oldAction.sa_flags != action.sa_flags)
                    throw exception("Old sigaction flags aren't equivalent to the replaced signal: {:#b} | {:#b}", oldAction.sa_flags, action.sa_flags);
                DefaultSignalHandlers.at(signal).function = (oldAction.sa_flags & SA_SIGINFO) ? oldAction.sa_sigaction : reinterpret_cast<void (*)(int, struct siginfo *, void *)>(oldAction.sa_handler);
            });
--- a/app/src/main/cpp/skyline/common/trace.h
+++ b/app/src/main/cpp/skyline/common/trace.h
@ -13,7 +13,8 @@ PERFETTO_DEFINE_CATEGORIES(
    perfetto::Category("kernel").SetDescription("Events from parts of the HLE kernel"),
    perfetto::Category("guest").SetDescription("Events relating to guest code"),
    perfetto::Category("gpu").SetDescription("Events from the emulated GPU"),
-    perfetto::Category("service").SetDescription("Events from the HLE sysmodule implementations")
+    perfetto::Category("service").SetDescription("Events from the HLE sysmodule implementations"),
    perfetto::Category("containers").SetDescription("Events from custom container implementations")
 );
 namespace skyline::trace {
--- a/app/src/main/cpp/skyline/services/common/result.h
+++ b/app/src/main/cpp/skyline/services/common/result.h
@ -13,6 +13,7 @@ namespace skyline::service {
        Busy = 16, // EBUSY
        InvalidArgument = 22, // EINVAL
        InappropriateIoctlForDevice = 25, // ENOTTY
        FunctionNotImplemented = 38, // ENOSYS
        NotSupported = 95, // EOPNOTSUPP, ENOTSUP
        TimedOut = 110, // ETIMEDOUT
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/deserialisation/macro_def.inc
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/deserialisation/macro_def.inc
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/deserialisation/macro_undef.inc
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/deserialisation/macro_undef.inc
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.cpp
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.cpp
@ -1,10 +1,16 @@
 // SPDX-License-Identifier: MIT OR MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #include <common/address_space.inc>
 #include <soc.h>
 #include <services/nvdrv/devices/deserialisation/deserialisation.h>
 #include "as_gpu.h"
 namespace skyline {
    // Explicit instantiations of the templates defined in address_space.inc (included above) for a 32-bit VA space with 0 as the unmapped VA
    // NOTE(review): presumably this is the configuration behind vm.smallPageAllocator/vm.bigPageAllocator — confirm against as_gpu.h
    template class FlatAddressSpaceMap<u32, 0, bool, false, false, 32>;
    template class FlatAllocator<u32, 0, 32>;
 }
 namespace skyline::service::nvdrv::device::nvhost {
    // Forwards every argument to the NvDevice base, the constructor body itself is empty
    AsGpu::AsGpu(const DeviceState &state, Core &core, const SessionContext &ctx) : NvDevice(state, core, ctx) {}
@ -14,38 +20,66 @@ namespace skyline::service::nvdrv::device::nvhost {
    }
    PosixResult AsGpu::AllocSpace(In<u32> pages, In<u32> pageSize, In<MappingFlags> flags, InOut<u64> offset) {
-        // TODO: track this on the nvdrv side and have the gmmu only do virt -> phys
+        state.logger->Debug("pages: 0x{:X}, pageSize: 0x{:X}, flags: ( fixed: {}, sparse: {} ), offset: 0x{:X}", pages, pageSize, flags.fixed, flags.sparse, offset);
-        // Also fix error codes
+
-        u64 size{static_cast<u64>(pages) * static_cast<u64>(pageSize)};
+        if (pageSize != VM::PageSize && pageSize != vm.bigPageSize)
            return PosixResult::InvalidArgument;
        if (pageSize != vm.bigPageSize && flags.sparse)
            return PosixResult::FunctionNotImplemented;
        u32 pageSizeBits{pageSize == VM::PageSize ? VM::PageSizeBits : vm.bigPageSizeBits};
        auto &allocator{[&] () -> auto & {
            if (pageSize == VM::PageSize)
                return vm.smallPageAllocator;
            else
                return vm.bigPageAllocator;
        }()};
        if (flags.fixed)
-            offset = state.soc->gmmu.ReserveFixed(offset, size);
+            allocator->AllocateFixed(offset >> pageSizeBits, pages);
        else
-            offset = state.soc->gmmu.ReserveSpace(size, offset); // offset contains the input alignment
+            offset = static_cast<u64>(allocator->Allocate(pages)) << pageSizeBits;
-        if (offset == 0) {
+        u64 size{static_cast<u64>(pages) * static_cast<u64>(pageSize)};
-            state.logger->Warn("Failed to allocate GPU address space region!");
+
-            return PosixResult::InvalidArgument;
+        if (flags.sparse)
-        }
+            state.soc->gm20b.gmmu.Map(offset, soc::gm20b::GM20B::GMMU::SparsePlaceholderAddress(), size, true);
        allocationMap[offset] = {
            .size = size,
            .pageSize = pageSize,
            .sparse = flags.sparse
        };
        return PosixResult::Success;
    }
    PosixResult AsGpu::FreeSpace(In<u64> offset, In<u32> pages, In<u32> pageSize) {
-        // TODO: implement this when we add nvdrv side address space allocation
+        // TODO: implement after UNMAP
        return PosixResult::Success;
    }
    PosixResult AsGpu::UnmapBuffer(In<u64> offset) {
        state.logger->Debug("offset: 0x{:X}", offset);
        try {
-            auto region{regionMap.at(offset)};
+            auto mapping{mappingMap.at(offset)};
-            // Non-fixed regions are unmapped so that they can be used by future non-fixed mappings
+            if (!mapping->fixed) {
-            if (!region.fixed)
+                auto &allocator{mapping->bigPage ? vm.bigPageAllocator : vm.smallPageAllocator};
-                if (!state.soc->gmmu.Unmap(offset, region.size))
+                u32 pageSizeBits{mapping->bigPage ? vm.bigPageSizeBits : VM::PageSizeBits};
                    state.logger->Warn("Failed to unmap region at 0x{:X}", offset);
-            regionMap.erase(offset);
+                allocator->Free(mapping->offset >> pageSizeBits, mapping->size >> pageSizeBits);
            }
            if (mapping->sparseAlloc)
                state.soc->gm20b.gmmu.Map(offset, soc::gm20b::GM20B::GMMU::SparsePlaceholderAddress(), mapping->size, true);
            else
                state.soc->gm20b.gmmu.Unmap(offset, mapping->size);
            mappingMap.erase(offset);
        } catch (const std::out_of_range &e) {
            state.logger->Warn("Couldn't find region to unmap at 0x{:X}", offset);
        }
@ -53,62 +87,94 @@ namespace skyline::service::nvdrv::device::nvhost {
        return PosixResult::Success;
    }
-    PosixResult AsGpu::MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, InOut<u32> pageSize, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset) {
+    PosixResult AsGpu::MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset) {
-        state.logger->Debug("flags: ( fixed: {}, remap: {} ), kind: {}, handle: {}, pageSize: 0x{:X}, bufferOffset: 0x{:X}, mappingSize: 0x{:X}, offset: 0x{:X}", flags.fixed, flags.remap, kind, handle, pageSize, bufferOffset, mappingSize, offset);
+        if (!vm.initialised)
            return PosixResult::InvalidArgument;
        state.logger->Debug("flags: ( fixed: {}, remap: {} ), kind: {}, handle: {}, bufferOffset: 0x{:X}, mappingSize: 0x{:X}, offset: 0x{:X}", flags.fixed, flags.remap, kind, handle, bufferOffset, mappingSize, offset);
        if (flags.remap) {
-            auto region{regionMap.lower_bound(offset)};
+            try {
-            if (region == regionMap.end()) {
+                auto mapping{mappingMap.at(offset)};
                state.logger->Warn("Cannot remap an unmapped GPU address space region: 0x{:X}", offset);
                return PosixResult::InvalidArgument;
            }
-            if (region->second.size < mappingSize) {
+                if (mapping->size < mappingSize) {
-                state.logger->Warn("Cannot remap an partially mapped GPU address space region: 0x{:X}", offset);
+                    state.logger->Warn("Cannot remap a partially mapped GPU address space region: 0x{:X}", offset);
                    return PosixResult::InvalidArgument;
                }
                u64 gpuAddress{offset + bufferOffset};
-            u8 *cpuPtr{region->second.ptr + bufferOffset};
+                u8 *cpuPtr{mapping->ptr + bufferOffset};
-            if (!state.soc->gmmu.MapFixed(gpuAddress, cpuPtr, mappingSize)) {
+                state.soc->gm20b.gmmu.Map(gpuAddress, cpuPtr, mappingSize);
                state.logger->Warn("Failed to remap GPU address space region: 0x{:X}", gpuAddress);
                return PosixResult::InvalidArgument;
            }
                return PosixResult::Success;
            } catch (const std::out_of_range &e) {
                state.logger->Warn("Cannot remap an unmapped GPU address space region: 0x{:X}", offset);
                return PosixResult::InvalidArgument;
            }
        }
        auto h{core.nvMap.GetHandle(handle)};
        if (!h)
            return PosixResult::InvalidArgument;
        if (auto err{h->Duplicate(ctx.internalSession)}; err != PosixResult::Success)
            return err;
        u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + bufferOffset)};
        u64 size{mappingSize ? mappingSize : h->origSize};
-        if (flags.fixed)
+        if (flags.fixed) {
-            offset = state.soc->gmmu.MapFixed(offset, cpuPtr, size);
+            auto alloc{allocationMap.upper_bound(offset)};
            if (alloc-- == allocationMap.begin() || (offset - alloc->first) + size > alloc->second.size)
                throw exception("Cannot perform a fixed mapping into an unallocated region!");
            state.soc->gm20b.gmmu.Map(offset, cpuPtr, size);
            auto mapping{std::make_shared<Mapping>(cpuPtr, offset, size, true, false, alloc->second.sparse)};
            alloc->second.mappings.push_back(mapping);
            mappingMap[offset] = mapping;
        } else {
            bool bigPage{[&] () {
                if (util::IsAligned(h->align, vm.bigPageSize))
                    return true;
                else if (util::IsAligned(h->align, VM::PageSize))
                    return false;
                else
-            offset = state.soc->gmmu.MapAllocate(cpuPtr, size);
+                    throw exception("Invalid handle alignment: 0x{:X}", h->align);
            }()};
-        if (offset == 0) {
+            auto &allocator{bigPage ? vm.bigPageAllocator : vm.smallPageAllocator};
-            state.logger->Warn("Failed to map GPU address space region!");
+            u32 pageSize{bigPage ? vm.bigPageSize : VM::PageSize};
-            return PosixResult::InvalidArgument;
+            u32 pageSizeBits{bigPage ? vm.bigPageSizeBits : VM::PageSizeBits};
            offset = static_cast<u64>(allocator->Allocate(util::AlignUp(size, pageSize) >> pageSizeBits)) << pageSizeBits;
            state.soc->gm20b.gmmu.Map(offset, cpuPtr, size);
            auto mapping{std::make_shared<Mapping>(cpuPtr, offset, size, false, bigPage, false)};
            mappingMap[offset] = mapping;
        }
        state.logger->Debug("Mapped to 0x{:X}", offset);
        regionMap[offset] = {cpuPtr, size, flags.fixed};
        return PosixResult::Success;
    }
    PosixResult AsGpu::GetVaRegions(In<u64> bufAddr, InOut<u32> bufSize, Out<std::array<VaRegion, 2>> vaRegions) {
-        // TODO: impl when we move allocator to nvdrv
+        if (!vm.initialised)
            return PosixResult::InvalidArgument;
        vaRegions = std::array<VaRegion, 2> {
            VaRegion{
                .pageSize = VM::PageSize,
                .pages = vm.smallPageAllocator->vaLimit - vm.smallPageAllocator->vaStart,
                .offset = vm.smallPageAllocator->vaStart << VM::PageSizeBits,
            },
            VaRegion{
                .pageSize = vm.bigPageSize,
                .pages = vm.bigPageAllocator->vaLimit - vm.bigPageAllocator->vaStart,
                .offset = vm.bigPageAllocator->vaStart << vm.bigPageSizeBits,
            }
        };
        return PosixResult::Success;
    }
@ -116,30 +182,83 @@ namespace skyline::service::nvdrv::device::nvhost {
        return GetVaRegions(bufAddr, bufSize, vaRegions);
    }
-    PosixResult  AsGpu::AllocAsEx(In<u32> bigPageSize, In<FileDescriptor> asFd, In<u32> flags, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit) {
+    PosixResult AsGpu::AllocAsEx(In<u32> flags, In<FileDescriptor> asFd, In<u32> bigPageSize, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit) {
-        // TODO: create the allocator here
+        if (vm.initialised)
            throw exception("Cannot initialise an address space twice!");
        state.logger->Debug("bigPageSize: 0x{:X}, asFd: {}, flags: 0x{:X}, vaRangeStart: 0x{:X}, vaRangeEnd: 0x{:X}, vaRangeSplit: 0x{:X}",
                            bigPageSize, asFd, flags, vaRangeStart, vaRangeEnd, vaRangeSplit);
        if (bigPageSize) {
            if (!std::ispow2(bigPageSize)) {
                state.logger->Error("Non power-of-2 big page size: 0x{:X}!", bigPageSize);
                return PosixResult::InvalidArgument;
            }
            if (!(bigPageSize & VM::SupportedBigPageSizes)) {
                state.logger->Error("Unsupported big page size: 0x{:X}!", bigPageSize);
                return PosixResult::InvalidArgument;
            }
            vm.bigPageSize = bigPageSize;
            vm.bigPageSizeBits = std::countr_zero(bigPageSize);
            vm.vaRangeStart = bigPageSize << VM::VaStartShift;
        }
        if (vaRangeStart) {
            vm.vaRangeStart = vaRangeStart;
            vm.vaRangeSplit = vaRangeSplit;
            vm.vaRangeEnd = vaRangeEnd;
        }
        u64 startPages{vm.vaRangeStart >> VM::PageSizeBits};
        u64 endPages{vm.vaRangeSplit >> VM::PageSizeBits};
        vm.smallPageAllocator = std::make_unique<VM::Allocator>(startPages, endPages);
        u64 startBigPages{vm.vaRangeSplit >> vm.bigPageSizeBits};
        u64 endBigPages{(vm.vaRangeEnd - vm.vaRangeSplit) >> vm.bigPageSizeBits};
        vm.bigPageAllocator = std::make_unique<VM::Allocator>(startBigPages, endBigPages);
        vm.initialised = true;
        return PosixResult::Success;
    }
    PosixResult AsGpu::Remap(span<RemapEntry> entries) {
        constexpr u32 BigPageSize{0x10}; //!< The big page size of the GPU
        for (const auto &entry : entries) {
            u64 virtAddr{static_cast<u64>(entry.asOffsetBigPages) << vm.bigPageSizeBits};
            u64 size{static_cast<u64>(entry.bigPages) << vm.bigPageSizeBits};
            auto alloc{allocationMap.upper_bound(virtAddr)};
            if (alloc-- == allocationMap.begin() || (virtAddr - alloc->first) + size > alloc->second.size) {
                state.logger->Warn("Cannot remap into an unallocated region!");
                return PosixResult::InvalidArgument;
            }
            if (!alloc->second.sparse) {
                state.logger->Warn("Cannot remap a non-sparse mapping!");
                return PosixResult::InvalidArgument;
            }
            if (!entry.handle) {
                state.soc->gm20b.gmmu.Map(virtAddr, soc::gm20b::GM20B::GMMU::SparsePlaceholderAddress(), size, true);
            } else {
                auto h{core.nvMap.GetHandle(entry.handle)};
                if (!h)
                    return PosixResult::InvalidArgument;
-            u64 virtAddr{static_cast<u64>(entry.asOffsetBigPages) << BigPageSize};
+                u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + (static_cast<u64>(entry.handleOffsetBigPages) << vm.bigPageSizeBits))};
            u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + (static_cast<u64>(entry.handleOffsetBigPages) << BigPageSize))};
            u64 size{static_cast<u64>(entry.bigPages) << BigPageSize};
-            state.soc->gmmu.MapFixed(virtAddr, cpuPtr, size);
+                state.soc->gm20b.gmmu.Map(virtAddr, cpuPtr, size);
            }
        }
        return PosixResult::Success;
    }
-#include <services/nvdrv/devices/deserialisation/macro_def.h>
+#include <services/nvdrv/devices/deserialisation/macro_def.inc>
    static constexpr u32 AsGpuMagic{0x41};
    VARIABLE_IOCTL_HANDLER_FUNC(AsGpu, ({
@ -152,7 +271,7 @@ namespace skyline::service::nvdrv::device::nvhost {
        IOCTL_CASE_ARGS(INOUT, SIZE(0x8),  MAGIC(AsGpuMagic), FUNC(0x5),
                        UnmapBuffer,  ARGS(In<u64>))
        IOCTL_CASE_ARGS(INOUT, SIZE(0x28), MAGIC(AsGpuMagic), FUNC(0x6),
-                        MapBufferEx,  ARGS(In<MappingFlags>, In<u32>, In<core::NvMap::Handle::Id>, InOut<u32>, In<u64>, In<u64>, InOut<u64>))
+                        MapBufferEx,  ARGS(In<MappingFlags>, In<u32>, In<core::NvMap::Handle::Id>, Pad<u32>, In<u64>, In<u64>, InOut<u64>))
        IOCTL_CASE_ARGS(INOUT, SIZE(0x40), MAGIC(AsGpuMagic), FUNC(0x8),
                        GetVaRegions, ARGS(In<u64>, InOut<u32>, Pad<u32>, Out<std::array<VaRegion, 2>>))
        IOCTL_CASE_ARGS(IN,    SIZE(0x28), MAGIC(AsGpuMagic), FUNC(0x9),
@ -166,5 +285,5 @@ namespace skyline::service::nvdrv::device::nvhost {
        INLINE_IOCTL_CASE_ARGS(INOUT, SIZE(0x40), MAGIC(AsGpuMagic), FUNC(0x8),
                               GetVaRegions3, ARGS(In<u64>, InOut<u32>, Pad<u32>, Out<std::array<VaRegion, 2>>))
    }))
-#include <services/nvdrv/devices/deserialisation/macro_undef.h>
+#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
 }
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/as_gpu.h
@ -3,6 +3,8 @@
 #pragma once
 #include <common/address_space.h>
 #include <services/nvdrv/devices/nvdevice.h>
 namespace skyline::service::nvdrv::device::nvhost {
@ -12,18 +14,64 @@ namespace skyline::service::nvdrv::device::nvhost {
     */
    class AsGpu : public NvDevice {
      private:
-        struct AddressSpaceRegion {
+        struct Mapping {
            u8 *ptr;
            u64 offset;
            u64 size;
            bool fixed;
            bool bigPage; // Only valid if fixed == false
            bool sparseAlloc;
            Mapping(u8 *ptr, u64 offset, u64 size, bool fixed, bool bigPage, bool sparseAlloc) : ptr(ptr),
                offset(offset),
                size(size),
                fixed(fixed),
                bigPage(bigPage),
                sparseAlloc(sparseAlloc) {}
        };
-        std::map<u64, AddressSpaceRegion> regionMap; //!< This maps the base addresses of mapped buffers to their total sizes and mapping type, this is needed as what was originally a single buffer may have been split into multiple GPU side buffers with the remap flag.
+        struct Allocation {
            u64 size;
            std::list<std::shared_ptr<Mapping>> mappings;
            u32 pageSize;
            bool sparse;
        };
        std::map<u64, std::shared_ptr<Mapping>> mappingMap; //!< This maps the base addresses of mapped buffers to their total sizes and mapping type, this is needed as what was originally a single buffer may have been split into multiple GPU side buffers with the remap flag.
        std::map<u64, Allocation> allocationMap;
        /**
         * @brief Holds the configuration and allocators of this GPU address space, it is populated by AllocAsEx
         */
        struct VM {
            static constexpr u32 PageSize{0x1000}; //!< The size of a small page
            static constexpr u32 PageSizeBits{std::countr_zero(PageSize)}; //!< log2 of PageSize, used to convert between byte addresses and small page indices
            static constexpr u32 SupportedBigPageSizes{0x30000}; //!< Bitmask that a requested big page size is tested against in AllocAsEx
            static constexpr u32 DefaultBigPageSize{0x20000}; //!< The big page size used when AllocAsEx isn't given one
            u32 bigPageSize{DefaultBigPageSize}; //!< The big page size currently in use
            u32 bigPageSizeBits{std::countr_zero(DefaultBigPageSize)}; //!< log2 of the big page size, used to convert between byte addresses and big page indices
            static constexpr u32 VaStartShift{10}; //!< The big page size is shifted left by this to derive the default start of the VA range
            static constexpr u64 DefaultVaSplit{1ULL << 34}; //!< Default value of vaRangeSplit
            static constexpr u64 DefaultVaRange{1ULL << 37}; //!< Default value of vaRangeEnd
            u64 vaRangeStart{DefaultBigPageSize << VaStartShift}; //!< Start of the VA range, the small page allocator begins here
            u64 vaRangeSplit{DefaultVaSplit}; //!< Boundary between the small page region (below) and the big page region (from here on)
            u64 vaRangeEnd{DefaultVaRange}; //!< End of the VA range
            using Allocator = FlatAllocator<u32, 0, 32>; //!< Allocator operating on page indices rather than byte addresses
            std::unique_ptr<Allocator> bigPageAllocator{}; //!< Allocates big pages, created in AllocAsEx
            std::unique_ptr<Allocator> smallPageAllocator{}; //!< Allocates small pages, created in AllocAsEx
            bool initialised{}; //!< Set once AllocAsEx has succeeded, other ioctls bail out with InvalidArgument until then
        } vm;
      public:
        struct MappingFlags {
            bool fixed : 1;
-            u8 _pad0_ : 7;
+            bool sparse : 1;
            u8 _pad0_ : 6;
            bool remap : 1;
            u32 _pad1_ : 23;
        };
@ -77,7 +125,7 @@ namespace skyline::service::nvdrv::device::nvhost {
         * @brief Maps a region into this address space with extra parameters
         * @url https://switchbrew.org/wiki/NV_services#NVGPU_AS_IOCTL_MAP_BUFFER_EX
         */
-        PosixResult MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, InOut<u32> pageSize, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset);
+        PosixResult MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset);
        /**
         * @brief Returns info about the address space and its page sizes
@ -94,7 +142,7 @@ namespace skyline::service::nvdrv::device::nvhost {
         * @brief Allocates this address space with the given parameters
         * @url https://switchbrew.org/wiki/NV_services#NVGPU_AS_IOCTL_ALLOC_AS_EX
         */
-        PosixResult AllocAsEx(In<u32> bigPageSize, In<FileDescriptor> asFd, In<u32> flags, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit);
+        PosixResult AllocAsEx(In<u32> flags, In<FileDescriptor> asFd, In<u32> bigPageSize, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit);
        /**
         * @brief Remaps a region of the GPU address space
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/ctrl.cpp
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/ctrl.cpp
@ -234,7 +234,7 @@ namespace skyline::service::nvdrv::device::nvhost {
        return nullptr;
    }
-#include <services/nvdrv/devices/deserialisation/macro_def.h>
+#include <services/nvdrv/devices/deserialisation/macro_def.inc>
    static constexpr u32 CtrlMagic{0};
    IOCTL_HANDLER_FUNC(Ctrl, ({
@ -254,5 +254,5 @@ namespace skyline::service::nvdrv::device::nvhost {
        IOCTL_CASE_RESULT(INOUT, SIZE(0x183), MAGIC(CtrlMagic), FUNC(0x1B),
                          PosixResult::InvalidArgument) // GetConfig isn't available in production
    }))
-#include <services/nvdrv/devices/deserialisation/macro_undef.h>
+#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
 }
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/ctrl_gpu.cpp
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/ctrl_gpu.cpp
@ -62,7 +62,7 @@ namespace skyline::service::nvdrv::device::nvhost {
        }
    }
-#include <services/nvdrv/devices/deserialisation/macro_def.h>
+#include <services/nvdrv/devices/deserialisation/macro_def.inc>
    static constexpr u32 CtrlGpuMagic{0x47};
    IOCTL_HANDLER_FUNC(CtrlGpu, ({
@ -77,5 +77,5 @@ namespace skyline::service::nvdrv::device::nvhost {
        IOCTL_CASE_ARGS(OUT,   SIZE(0x8),  MAGIC(CtrlGpuMagic), FUNC(0x14),
                        GetActiveSlotMask,  ARGS(Out<u32>, Out<u32>))
    }))
-#include <services/nvdrv/devices/deserialisation/macro_undef.h>
+#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
 }
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.cpp
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/gpu_channel.cpp
@ -104,7 +104,7 @@ namespace skyline::service::nvdrv::device::nvhost {
        }
    }
-#include <services/nvdrv/devices/deserialisation/macro_def.h>
+#include <services/nvdrv/devices/deserialisation/macro_def.inc>
    static constexpr u32 GpuChannelUserMagic{0x47};
    static constexpr u32 GpuChannelMagic{0x48};
@ -138,5 +138,5 @@ namespace skyline::service::nvdrv::device::nvhost {
        INLINE_IOCTL_CASE_ARGS(INOUT, SIZE(0x18), MAGIC(GpuChannelMagic), FUNC(0x1B),
                               SubmitGpfifo2, ARGS(In<u64>, In<u32>, InOut<SubmitGpfifoFlags>, InOut<Fence>))
    }))
-#include <services/nvdrv/devices/deserialisation/macro_undef.h>
+#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
 }
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvmap.cpp
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvmap.cpp
@ -115,7 +115,7 @@ namespace skyline::service::nvdrv::device {
        return PosixResult::Success;
    }
-#include "deserialisation/macro_def.h"
+#include "deserialisation/macro_def.inc"
    static constexpr u32 NvMapMagic{1};
    IOCTL_HANDLER_FUNC(NvMap, ({
@ -132,6 +132,6 @@ namespace skyline::service::nvdrv::device {
        IOCTL_CASE_ARGS(INOUT, SIZE(0x8),  MAGIC(NvMapMagic), FUNC(0xE),
                        GetId,  ARGS(Out<NvMapCore::Handle::Id>, In<NvMapCore::Handle::Id>))
    }))
-#include "deserialisation/macro_undef.h"
+#include "deserialisation/macro_undef.inc"
 }
--- a/app/src/main/cpp/skyline/soc.h
+++ b/app/src/main/cpp/skyline/soc.h
@ -3,7 +3,6 @@
 #pragma once
 #include "soc/gmmu.h"
 #include "soc/host1x.h"
 #include "soc/gm20b.h"
@ -14,10 +13,9 @@ namespace skyline::soc {
     */
    class SOC {
      public:
        gmmu::GraphicsMemoryManager gmmu;
        host1x::Host1X host1x;
        gm20b::GM20B gm20b;
-        SOC(const DeviceState &state) : gmmu(state), gm20b(state) {}
+        SOC(const DeviceState &state) : gm20b(state) {}
    };
 }
--- a/app/src/main/cpp/skyline/soc/gm20b.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b.cpp
@ -0,0 +1,20 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #include <common/address_space.inc>
 #include "gm20b.h"
 namespace skyline {
    // Explicit instantiations of the address space templates with the parameters used by the GMMU (see GM20B::GMMU)
    // NOTE(review): presumably needed as the template definitions live in address_space.inc, which is only included by this file - confirm
    template class FlatAddressSpaceMap<u64, 0, u8 *, nullptr, true, soc::gm20b::GM20B::AddressSpaceBits>;
    template class FlatMemoryManager<u64, 0, soc::gm20b::GM20B::AddressSpaceBits>;
 }
 namespace skyline::soc::gm20b {
    /**
     * @brief Constructs every engine of the GPU block and the GPFIFO with the supplied device state
     * @note The initialisers are listed in member declaration order, which is the order they are actually run in
     */
    GM20B::GM20B(const DeviceState &state) :
        fermi2D(state),
        maxwell3D(state),
        maxwellCompute(state),
        maxwellDma(state),
        keplerMemory(state),
        gpfifo(state) {}
 }
--- a/app/src/main/cpp/skyline/soc/gm20b.h
+++ b/app/src/main/cpp/skyline/soc/gm20b.h
@ -3,23 +3,28 @@
 #pragma once
 #include <common/address_space.h>
 #include "gm20b/engines/maxwell_3d.h"
 #include "gm20b/gpfifo.h"
 namespace skyline::soc::gm20b {
    /**
     * @brief The GPU block in the X1, it contains all GPU engines required for accelerating graphics operations
-     * @note We omit parts of components related to external access such as the GM20B Host, all accesses to the external components are done directly
+     * @note We omit parts of components related to external access such as the grhost, all accesses to the external components are done directly
     */
    class GM20B {
      public:
        static constexpr u8 AddressSpaceBits{40}; //!< The width of the GMMU AS
        using GMMU = FlatMemoryManager<u64, 0, AddressSpaceBits>;
        engine::Engine fermi2D;
        engine::maxwell3d::Maxwell3D maxwell3D;
        engine::Engine maxwellCompute;
        engine::Engine maxwellDma;
        engine::Engine keplerMemory;
        GPFIFO gpfifo;
        GMMU gmmu;
-        GM20B(const DeviceState &state) : fermi2D(state), keplerMemory(state), maxwell3D(state), maxwellCompute(state), maxwellDma(state), gpfifo(state) {}
+        GM20B(const DeviceState &state);
    };
 }
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp
@ -1,7 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
-#include <soc/gmmu.h>
+#include <common/address_space.h>
 #include <soc/gm20b/engines/maxwell_3d.h>
 namespace skyline::soc::gm20b::engine::maxwell3d {
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@ -157,7 +157,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
        switch (registers.semaphore.info.structureSize) {
            case Registers::SemaphoreInfo::StructureSize::OneWord:
-                state.soc->gmmu.Write<u32>(static_cast<u32>(result), registers.semaphore.address.Pack());
+                state.soc->gm20b.gmmu.Write<u32>(registers.semaphore.address.Pack(), static_cast<u32>(result));
                break;
            case Registers::SemaphoreInfo::StructureSize::FourWords: {
                // Convert the current nanosecond time to GPU ticks
@ -167,7 +167,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
                u64 nsTime{util::GetTimeNs()};
                u64 timestamp{(nsTime / NsToTickDenominator) * NsToTickNumerator + ((nsTime % NsToTickDenominator) * NsToTickNumerator) / NsToTickDenominator};
-                state.soc->gmmu.Write<FourWordResult>(FourWordResult{result, timestamp}, registers.semaphore.address.Pack());
+                state.soc->gm20b.gmmu.Write<FourWordResult>(registers.semaphore.address.Pack(), FourWordResult{result, timestamp});
                break;
            }
        }
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@ -56,7 +56,7 @@ namespace skyline::soc::gm20b {
        }
        pushBufferData.resize(gpEntry.size);
-        state.soc->gmmu.Read<u32>(pushBufferData, gpEntry.Address());
+        state.soc->gm20b.gmmu.Read<u32>(pushBufferData, gpEntry.Address());
        for (auto entry{pushBufferData.begin()}; entry != pushBufferData.end(); entry++) {
            // An entry containing all zeroes is a NOP, skip over it
@ -88,8 +88,7 @@ namespace skyline::soc::gm20b {
                    return;
                default:
-                    state.logger->Warn("Unsupported pushbuffer method SecOp: {}", static_cast<u8>(methodHeader.secOp));
+                    throw exception("Unsupported pushbuffer method SecOp: {}", static_cast<u8>(methodHeader.secOp));
                    break;
            }
        }
    }
@ -106,7 +105,7 @@ namespace skyline::soc::gm20b {
        try {
            signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
            pushBuffers->Process([this](GpEntry gpEntry) {
-                state.logger->Debug("Processing pushbuffer: 0x{:X}", gpEntry.Address());
+                state.logger->Warn("Processing pushbuffer: 0x{:X}", gpEntry.Address());
                Process(gpEntry);
            });
        } catch (const signal::SignalException &e) {
--- a/app/src/main/cpp/skyline/soc/gmmu.cpp
+++ b/app/src/main/cpp/skyline/soc/gmmu.cpp
@ -1,214 +0,0 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #include <kernel/types/KProcess.h>
 #include "gmmu.h"
 namespace skyline::soc::gmmu {
    constexpr u64 GpuPageSize{1 << 16}; //!< The page size of the GPU address space
    /**
     * @brief Sets up the GPU address space as a single unmapped chunk which later insertions carve up
     */
    GraphicsMemoryManager::GraphicsMemoryManager(const DeviceState &state) : state(state) {
        constexpr u64 addressSpaceBase{0x100000}; //!< Where the GPU address space begins - must be non-zero
        constexpr u64 addressSpaceSize{1UL << 40}; //!< How large the GPU address space is

        // Every other chunk is produced by splitting this initial one
        chunks.emplace_back(addressSpaceBase, addressSpaceSize, nullptr, ChunkState::Unmapped);
    }
    /**
     * @brief Searches the chunk list for the first chunk in the requested state that can hold the requested size
     * @param desiredState The state a chunk must be in to be considered
     * @param size The minimum size of the chunk, a chunk of exactly this size is accepted
     * @param alignment The alignment required of the chunk's base address, zero disables the alignment check
     * @return A copy of the first matching chunk or std::nullopt when no chunk matches
     */
    std::optional<ChunkDescriptor> GraphicsMemoryManager::FindChunk(ChunkState desiredState, u64 size, u64 alignment) {
        auto chunk{std::find_if(chunks.begin(), chunks.end(), [desiredState, size, alignment](const ChunkDescriptor &chunk) -> bool {
            // The size comparison is inclusive: an exact fit is as usable as a larger chunk and InsertChunk replaces such a chunk in place
            return (alignment ? util::IsAligned(chunk.virtualAddress, alignment) : true) && chunk.size >= size && chunk.state == desiredState;
        })};
        if (chunk != chunks.end())
            return *chunk;
        return std::nullopt;
    }
    /**
     * @brief Inserts a chunk into the chunk list, splitting, shrinking or removing the chunks that it overlaps
     * @param newChunk The chunk to insert
     * @return The virtual address of the inserted chunk
     * @throws exception If no existing chunk reaches the end of the new chunk
     * @note The callers in this file hold the mutex exclusively while calling this
     */
    u64 GraphicsMemoryManager::InsertChunk(const ChunkDescriptor &newChunk) {
        auto chunkEnd{chunks.end()};
        for (auto chunk{chunks.begin()}; chunk != chunkEnd; chunk++) {
            if (chunk->CanContain(newChunk)) {
                // The new chunk lies entirely within this chunk so it is split into up to three parts: head, new chunk and tail
                auto oldChunk{*chunk};
                u64 newSize{newChunk.virtualAddress - chunk->virtualAddress}; // Size of the head that remains in front of the new chunk
                u64 extension{chunk->size - newSize - newChunk.size}; // Size of the tail that remains behind the new chunk
                if (newSize == 0) {
                    // There is no head, the existing chunk is overwritten in place
                    *chunk = newChunk;
                } else {
                    chunk->size = newSize;
                    chunk = chunks.insert(std::next(chunk), newChunk);
                }
                // The tail keeps the state of the original chunk, if it was mapped its CPU pointer is advanced past the head and the new chunk
                if (extension)
                    chunks.insert(std::next(chunk), ChunkDescriptor(newChunk.virtualAddress + newChunk.size, extension, (oldChunk.state == ChunkState::Mapped) ? (oldChunk.cpuPtr + newSize + newChunk.size) : nullptr, oldChunk.state));
                return newChunk.virtualAddress;
            } else if (chunk->virtualAddress + chunk->size > newChunk.virtualAddress) {
                // This chunk ends past the start of the new chunk but cannot contain it, it is truncated to end where the new chunk starts
                chunk->size = newChunk.virtualAddress - chunk->virtualAddress;
                // Deletes all chunks that are within the chunk being inserted and split the final one
                auto tailChunk{std::next(chunk)};
                while (tailChunk != chunkEnd) {
                    if (tailChunk->virtualAddress + tailChunk->size >= newChunk.virtualAddress + newChunk.size)
                        break;
                    tailChunk = chunks.erase(tailChunk);
                    chunkEnd = chunks.end(); // Erasing changes the end of the list so the cached end iterator is refreshed
                }
                // The given chunk is too large to fit into existing chunks
                if (tailChunk == chunkEnd)
                    break;
                // Cut the part covered by the new chunk off the front of the tail chunk
                u64 chunkSliceOffset{newChunk.virtualAddress + newChunk.size - tailChunk->virtualAddress};
                tailChunk->virtualAddress += chunkSliceOffset;
                tailChunk->size -= chunkSliceOffset;
                if (tailChunk->state == ChunkState::Mapped)
                    tailChunk->cpuPtr += chunkSliceOffset;
                // If the size of the head chunk is zero then we can directly replace it with our new one rather than inserting it
                auto headChunk{std::prev(tailChunk)};
                if (headChunk->size == 0)
                    *headChunk = newChunk;
                else
                    chunks.insert(std::next(headChunk), newChunk);
                return newChunk.virtualAddress;
            }
        }
        throw exception("Failed to insert chunk into GPU address space!");
    }
    /**
     * @brief Reserves a region of the GPU address space at an address picked by the manager
     * @param size The size of the region, it is rounded up to the GPU page size
     * @param alignment The alignment required of the region's base address
     * @return The virtual address of the reserved region or 0 if no unmapped chunk could satisfy the request
     */
    u64 GraphicsMemoryManager::ReserveSpace(u64 size, u64 alignment) {
        const u64 alignedSize{util::AlignUp(size, GpuPageSize)};
        std::unique_lock lock(mutex);

        auto candidate{FindChunk(ChunkState::Unmapped, alignedSize, alignment)};
        if (candidate) [[likely]] {
            // Only the front of the found chunk is claimed, InsertChunk splits the remainder off
            candidate->size = alignedSize;
            candidate->state = ChunkState::Reserved;
            return InsertChunk(*candidate);
        }
        return 0;
    }
    /**
     * @brief Reserves a region of the GPU address space at a caller-chosen address
     * @param virtualAddress The address of the region, it must be aligned to the GPU page size
     * @param size The size of the region, it is rounded up to the GPU page size
     * @return The virtual address of the reserved region or 0 if the address is misaligned
     */
    u64 GraphicsMemoryManager::ReserveFixed(u64 virtualAddress, u64 size) {
        if (util::IsAligned(virtualAddress, GpuPageSize)) [[likely]] {
            const u64 alignedSize{util::AlignUp(size, GpuPageSize)};
            std::unique_lock lock(mutex);
            return InsertChunk(ChunkDescriptor(virtualAddress, alignedSize, nullptr, ChunkState::Reserved));
        }
        return 0;
    }
    /**
     * @brief Maps CPU memory into the GPU address space at an address picked by the manager
     * @param cpuPtr The CPU side memory backing the mapping
     * @param size The size of the mapping, it is rounded up to the GPU page size
     * @return The virtual address of the mapping or 0 if no unmapped chunk could satisfy the request
     */
    u64 GraphicsMemoryManager::MapAllocate(u8 *cpuPtr, u64 size) {
        const u64 alignedSize{util::AlignUp(size, GpuPageSize)};
        std::unique_lock lock(mutex);

        auto candidate{FindChunk(ChunkState::Unmapped, alignedSize)};
        if (candidate) [[likely]] {
            // Only the front of the found chunk is claimed, InsertChunk splits the remainder off
            candidate->cpuPtr = cpuPtr;
            candidate->size = alignedSize;
            candidate->state = ChunkState::Mapped;
            return InsertChunk(*candidate);
        }
        return 0;
    }
    /**
     * @brief Maps CPU memory into the GPU address space at a caller-chosen address
     * @param virtualAddress The address of the mapping, it must be aligned to the GPU page size
     * @param cpuPtr The CPU side memory backing the mapping
     * @param size The size of the mapping, it is rounded up to the GPU page size
     * @return The virtual address of the mapping or 0 if the address is misaligned
     */
    u64 GraphicsMemoryManager::MapFixed(u64 virtualAddress, u8 *cpuPtr, u64 size) {
        if (util::IsAligned(virtualAddress, GpuPageSize)) [[likely]] {
            const u64 alignedSize{util::AlignUp(size, GpuPageSize)};
            std::unique_lock lock(mutex);
            return InsertChunk(ChunkDescriptor(virtualAddress, alignedSize, cpuPtr, ChunkState::Mapped));
        }
        return 0;
    }
    /**
     * @brief Marks a region of the GPU address space as unmapped
     * @param virtualAddress The address of the region, it must be aligned to the GPU page size
     * @param size The size of the region, it is used as-is without any rounding
     * @return If the region was unmapped, false is returned for a misaligned address or when inserting the unmapped chunk fails
     */
    bool GraphicsMemoryManager::Unmap(u64 virtualAddress, u64 size) {
        if (util::IsAligned(virtualAddress, GpuPageSize)) [[likely]] {
            try {
                std::unique_lock lock(mutex);
                InsertChunk(ChunkDescriptor(virtualAddress, size, nullptr, ChunkState::Unmapped));
                return true;
            } catch (const std::exception &) {
                // A failed insertion is reported through the return value below
            }
        }
        return false;
    }
    /**
     * @brief Copies a region of the GPU address space into a CPU side buffer
     * @param destination The buffer to copy into, it must be able to hold size bytes
     * @param virtualAddress The GPU virtual address to start reading from
     * @param size The amount of bytes to read
     * @throws exception If any part of the region isn't mapped
     */
    void GraphicsMemoryManager::Read(u8 *destination, u64 virtualAddress, u64 size) {
        std::shared_lock lock(mutex);

        // upper_bound yields the first chunk that starts *after* the address, the chunk containing the address is the one before it
        auto chunk{std::upper_bound(chunks.begin(), chunks.end(), virtualAddress, [](const u64 address, const ChunkDescriptor &chunk) -> bool {
            return address < chunk.virtualAddress;
        })};

        // If every chunk starts after the address then there is no chunk that could contain it
        if (chunk == chunks.begin())
            throw exception("Failed to read region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);

        chunk--;

        // The state has to be checked on the containing chunk itself, the address must also lie within the chunk's bounds
        const u64 chunkOffset{virtualAddress - chunk->virtualAddress};
        if (chunk->state != ChunkState::Mapped || chunkOffset >= chunk->size)
            throw exception("Failed to read region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);

        const u64 initialSize{size};
        u8 *source{chunk->cpuPtr + chunkOffset};
        u64 sourceSize{std::min(chunk->size - chunkOffset, size)};

        // A continuous region in the GPU address space may be made up of several discontinuous regions in physical memory so we have to iterate over all chunks
        while (size) {
            std::memcpy(destination + (initialSize - size), source, sourceSize);

            size -= sourceSize;
            if (size) {
                // Report the size that was requested rather than what is left to copy
                if (++chunk == chunks.end() || chunk->state != ChunkState::Mapped)
                    throw exception("Failed to read region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, initialSize);

                source = chunk->cpuPtr;
                sourceSize = std::min(chunk->size, size);
            }
        }
    }
    /**
     * @brief Copies a CPU side buffer into a region of the GPU address space
     * @param source The buffer to copy from, it must hold at least size bytes
     * @param virtualAddress The GPU virtual address to start writing to
     * @param size The amount of bytes to write
     * @throws exception If any part of the region isn't mapped
     */
    void GraphicsMemoryManager::Write(u8 *source, u64 virtualAddress, u64 size) {
        std::shared_lock lock(mutex);

        // upper_bound yields the first chunk that starts *after* the address, the chunk containing the address is the one before it
        auto chunk{std::upper_bound(chunks.begin(), chunks.end(), virtualAddress, [](const u64 address, const ChunkDescriptor &chunk) -> bool {
            return address < chunk.virtualAddress;
        })};

        // If every chunk starts after the address then there is no chunk that could contain it
        if (chunk == chunks.begin())
            throw exception("Failed to write region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);

        chunk--;

        // The state has to be checked on the containing chunk itself, the address must also lie within the chunk's bounds
        const u64 chunkOffset{virtualAddress - chunk->virtualAddress};
        if (chunk->state != ChunkState::Mapped || chunkOffset >= chunk->size)
            throw exception("Failed to write region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);

        const u64 initialSize{size};
        u8 *destination{chunk->cpuPtr + chunkOffset};
        u64 destinationSize{std::min(chunk->size - chunkOffset, size)};

        // A continuous region in the GPU address space may be made up of several discontinuous regions in physical memory so we have to iterate over all chunks
        while (size) {
            std::memcpy(destination, source + (initialSize - size), destinationSize);

            size -= destinationSize;
            if (size) {
                // Report the size that was requested rather than what is left to copy
                if (++chunk == chunks.end() || chunk->state != ChunkState::Mapped)
                    throw exception("Failed to write region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, initialSize);

                destination = chunk->cpuPtr;
                destinationSize = std::min(chunk->size, size);
            }
        }
    }
 }
--- a/app/src/main/cpp/skyline/soc/gmmu.h
+++ b/app/src/main/cpp/skyline/soc/gmmu.h
@ -1,140 +0,0 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #pragma once
 #include <common.h>
 namespace skyline::soc::gmmu {
    /**
     * @brief The state that a chunk of the GPU virtual address space can be in
     */
    enum class ChunkState {
        Unmapped, //!< Nothing is mapped or reserved at the chunk
        Reserved, //!< The chunk has been reserved
        Mapped, //!< The chunk is mapped, a CPU side address is present for it
    };
    /**
     * @brief Describes a single chunk of the GPU virtual address space and, when mapped, the CPU memory it points to
     */
    struct ChunkDescriptor {
        u64 virtualAddress; //!< The address of the chunk in the virtual address space
        u64 size; //!< The size of the chunk in bytes
        u8 *cpuPtr; //!< A pointer to the chunk in the application's address space (if mapped)
        ChunkState state; //!< Whether the chunk is unmapped, reserved or mapped

        ChunkDescriptor(u64 virtualAddress, u64 size, u8 *cpuPtr, ChunkState state) : virtualAddress(virtualAddress), size(size), cpuPtr(cpuPtr), state(state) {}

        /**
         * @param chunk The chunk whose address range is tested against this chunk's range
         * @return If the given chunk can be contained wholly within this chunk
         * @note Only the address ranges are compared, the state and CPU pointer of either chunk are not taken into account
         * @note This is const-qualified as it doesn't modify the chunk, allowing it to be called through const references
         */
        inline bool CanContain(const ChunkDescriptor &chunk) const {
            return (chunk.virtualAddress >= virtualAddress) && ((size + virtualAddress) >= (chunk.size + chunk.virtualAddress));
        }
    };
    /**
     * @brief The GraphicsMemoryManager class handles mapping between a Maxwell GPU virtual address space and an application's address space and is meant to roughly emulate the GMMU on the X1
     * @note This is not accurate to the X1 as it would have an SMMU between the GMMU and physical memory but we don't emulate this abstraction at the moment
     */
    class GraphicsMemoryManager {
      private:
        const DeviceState &state;
        std::vector<ChunkDescriptor> chunks;
        std::shared_mutex mutex;
        /**
         * @brief Finds a chunk in the virtual address space that is larger than meets the given requirements
         * @note vmmMutex MUST be locked when calling this
         * @param desiredState The state of the chunk to find
         * @param size The minimum size of the chunk to find
         * @param alignment The minimum alignment of the chunk to find
         * @return The first applicable chunk
         */
        std::optional<ChunkDescriptor> FindChunk(ChunkState desiredState, u64 size, u64 alignment = 0);
        /**
         * @brief Inserts a chunk into the chunk list, resizing and splitting as necessary
         * @note vmmMutex MUST be locked when calling this
         * @param newChunk The chunk to insert
         * @return The base virtual address of the inserted chunk
         */
        u64 InsertChunk(const ChunkDescriptor &newChunk);
      public:
        GraphicsMemoryManager(const DeviceState &state);
        /**
         * @brief Reserves a region of the virtual address space so it will not be chosen automatically when mapping
         * @param size The size of the region to reserve
         * @param alignment The alignment of the region to reserve
         * @return The base virtual address of the reserved region
         */
        u64 ReserveSpace(u64 size, u64 alignment);
        /**
         * @brief Reserves a fixed region of the virtual address space so it will not be chosen automatically when mapping
         * @param virtualAddress The virtual base address of the region to allocate
         * @param size The size of the region to allocate
         * @return The base virtual address of the reserved region
         */
        u64 ReserveFixed(u64 virtualAddress, u64 size);
        /**
         * @brief Maps a CPU memory region into an automatically chosen region of the virtual address space
         * @param cpuPtr A pointer to the region to be mapped into the virtual address space
         * @param size The size of the region to map
         * @return The base virtual address of the mapped region
         */
        u64 MapAllocate(u8 *cpuPtr, u64 size);
        /**
         * @brief Maps a CPU memory region to a fixed region in the virtual address space
         * @param virtualAddress The target virtual address of the region
         * @param cpuPtr A pointer to the region to be mapped into the virtual address space
         * @param size The size of the region to map
         * @return The base virtual address of the mapped region
         */
        u64 MapFixed(u64 virtualAddress, u8 *cpuPtr, u64 size);
        /**
         * @brief Unmaps all chunks in the given region from the virtual address space
         * @return Whether the operation succeeded
         */
        bool Unmap(u64 virtualAddress, u64 size);
        void Read(u8 *destination, u64 virtualAddress, u64 size);
        /**
         * @brief Reads in a span from a region of the virtual address space
         */
        template<typename T>
        void Read(span <T> destination, u64 virtualAddress) {
            Read(reinterpret_cast<u8 *>(destination.data()), virtualAddress, destination.size_bytes());
        }
        /**
         * @brief Reads in an object from a region of the virtual address space
         * @tparam T The type of object to return
         */
        template<typename T>
        T Read(u64 virtualAddress) {
            T obj;
            Read(reinterpret_cast<u8 *>(&obj), virtualAddress, sizeof(T));
            return obj;
        }
        void Write(u8 *source, u64 virtualAddress, u64 size);
        /**
         * @brief Writes out a span to a region of the virtual address space
         */
        template<typename T>
        void Write(span <T> source, u64 virtualAddress) {
            Write(reinterpret_cast<u8 *>(source.data()), virtualAddress, source.size_bytes());
        }
        /**
         * @brief Reads in an object from a region of the virtual address space
         */
        template<typename T>
        void Write(T source, u64 virtualAddress) {
            Write(reinterpret_cast<u8 *>(&source), virtualAddress, sizeof(T));
        }
    };
 }