Merge pull request #2429 from FernandoS27/compute

Corrections and Implementation on GPU Engines
2019-05-09 13:19:22 -04:00 · 2019-05-09 13:19:22 -04:00 · c27b81cb85
parent 0e9a17b029 e64c41efe8
commit c27b81cb85
13 changed files with 484 additions and 143 deletions
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -3,6 +3,8 @@ add_library(video_core STATIC
    dma_pusher.h
    debug_utils/debug_utils.cpp
    debug_utils/debug_utils.h
    engines/engine_upload.cpp
    engines/engine_upload.h
    engines/fermi_2d.cpp
    engines/fermi_2d.h
    engines/kepler_compute.cpp
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@ -0,0 +1,48 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include "common/assert.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/memory_manager.h"
 #include "video_core/textures/decoders.h"
 namespace Tegra::Engines::Upload {
 /// Binds the upload state to an engine's memory manager and upload register block.
 /// Both arguments are stored as references, so they must outlive this object.
 State::State(MemoryManager& memory_manager, Registers& regs)
    // Initializers are listed in member declaration order (regs, then memory_manager): that is
    // the order in which members are actually initialized, and it avoids -Wreorder warnings.
    : regs(regs), memory_manager(memory_manager) {}
 void State::ProcessExec(const bool is_linear) {
    write_offset = 0;
    copy_size = regs.line_length_in * regs.line_count;
    inner_buffer.resize(copy_size);
    this->is_linear = is_linear;
 }
 /// Appends one 32-bit data word to the staging buffer and, when the caller flags the final
 /// word, writes the staged bytes to GPU memory at regs.dest.Address().
 /// @param data         The data word to append.
 /// @param is_last_call Whether this is the final word; the engines in this file pass
 ///                     GPU::MethodCall::IsLastCall().
 void State::ProcessData(const u32 data, const bool is_last_call) {
    // Only the bytes still missing from the transfer are taken from the word, so the last word
    // of an upload whose size is not a multiple of 4 is truncated.
    const u32 sub_copy_size = std::min(4U, copy_size - write_offset);
    // Skip the copy once the buffer is full (or the transfer is empty): indexing the vector at
    // write_offset == size() is out of bounds even when the memcpy length is zero.
    if (sub_copy_size != 0) {
        std::memcpy(inner_buffer.data() + write_offset, &data, sub_copy_size);
        write_offset += sub_copy_size;
    }
    if (!is_last_call) {
        return;
    }
    const GPUVAddr address{regs.dest.Address()};
    if (is_linear) {
        // Linear destination: the staged bytes are written out as-is.
        memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
    } else {
        // Only 2D destinations with unit block width/depth are handled.
        UNIMPLEMENTED_IF(regs.dest.z != 0);
        UNIMPLEMENTED_IF(regs.dest.depth != 1);
        UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
        UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
        const std::size_t dst_size = Tegra::Texture::CalculateSize(
            true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
        // Read-modify-write: fetch the current destination contents, swizzle the staged data
        // into them at (x, y), then write the whole region back.
        tmp_buffer.resize(dst_size);
        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
        Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
                                      regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
                                      tmp_buffer.data());
        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
    }
 }
 } // namespace Tegra::Engines::Upload
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@ -0,0 +1,75 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <cstddef>
 #include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 namespace Tegra {
 class MemoryManager;
 }
 namespace Tegra::Engines::Upload {
 /// Register block describing an inline data upload: its size and its destination.
 /// The field order mirrors the hardware register layout and must not be changed.
 struct Registers {
    /// Length of one line; line_length_in * line_count is used as the upload size in bytes.
    u32 line_length_in;
    /// Number of lines in the upload.
    u32 line_count;
    /// Destination of the upload.
    struct {
        u32 address_high;
        u32 address_low;
        u32 pitch;
        // Block dimensions are stored as log2 values; see the Block*() accessors below.
        union {
            BitField<0, 4, u32> block_width;
            BitField<4, 4, u32> block_height;
            BitField<8, 4, u32> block_depth;
        };
        u32 width;
        u32 height;
        u32 depth;
        u32 z;
        u32 x;
        u32 y;
        /// Combines address_high (upper 32 bits) and address_low into one GPU virtual address.
        GPUVAddr Address() const {
            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
        }
        /// Returns 2^block_width.
        u32 BlockWidth() const {
            return 1U << block_width.Value();
        }
        /// Returns 2^block_height.
        u32 BlockHeight() const {
            return 1U << block_height.Value();
        }
        /// Returns 2^block_depth.
        u32 BlockDepth() const {
            return 1U << block_depth.Value();
        }
    } dest;
 };
 /// Accumulates the data words of an inline upload in a staging buffer and writes them to GPU
 /// memory once the final word has been received.
 class State {
 public:
    /// Stores references to the memory manager and the register block; neither is copied.
    State(MemoryManager& memory_manager, Registers& regs);
    ~State() = default;
    /// Starts a new upload: resets the write cursor and sizes the staging buffer from regs.
    /// @param is_linear Whether the destination is written linearly rather than swizzled.
    void ProcessExec(const bool is_linear);
    /// Appends one data word; when is_last_call is set, flushes the staged data to GPU memory.
    void ProcessData(const u32 data, const bool is_last_call);
 private:
    // Number of bytes already staged in inner_buffer.
    u32 write_offset = 0;
    // Total size in bytes of the current upload.
    u32 copy_size = 0;
    // Staging buffer holding the uploaded bytes.
    std::vector<u8> inner_buffer;
    // Scratch buffer holding the destination contents during a swizzled write.
    std::vector<u8> tmp_buffer;
    // Layout mode latched by the most recent ProcessExec().
    bool is_linear = false;
    // NOTE: members are initialized in declaration order, so regs is initialized before
    // memory_manager regardless of the order used in the constructor's initializer list.
    Registers& regs;
    MemoryManager& memory_manager;
 };
 } // namespace Tegra::Engines::Upload
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@ -21,6 +21,12 @@ class RasterizerInterface;
 namespace Tegra::Engines {
 /**
 * This Engine is known as G80_2D. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
 * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
 */
 #define FERMI2D_REG_INDEX(field_name)                                                              \
    (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@ -4,12 +4,21 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "video_core/engines/kepler_compute.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
 #include "video_core/textures/decoders.h"
 namespace Tegra::Engines {
-KeplerCompute::KeplerCompute(MemoryManager& memory_manager) : memory_manager{memory_manager} {}
+KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                             MemoryManager& memory_manager)
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, upload_state{
                                                                                  memory_manager,
                                                                                  regs.upload} {}
 KeplerCompute::~KeplerCompute() = default;
@ -20,14 +29,34 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
    regs.reg_array[method_call.method] = method_call.argument;
    switch (method_call.method) {
    case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
        upload_state.ProcessExec(regs.exec_upload.linear != 0);
        break;
    }
    case KEPLER_COMPUTE_REG_INDEX(data_upload): {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
        }
        break;
    }
    case KEPLER_COMPUTE_REG_INDEX(launch):
-        // Abort execution since compute shaders can be used to alter game memory (e.g. CUDA
+        ProcessLaunch();
        // kernels)
        UNREACHABLE_MSG("Compute shaders are not implemented");
        break;
    default:
        break;
    }
 }
 void KeplerCompute::ProcessLaunch() {
    const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
    memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
                                   LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
    const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
    LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
 }
 } // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@ -6,22 +6,40 @@
 #include <array>
 #include <cstddef>
 #include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 namespace Core {
 class System;
 }
 namespace Tegra {
 class MemoryManager;
 }
 namespace VideoCore {
 class RasterizerInterface;
 }
 namespace Tegra::Engines {
 /**
 * This Engine is known as GK104_Compute. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml
 * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
 */
 #define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
    (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
 class KeplerCompute final {
 public:
-    explicit KeplerCompute(MemoryManager& memory_manager);
+    explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                           MemoryManager& memory_manager);
    ~KeplerCompute();
    static constexpr std::size_t NumConstBuffers = 8;
@ -31,30 +49,181 @@ public:
        union {
            struct {
-                INSERT_PADDING_WORDS(0xAF);
+                INSERT_PADDING_WORDS(0x60);
                Upload::Registers upload;
                struct {
                    union {
                        BitField<0, 1, u32> linear;
                    };
                } exec_upload;
                u32 data_upload;
                INSERT_PADDING_WORDS(0x3F);
                struct {
                    u32 address;
                    GPUVAddr Address() const {
                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8));
                    }
                } launch_desc_loc;
                INSERT_PADDING_WORDS(0x1);
                u32 launch;
-                INSERT_PADDING_WORDS(0xC48);
+                INSERT_PADDING_WORDS(0x4A7);
                struct {
                    u32 address_high;
                    u32 address_low;
                    u32 limit;
                    GPUVAddr Address() const {
                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                     address_low);
                    }
                } tsc;
                INSERT_PADDING_WORDS(0x3);
                struct {
                    u32 address_high;
                    u32 address_low;
                    u32 limit;
                    GPUVAddr Address() const {
                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                     address_low);
                    }
                } tic;
                INSERT_PADDING_WORDS(0x22);
                struct {
                    u32 address_high;
                    u32 address_low;
                    GPUVAddr Address() const {
                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                     address_low);
                    }
                } code_loc;
                INSERT_PADDING_WORDS(0x3FE);
                u32 texture_const_buffer_index;
                INSERT_PADDING_WORDS(0x374);
            };
            std::array<u32, NUM_REGS> reg_array;
        };
    } regs{};
    /// In-memory layout of the launch descriptor that ProcessLaunch() reads from GPU memory.
    /// The structure is exactly NUM_LAUNCH_PARAMETERS 32-bit words long (checked by a
    /// static_assert below), so fields and padding must not be reordered or resized.
    struct LaunchParams {
        /// Size of the descriptor in 32-bit words.
        static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40;
        INSERT_PADDING_WORDS(0x8);
        /// Offset added to regs.code_loc to obtain the kernel address.
        u32 program_start;
        INSERT_PADDING_WORDS(0x2);
        BitField<30, 1, u32> linked_tsc;
        BitField<0, 31, u32> grid_dim_x;
        // grid_dim_y and grid_dim_z share one word (low and high 16 bits).
        union {
            BitField<0, 16, u32> grid_dim_y;
            BitField<16, 16, u32> grid_dim_z;
        };
        INSERT_PADDING_WORDS(0x3);
        BitField<0, 16, u32> shared_alloc;
        BitField<0, 31, u32> block_dim_x;
        // block_dim_y and block_dim_z share one word (low and high 16 bits).
        union {
            BitField<0, 16, u32> block_dim_y;
            BitField<16, 16, u32> block_dim_z;
        };
        union {
            BitField<0, 8, u32> const_buffer_enable_mask;
            BitField<29, 2, u32> cache_layout;
        } memory_config;
        INSERT_PADDING_WORDS(0x8);
        // One two-word entry per constant buffer.
        struct {
            u32 address_low;
            // The second word packs the upper address bits (bits 0-7) and the size (bits 15-31).
            union {
                BitField<0, 8, u32> address_high;
                BitField<15, 17, u32> size;
            };
            /// Combines address_high (placed above bit 31) and address_low into one address.
            GPUVAddr Address() const {
                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
                                             address_low);
            }
        } const_buffer_config[8];
        union {
            BitField<0, 20, u32> local_pos_alloc;
            BitField<27, 5, u32> barrier_alloc;
        };
        union {
            BitField<0, 20, u32> local_neg_alloc;
            BitField<24, 5, u32> gpr_alloc;
        };
        INSERT_PADDING_WORDS(0x11);
    } launch_description;
    struct {
        u32 write_offset = 0;
        u32 copy_size = 0;
        std::vector<u8> inner_buffer;
    } state{};
    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
                  "KeplerCompute Regs has wrong size");
    static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32),
                  "KeplerCompute LaunchParams has wrong size");
    /// Write the value to the register identified by method.
    void CallMethod(const GPU::MethodCall& method_call);
 private:
    Core::System& system;
    VideoCore::RasterizerInterface& rasterizer;
    MemoryManager& memory_manager;
    Upload::State upload_state;
    void ProcessLaunch();
 };
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
    static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4,                       \
                  "Field " #field_name " has invalid position")
 #define ASSERT_LAUNCH_PARAM_POSITION(field_name, position)                                         \
    static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4,               \
                  "Field " #field_name " has invalid position")
 ASSERT_REG_POSITION(upload, 0x60);
 ASSERT_REG_POSITION(exec_upload, 0x6C);
 ASSERT_REG_POSITION(data_upload, 0x6D);
 ASSERT_REG_POSITION(launch, 0xAF);
 ASSERT_REG_POSITION(tsc, 0x557);
 ASSERT_REG_POSITION(tic, 0x55D);
 ASSERT_REG_POSITION(code_loc, 0x582);
 ASSERT_REG_POSITION(texture_const_buffer_index, 0x982);
 ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
 ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
 ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
 ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
 ASSERT_LAUNCH_PARAM_POSITION(memory_config, 0x14);
 ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
 #undef ASSERT_REG_POSITION
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@ -14,9 +14,8 @@
 namespace Tegra::Engines {
-KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
-                           MemoryManager& memory_manager)
+    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
 KeplerMemory::~KeplerMemory() = default;
@ -28,46 +27,18 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
    switch (method_call.method) {
    case KEPLERMEMORY_REG_INDEX(exec): {
-        ProcessExec();
+        upload_state.ProcessExec(regs.exec.linear != 0);
        break;
    }
    case KEPLERMEMORY_REG_INDEX(data): {
-        ProcessData(method_call.argument, method_call.IsLastCall());
+        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
        }
        break;
    }
    }
 }
 /// Resets the write cursor and sizes the staging buffer to line_length_in * line_count bytes.
 // NOTE(review): CallMethod now calls upload_state.ProcessExec() instead, so this definition
 // looks superseded by Upload::State - confirm whether it is meant to remain.
 void KeplerMemory::ProcessExec() {
    state.write_offset = 0;
    state.copy_size = regs.line_length_in * regs.line_count;
    state.inner_buffer.resize(state.copy_size);
 }
 /// Stages up to four bytes of the data register and, on the last call, writes the staged
 /// bytes to GPU memory at regs.dest.Address() and marks Maxwell3D state dirty.
 // NOTE(review): CallMethod now calls upload_state.ProcessData() instead, so this definition
 // looks superseded by Upload::State - confirm whether it is meant to remain.
 void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
    const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset);
    // The bytes are copied from regs.data; the `data` parameter itself is not read.
    std::memcpy(&state.inner_buffer[state.write_offset], &regs.data, sub_copy_size);
    state.write_offset += sub_copy_size;
    if (is_last_call) {
        const GPUVAddr address{regs.dest.Address()};
        if (regs.exec.linear != 0) {
            // Linear destination: the staged bytes are written out as-is.
            memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size);
        } else {
            // Only 2D destinations with unit block width/depth are handled.
            UNIMPLEMENTED_IF(regs.dest.z != 0);
            UNIMPLEMENTED_IF(regs.dest.depth != 1);
            UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
            UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
            const std::size_t dst_size = Tegra::Texture::CalculateSize(
                true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
            // Read-modify-write of the destination region with the staged data swizzled in.
            std::vector<u8> tmp_buffer(dst_size);
            memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
            Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x,
                                          regs.dest.y, regs.dest.BlockHeight(), state.copy_size,
                                          state.inner_buffer.data(), tmp_buffer.data());
            memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
        }
        system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
    }
 }
 } // namespace Tegra::Engines
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@ -10,6 +10,7 @@
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 namespace Core {
@ -20,19 +21,20 @@ namespace Tegra {
 class MemoryManager;
 }
 namespace VideoCore {
 class RasterizerInterface;
 }
 namespace Tegra::Engines {
 /**
 * This Engine is known as P2MF. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml
 * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
 */
 #define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
    (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
 class KeplerMemory final {
 public:
-    KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+    KeplerMemory(Core::System& system, MemoryManager& memory_manager);
                 MemoryManager& memory_manager);
    ~KeplerMemory();
    /// Write the value to the register identified by method.
@ -45,42 +47,7 @@ public:
            struct {
                INSERT_PADDING_WORDS(0x60);
-                u32 line_length_in;
+                Upload::Registers upload;
                u32 line_count;
                struct {
                    u32 address_high;
                    u32 address_low;
                    u32 pitch;
                    union {
                        BitField<0, 4, u32> block_width;
                        BitField<4, 4, u32> block_height;
                        BitField<8, 4, u32> block_depth;
                    };
                    u32 width;
                    u32 height;
                    u32 depth;
                    u32 z;
                    u32 x;
                    u32 y;
                    GPUVAddr Address() const {
                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                     address_low);
                    }
                    u32 BlockWidth() const {
                        return 1U << block_width.Value();
                    }
                    u32 BlockHeight() const {
                        return 1U << block_height.Value();
                    }
                    u32 BlockDepth() const {
                        return 1U << block_depth.Value();
                    }
                } dest;
                struct {
                    union {
@ -96,28 +63,17 @@ public:
        };
    } regs{};
    struct {
        u32 write_offset = 0;
        u32 copy_size = 0;
        std::vector<u8> inner_buffer;
    } state{};
 private:
    Core::System& system;
    VideoCore::RasterizerInterface& rasterizer;
    MemoryManager& memory_manager;
-
+    Upload::State upload_state;
    void ProcessExec();
    void ProcessData(u32 data, bool is_last_call);
 };
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
    static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
                  "Field " #field_name " has invalid position")
-ASSERT_REG_POSITION(line_length_in, 0x60);
+ASSERT_REG_POSITION(upload, 0x60);
 ASSERT_REG_POSITION(line_count, 0x61);
 ASSERT_REG_POSITION(dest, 0x62);
 ASSERT_REG_POSITION(exec, 0x6C);
 ASSERT_REG_POSITION(data, 0x6D);
 #undef ASSERT_REG_POSITION
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@ -20,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                     MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{
+    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
-                                                                                  *this} {
+      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
    InitializeRegisterDefaults();
 }
@ -253,6 +253,18 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        ProcessSyncPoint();
        break;
    }
    case MAXWELL3D_REG_INDEX(exec_upload): {
        upload_state.ProcessExec(regs.exec_upload.linear != 0);
        break;
    }
    case MAXWELL3D_REG_INDEX(data_upload): {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
            dirty_flags.OnMemoryWrite();
        }
        break;
    }
    default:
        break;
    }
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@ -14,6 +14,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 #include "video_core/macro_interpreter.h"
 #include "video_core/textures/texture.h"
@ -32,6 +33,12 @@ class RasterizerInterface;
 namespace Tegra::Engines {
 /**
 * This Engine is known as GF100_3D. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/graph/gf100_3d.xml
 * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
 */
 #define MAXWELL3D_REG_INDEX(field_name)                                                            \
    (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32))
@ -580,7 +587,18 @@ public:
                    u32 bind;
                } macros;
-                INSERT_PADDING_WORDS(0x69);
+                INSERT_PADDING_WORDS(0x17);
                Upload::Registers upload;
                struct {
                    union {
                        BitField<0, 1, u32> linear;
                    };
                } exec_upload;
                u32 data_upload;
                INSERT_PADDING_WORDS(0x44);
                struct {
                    union {
@ -1176,6 +1194,8 @@ private:
    /// Interpreter for the macro codes uploaded to the GPU.
    MacroInterpreter macro_interpreter;
    Upload::State upload_state;
    /// Retrieves information about a specific TIC entry from the TIC buffer.
    Texture::TICEntry GetTICEntry(u32 tic_index) const;
@ -1219,6 +1239,9 @@ private:
                  "Field " #field_name " has invalid position")
 ASSERT_REG_POSITION(macros, 0x45);
 ASSERT_REG_POSITION(upload, 0x60);
 ASSERT_REG_POSITION(exec_upload, 0x6C);
 ASSERT_REG_POSITION(data_upload, 0x6D);
 ASSERT_REG_POSITION(sync_info, 0xB2);
 ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
 ASSERT_REG_POSITION(rt, 0x200);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@ -83,57 +83,66 @@ void MaxwellDMA::HandleCopy() {
    ASSERT(regs.exec.enable_2d == 1);
    const std::size_t copy_size = regs.x_count * regs.y_count;
    auto source_ptr{memory_manager.GetPointer(source)};
    auto dst_ptr{memory_manager.GetPointer(dest)};
    if (!source_ptr) {
        LOG_ERROR(HW_GPU, "source_ptr is invalid");
        return;
    }
    if (!dst_ptr) {
        LOG_ERROR(HW_GPU, "dst_ptr is invalid");
        return;
    }
    const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
        // copying.
        rasterizer.FlushRegion(ToCacheAddr(source_ptr), src_size);
        // We have to invalidate the destination region to evict any outdated surfaces from the
        // cache. We do this before actually writing the new data because the destination address
        // might contain a dirty surface that will have to be written back to memory.
        rasterizer.InvalidateRegion(ToCacheAddr(dst_ptr), dst_size);
    };
    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
        ASSERT(regs.src_params.size_z == 1);
        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
        const std::size_t src_size = Texture::CalculateSize(
            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
            regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
-        FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y,
+        const std::size_t dst_size = regs.dst_pitch * regs.y_count;
-                           copy_size * src_bytes_per_pixel);
+
        if (read_buffer.size() < src_size) {
            read_buffer.resize(src_size);
        }
        if (write_buffer.size() < dst_size) {
            write_buffer.resize(dst_size);
        }
        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
-                                  regs.src_params.size_x, src_bytes_per_pixel, source_ptr, dst_ptr,
+                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
-                                  regs.src_params.BlockHeight(), regs.src_params.pos_x,
+                                  write_buffer.data(), regs.src_params.BlockHeight(),
-                                  regs.src_params.pos_y);
+                                  regs.src_params.pos_x, regs.src_params.pos_y);
        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
    } else {
-        ASSERT(regs.dst_params.size_z == 1);
+        ASSERT(regs.dst_params.BlockDepth() == 1);
        ASSERT(regs.src_pitch == regs.x_count);
-        const u32 src_bpp = regs.src_pitch / regs.x_count;
+        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
-        FlushAndInvalidate(regs.src_pitch * regs.y_count,
+        const std::size_t dst_size = Texture::CalculateSize(
-                           regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);
+            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
            regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
        const std::size_t dst_layer_size = Texture::CalculateSize(
            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
            regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
        const std::size_t src_size = regs.src_pitch * regs.y_count;
        if (read_buffer.size() < src_size) {
            read_buffer.resize(src_size);
        }
        if (write_buffer.size() < dst_size) {
            write_buffer.resize(dst_size);
        }
        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
        // If the input is linear and the output is tiled, swizzle the input and copy it over.
        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
-                                src_bpp, dst_ptr, source_ptr, regs.dst_params.BlockHeight());
+                                src_bytes_per_pixel,
                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
                                read_buffer.data(), regs.dst_params.BlockHeight());
        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
    }
 }
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@ -25,6 +26,11 @@ class RasterizerInterface;
 namespace Tegra::Engines {
 /**
 * This Engine is known as GK104_Copy. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml
 */
 class MaxwellDMA final {
 public:
    explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
@ -63,6 +69,16 @@ public:
        static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
        /// Value type of the component0..component3 fields of swizzle_config.
        // NOTE(review): presumably SrcN selects source component N, ConstN selects the
        // const0/const1 register and Zero writes zero - confirm against the hardware docs.
        enum class ComponentMode : u32 {
            Src0 = 0,
            Src1 = 1,
            Src2 = 2,
            Src3 = 3,
            Const0 = 4,
            Const1 = 5,
            Zero = 6,
        };
        enum class CopyMode : u32 {
            None = 0,
            Unk1 = 1,
@ -128,7 +144,26 @@ public:
                u32 x_count;
                u32 y_count;
-                INSERT_PADDING_WORDS(0xBB);
+                INSERT_PADDING_WORDS(0xB8);
                u32 const0;
                u32 const1;
                union {
                    BitField<0, 4, ComponentMode> component0;
                    BitField<4, 4, ComponentMode> component1;
                    BitField<8, 4, ComponentMode> component2;
                    BitField<12, 4, ComponentMode> component3;
                    BitField<16, 2, u32> component_size;
                    BitField<20, 3, u32> src_num_components;
                    BitField<24, 3, u32> dst_num_components;
                    u32 SrcBytePerPixel() const {
                        return src_num_components.Value() * component_size.Value();
                    }
                    u32 DstBytePerPixel() const {
                        return dst_num_components.Value() * component_size.Value();
                    }
                } swizzle_config;
                Parameters dst_params;
@ -149,6 +184,9 @@ private:
    MemoryManager& memory_manager;
    std::vector<u8> read_buffer;
    std::vector<u8> write_buffer;
    /// Performs the copy from the source buffer to the destination buffer as configured in the
    /// registers.
    void HandleCopy();
@ -165,6 +203,9 @@ ASSERT_REG_POSITION(src_pitch, 0x104);
 ASSERT_REG_POSITION(dst_pitch, 0x105);
 ASSERT_REG_POSITION(x_count, 0x106);
 ASSERT_REG_POSITION(y_count, 0x107);
 ASSERT_REG_POSITION(const0, 0x1C0);
 ASSERT_REG_POSITION(const1, 0x1C1);
 ASSERT_REG_POSITION(swizzle_config, 0x1C2);
 ASSERT_REG_POSITION(dst_params, 0x1C3);
 ASSERT_REG_POSITION(src_params, 0x1CA);
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{ren
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
-    kepler_compute = std::make_unique<Engines::KeplerCompute>(*memory_manager);
+    kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
-    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, rasterizer, *memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
 }
 GPU::~GPU() = default;