Merge pull request #1264 from degasus/optimizations

video_core: Optimize the command processor.
2018-09-10 18:02:47 -04:00 · 2018-09-10 18:02:47 -04:00 · ae0c95efcc
parent ac959799e4 c1b8cd9058
commit ae0c95efcc
9 changed files with 123 additions and 126 deletions
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@ -8,6 +8,7 @@
 #include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_gpu.h"
 #include "core/memory.h"
 #include "video_core/command_processor.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
@ -134,17 +135,16 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
                params.address, params.num_entries, params.flags);
-    ASSERT_MSG(input.size() ==
+    ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) +
-                   sizeof(IoctlSubmitGpfifo) + params.num_entries * sizeof(IoctlGpfifoEntry),
+                                   params.num_entries * sizeof(Tegra::CommandListHeader),
               "Incorrect input size");
-    std::vector<IoctlGpfifoEntry> entries(params.num_entries);
+    std::vector<Tegra::CommandListHeader> entries(params.num_entries);
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
-                params.num_entries * sizeof(IoctlGpfifoEntry));
+                params.num_entries * sizeof(Tegra::CommandListHeader));
-    for (auto entry : entries) {
+
-        Tegra::GPUVAddr va_addr = entry.Address();
+    Core::System::GetInstance().GPU().ProcessCommandLists(entries);
-        Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz);
+
    }
    params.fence_out.id = 0;
    params.fence_out.value = 0;
    std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo));
@ -160,14 +160,12 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
                params.address, params.num_entries, params.flags);
-    std::vector<IoctlGpfifoEntry> entries(params.num_entries);
+    std::vector<Tegra::CommandListHeader> entries(params.num_entries);
    Memory::ReadBlock(params.address, entries.data(),
-                      params.num_entries * sizeof(IoctlGpfifoEntry));
+                      params.num_entries * sizeof(Tegra::CommandListHeader));
    Core::System::GetInstance().GPU().ProcessCommandLists(entries);
    for (auto entry : entries) {
        Tegra::GPUVAddr va_addr = entry.Address();
        Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz);
    }
    params.fence_out.id = 0;
    params.fence_out.value = 0;
    std::memcpy(output.data(), &params, output.size());
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@ -10,7 +10,6 @@
 #include "common/common_types.h"
 #include "common/swap.h"
 #include "core/hle/service/nvdrv/devices/nvdevice.h"
 #include "video_core/memory_manager.h"
 namespace Service::Nvidia::Devices {
@ -151,22 +150,6 @@ private:
    };
    static_assert(sizeof(IoctlAllocObjCtx) == 16, "IoctlAllocObjCtx is incorrect size");
    /// One GPFIFO entry as copied out of the ioctl input buffer: two u32_le words that
    /// together hold the GPU virtual address of a command list and its size.
    struct IoctlGpfifoEntry {
        u32_le entry0; // gpu_va_lo
        union {
            u32_le entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F)
            // The bit fields below are overlapping views of entry1.
            BitField<0, 8, u32_le> gpu_va_hi; // bits 0-7: address bits placed above entry0
            BitField<8, 2, u32_le> unk1;      // bits 8-9: meaning unknown
            BitField<10, 21, u32_le> sz;      // bits 10-30: size handed to the command processor
            BitField<31, 1, u32_le> unk2;     // bit 31: meaning unknown
        };

        /// Returns the GPU virtual address of the command list: gpu_va_hi shifted left by
        /// 32 bits, OR-ed with the low word entry0.
        Tegra::GPUVAddr Address() const {
            return (static_cast<Tegra::GPUVAddr>(gpu_va_hi) << 32) | entry0;
        }
    };
    // The entry must stay exactly 8 bytes, since entries are memcpy'd as a packed array.
    static_assert(sizeof(IoctlGpfifoEntry) == 8, "IoctlGpfifoEntry is incorrect size");
    struct IoctlSubmitGpfifo {
        u64_le address;     // pointer to gpfifo entry structs
        u32_le num_entries; // number of fence objects being submitted
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@ -28,7 +28,12 @@ enum class BufferMethods {
    CountBufferMethods = 0x40,
 };
-void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) {
+MICROPROFILE_DEFINE(ProcessCommandLists, "GPU", "Execute command buffer", MP_RGB(128, 128, 192));
 void GPU::ProcessCommandLists(const std::vector<CommandListHeader>& commands) {
    MICROPROFILE_SCOPE(ProcessCommandLists);
    auto WriteReg = [this](u32 method, u32 subchannel, u32 value, u32 remaining_params) {
        LOG_TRACE(HW_GPU,
                  "Processing method {:08X} on subchannel {} value "
                  "{:08X} remaining params {}",
@ -67,9 +72,11 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params)
        default:
            UNIMPLEMENTED_MSG("Unimplemented engine");
        }
-}
+    };
-void GPU::ProcessCommandList(GPUVAddr address, u32 size) {
+    for (auto entry : commands) {
        Tegra::GPUVAddr address = entry.Address();
        u32 size = entry.sz;
        const boost::optional<VAddr> head_address = memory_manager->GpuToCpuAddress(address);
        VAddr current_addr = *head_address;
        while (current_addr < *head_address + size * sizeof(CommandHeader)) {
@ -100,8 +107,8 @@ void GPU::ProcessCommandList(GPUVAddr address, u32 size) {
            case SubmissionMode::IncreaseOnce: {
                ASSERT(header.arg_count.Value() >= 1);
-            // Use the original method for the first argument and then the next method for all other
+                // Use the original method for the first argument and then the next method for all
-            // arguments.
+                // other arguments.
                WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
                         header.arg_count - 1);
                current_addr += sizeof(u32);
@ -123,5 +130,6 @@ void GPU::ProcessCommandList(GPUVAddr address, u32 size) {
            }
        }
    }
 }
 } // namespace Tegra
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@ -7,6 +7,7 @@
 #include <type_traits>
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "video_core/memory_manager.h"
 namespace Tegra {
@ -19,6 +20,22 @@ enum class SubmissionMode : u32 {
    IncreaseOnce = 5
 };
 /// Describes one command list to execute: where it lives in GPU virtual address space and
 /// how long it is. GPU::ProcessCommandLists takes a vector of these headers.
 struct CommandListHeader {
    u32 entry0; // gpu_va_lo
    union {
        u32 entry1; // gpu_va_hi | (unk_0x02 << 0x08) | (size << 0x0A) | (unk_0x01 << 0x1F)
        // The bit fields below are overlapping views of entry1.
        BitField<0, 8, u32> gpu_va_hi; // bits 0-7: address bits placed above entry0
        BitField<8, 2, u32> unk1;      // bits 8-9: meaning unknown
        BitField<10, 21, u32> sz;      // bits 10-30: list length, counted in CommandHeader-sized words
        BitField<31, 1, u32> unk2;     // bit 31: meaning unknown
    };

    /// Returns the GPU virtual address of the command list: gpu_va_hi shifted left by
    /// 32 bits, OR-ed with the low word entry0.
    GPUVAddr Address() const {
        return (static_cast<GPUVAddr>(gpu_va_hi) << 32) | entry0;
    }
 };
 // Headers are block-copied from guest buffers as a packed array, so the size must stay 8.
 static_assert(sizeof(CommandListHeader) == 8, "CommandListHeader is incorrect size");
 union CommandHeader {
    u32 hex;
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@ -135,8 +135,6 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
        break;
    }
    rasterizer.NotifyMaxwellRegisterChanged(method);
    if (debug_context) {
        debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandProcessed, nullptr);
    }
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@ -6,6 +6,7 @@
 #include <array>
 #include <memory>
 #include <vector>
 #include "common/common_types.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/memory_manager.h"
@ -67,6 +68,7 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
 /// Returns the number of bytes per pixel of each depth format.
 u32 DepthFormatBytesPerPixel(DepthFormat format);
 struct CommandListHeader;
 class DebugContext;
 /**
@ -115,7 +117,7 @@ public:
    ~GPU();
    /// Processes each command list in GPU memory described by the given headers.
-    void ProcessCommandList(GPUVAddr address, u32 size);
+    void ProcessCommandLists(const std::vector<CommandListHeader>& commands);
    /// Returns a reference to the Maxwell3D GPU engine.
    Engines::Maxwell3D& Maxwell3D();
@ -130,9 +132,6 @@ public:
    const Tegra::MemoryManager& MemoryManager() const;
 private:
    /// Writes a single register in the engine bound to the specified subchannel
    void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params);
    std::unique_ptr<Tegra::MemoryManager> memory_manager;
    /// Mapping of command subchannels to their bound engine ids.
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@ -20,9 +20,6 @@ public:
    /// Clear the current framebuffer
    virtual void Clear() = 0;
    /// Notify rasterizer that the specified Maxwell register has been changed
    virtual void NotifyMaxwellRegisterChanged(u32 method) = 0;
    /// Notify rasterizer that all caches should be flushed to Switch memory
    virtual void FlushAll() = 0;
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -527,8 +527,6 @@ void RasterizerOpenGL::DrawArrays() {
    state.Apply();
 }
 /// No-op: the body is empty, so this rasterizer takes no action on register-change notifications.
 void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {}
 /// No-op: the body is empty, so nothing is flushed here.
 void RasterizerOpenGL::FlushAll() {}
 /// No-op: the body is empty; addr and size are ignored.
 void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {}
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@ -45,7 +45,6 @@ public:
    void DrawArrays() override;
    void Clear() override;
    void NotifyMaxwellRegisterChanged(u32 method) override;
    void FlushAll() override;
    void FlushRegion(VAddr addr, u64 size) override;
    void InvalidateRegion(VAddr addr, u64 size) override;