Merge pull request #2912 from FernandoS27/async-fixes

General fixes to Async GPU
2019-10-16 10:34:48 -04:00 · 2019-10-16 10:34:48 -04:00 · ef9b31783d
parent 60315060b1 cfc2f30dc4
commit ef9b31783d
16 changed files with 67 additions and 52 deletions
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@ -256,6 +256,8 @@ struct System::Impl {
        is_powered_on = false;
        exit_lock = false;

+        gpu_core->WaitIdle();
+
        // Shutdown emulation session
        renderer.reset();
        GDBStub::Shutdown();
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
+#include "core/core_timing.h"
 #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h"
 #include "core/hle/service/nvdrv/devices/nvmap.h"
 #include "core/perf_stats.h"
@ -38,7 +39,10 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3
        transform, crop_rect};

    system.GetPerfStats().EndGameFrame();
+    system.GetPerfStats().EndSystemFrame();
    system.GPU().SwapBuffers(&framebuffer);
+    system.FrameLimiter().DoFrameLimiting(system.CoreTiming().GetGlobalTimeUs());
+    system.GetPerfStats().BeginSystemFrame();
 }

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
@ -63,16 +63,26 @@ u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>&
        return NvResult::BadParameter;
    }

+    u32 event_id = params.value & 0x00FF;
+
+    if (event_id >= MaxNvEvents) {
+        std::memcpy(output.data(), &params, sizeof(params));
+        return NvResult::BadParameter;
+    }
+
+    auto event = events_interface.events[event_id];
    auto& gpu = system.GPU();
    // This is mostly to take into account unimplemented features. As synced
    // gpu is always synced.
    if (!gpu.IsAsync()) {
+        event.writable->Signal();
        return NvResult::Success;
    }
    auto lock = gpu.LockSync();
    const u32 current_syncpoint_value = gpu.GetSyncpointValue(params.syncpt_id);
    const s32 diff = current_syncpoint_value - params.threshold;
    if (diff >= 0) {
+        event.writable->Signal();
        params.value = current_syncpoint_value;
        std::memcpy(output.data(), &params, sizeof(params));
        return NvResult::Success;
@ -88,27 +98,6 @@ u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>&
        return NvResult::Timeout;
    }

-    u32 event_id;
-    if (is_async) {
-        event_id = params.value & 0x00FF;
-        if (event_id >= MaxNvEvents) {
-            std::memcpy(output.data(), &params, sizeof(params));
-            return NvResult::BadParameter;
-        }
-    } else {
-        if (ctrl.fresh_call) {
-            const auto result = events_interface.GetFreeEvent();
-            if (result) {
-                event_id = *result;
-            } else {
-                LOG_CRITICAL(Service_NVDRV, "No Free Events available!");
-                event_id = params.value & 0x00FF;
-            }
-        } else {
-            event_id = ctrl.event_id;
-        }
-    }
-
    EventState status = events_interface.status[event_id];
    if (event_id < MaxNvEvents || status == EventState::Free || status == EventState::Registered) {
        events_interface.SetEventStatus(event_id, EventState::Waiting);
@ -120,7 +109,7 @@ u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>&
            params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000;
        }
        params.value |= event_id;
-        events_interface.events[event_id].writable->Clear();
+        event.writable->Clear();
        gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value);
        if (!is_async && ctrl.fresh_call) {
            ctrl.must_delay = true;
--- a/src/core/hle/service/nvdrv/interface.cpp
+++ b/src/core/hle/service/nvdrv/interface.cpp
@ -134,7 +134,9 @@ void NVDRV::QueryEvent(Kernel::HLERequestContext& ctx) {
    IPC::ResponseBuilder rb{ctx, 3, 1};
    rb.Push(RESULT_SUCCESS);
    if (event_id < MaxNvEvents) {
-        rb.PushCopyObjects(nvdrv->GetEvent(event_id));
+        auto event = nvdrv->GetEvent(event_id);
+        event->Clear();
+        rb.PushCopyObjects(event);
        rb.Push<u32>(NvResult::Success);
    } else {
        rb.Push<u32>(0);
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@ -40,8 +40,8 @@ Module::Module(Core::System& system) {
    auto& kernel = system.Kernel();
    for (u32 i = 0; i < MaxNvEvents; i++) {
        std::string event_label = fmt::format("NVDRV::NvEvent_{}", i);
-        events_interface.events[i] = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, event_label);
+        events_interface.events[i] =
+            Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual, event_label);
        events_interface.status[i] = EventState::Free;
        events_interface.registered[i] = false;
    }
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@ -187,14 +187,18 @@ void NVFlinger::Compose() {
        MicroProfileFlip();

        if (!buffer) {
-            // There was no queued buffer to draw, render previous frame
-            system.GetPerfStats().EndGameFrame();
-            system.GPU().SwapBuffers({});
            continue;
        }

        const auto& igbp_buffer = buffer->get().igbp_buffer;

+        const auto& gpu = system.GPU();
+        const auto& multi_fence = buffer->get().multi_fence;
+        for (u32 fence_id = 0; fence_id < multi_fence.num_fences; fence_id++) {
+            const auto& fence = multi_fence.fences[fence_id];
+            gpu.WaitFence(fence.id, fence.value);
+        }
+
        // Now send the buffer to the GPU for drawing.
        // TODO(Subv): Support more than just disp0. The display device selection is probably based
        // on which display we're drawing (Default, Internal, External, etc)
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include "common/assert.h"
+#include "common/microprofile.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/memory.h"
@ -17,6 +18,8 @@

 namespace Tegra {

+MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
+
 GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
    : system{system}, renderer{renderer}, is_async{is_async} {
    auto& rasterizer{renderer.Rasterizer()};
@ -63,6 +66,16 @@ const DmaPusher& GPU::DmaPusher() const {
    return *dma_pusher;
 }

+void GPU::WaitFence(u32 syncpoint_id, u32 value) const { // Blocks the caller until syncpoints[syncpoint_id] is at least `value`.
+    // Synced GPU, is always in sync
+    if (!is_async) {
+        return;
+    }
+    MICROPROFILE_SCOPE(GPU_wait); // Profiler scope for the GPU_wait timer ("Wait for the GPU") defined at the top of this file.
+    while (syncpoints[syncpoint_id].load(std::memory_order_relaxed) < value) { // Spin on a relaxed load; syncpoint_id is used as an index without a bounds check here.
+    } // Empty body: pure busy-wait, no yield or sleep. NOTE(review): plain `<` compare - confirm behaviour if the counter wraps.
+}
+
 void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
    syncpoints[syncpoint_id]++;
    std::lock_guard lock{sync_mutex};
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@ -177,6 +177,12 @@ public:
    /// Returns a reference to the GPU DMA pusher.
    Tegra::DmaPusher& DmaPusher();

+    // Waits for the GPU to finish working
+    virtual void WaitIdle() const = 0;
+
+    /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
+    void WaitFence(u32 syncpoint_id, u32 value) const;
+
    void IncrementSyncPoint(u32 syncpoint_id);

    u32 GetSyncpointValue(u32 syncpoint_id) const;
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@ -44,4 +44,8 @@ void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) con
    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
 }

+void GPUAsynch::WaitIdle() const { // Asynchronous-GPU implementation of the WaitIdle() override.
+    gpu_thread.WaitIdle(); // Forwards to the gpu_thread member; no other work is done here.
+}
+
 } // namespace VideoCommon
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@ -25,6 +25,7 @@ public:
    void FlushRegion(CacheAddr addr, u64 size) override;
    void InvalidateRegion(CacheAddr addr, u64 size) override;
    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+    void WaitIdle() const override;

 protected:
    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@ -24,6 +24,7 @@ public:
    void FlushRegion(CacheAddr addr, u64 size) override;
    void InvalidateRegion(CacheAddr addr, u64 size) override;
    void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+    void WaitIdle() const override {}

 protected:
    void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@ -5,8 +5,6 @@
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
-#include "core/core_timing.h"
-#include "core/core_timing_util.h"
 #include "core/frontend/scope_acquire_window_context.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
@ -68,14 +66,10 @@ ThreadManager::~ThreadManager() {

 void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) {
    thread = std::thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)};
-    synchronization_event = system.CoreTiming().RegisterEvent(
-        "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
 }

 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
-    const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
-    const s64 synchronization_ticks{Core::Timing::usToCycles(std::chrono::microseconds{9000})};
-    system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence);
+    PushCommand(SubmitListCommand(std::move(entries)));
 }

 void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
@ -96,16 +90,15 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
    InvalidateRegion(addr, size);
 }

+void ThreadManager::WaitIdle() const { // Returns once signaled_fence is no longer behind last_fence.
+    while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed)) { // last_fence is the value most recently handed out by PushCommand.
+    } // Empty body: busy-wait. Presumably signaled_fence is advanced by the GPU thread as commands complete - TODO confirm.
+}
+
u64 ThreadManager::PushCommand(CommandData&& command_data) { // Queues a command and returns the fence number assigned to it.
    const u64 fence{++state.last_fence}; // Pre-increment: each pushed command gets the next fence value.
    state.queue.Push(CommandDataContainer(std::move(command_data), fence)); // The command is moved into a container together with its fence.
    return fence;
 }

-MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
-void SynchState::WaitForSynchronization(u64 fence) {
-    while (signaled_fence.load() < fence)
-        ;
-}
-
 } // namespace VideoCommon::GPUThread
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@ -21,9 +21,6 @@ class DmaPusher;

 namespace Core {
 class System;
-namespace Timing {
-struct EventType;
-} // namespace Timing
 } // namespace Core

 namespace VideoCommon::GPUThread {
@ -89,8 +86,6 @@ struct CommandDataContainer {
 struct SynchState final {
    std::atomic_bool is_running{true};

-    void WaitForSynchronization(u64 fence);
-
    using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
    CommandQueue queue;
    u64 last_fence{};
@ -121,6 +116,9 @@ public:
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    void FlushAndInvalidateRegion(CacheAddr addr, u64 size);

+    // Wait until the gpu thread is idle.
+    void WaitIdle() const;
+
 private:
    /// Pushes a command to be executed by the GPU thread
    u64 PushCommand(CommandData&& command_data);
@ -128,7 +126,6 @@ private:
 private:
    SynchState state;
    Core::System& system;
-    Core::Timing::EventType* synchronization_event{};
    std::thread thread;
    std::thread::id thread_id;
 };
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -348,6 +348,7 @@ static constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
 }

 void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
+    std::lock_guard lock{pages_mutex};
    const u64 page_start{addr >> Memory::PAGE_BITS};
    const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS};

--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@ -9,6 +9,7 @@
 #include <cstddef>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <optional>
 #include <tuple>
 #include <utility>
@ -230,6 +231,8 @@ private:

    using CachedPageMap = boost::icl::interval_map<u64, int>;
    CachedPageMap cached_pages;
+
+    std::mutex pages_mutex;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@ -102,8 +102,6 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst
 RendererOpenGL::~RendererOpenGL() = default;

 void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
-    system.GetPerfStats().EndSystemFrame();
-
    // Maintain the rasterizer's state as a priority
    OpenGLState prev_state = OpenGLState::GetCurState();
    state.AllDirty();
@ -135,9 +133,6 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {

    render_window.PollEvents();

-    system.FrameLimiter().DoFrameLimiting(system.CoreTiming().GetGlobalTimeUs());
-    system.GetPerfStats().BeginSystemFrame();
-
    // Restore the rasterizer state
    prev_state.AllDirty();
    prev_state.Apply();