diff --git a/src/core/settings.cpp b/src/core/settings.cpp index e8a6f2a6e..44252dd81 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -115,6 +115,7 @@ void LogSettings() { values.use_asynchronous_gpu_emulation.GetValue()); log_setting("Renderer_UseVsync", values.use_vsync.GetValue()); log_setting("Renderer_UseAssemblyShaders", values.use_assembly_shaders.GetValue()); + log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue()); log_setting("Renderer_AnisotropicFilteringLevel", values.max_anisotropy.GetValue()); log_setting("Audio_OutputEngine", values.sink_id); log_setting("Audio_EnableAudioStretching", values.enable_audio_stretching.GetValue()); @@ -170,6 +171,7 @@ void RestoreGlobalState() { values.use_asynchronous_gpu_emulation.SetGlobal(true); values.use_vsync.SetGlobal(true); values.use_assembly_shaders.SetGlobal(true); + values.use_asynchronous_shaders.SetGlobal(true); values.use_fast_gpu_time.SetGlobal(true); values.force_30fps_mode.SetGlobal(true); values.bg_red.SetGlobal(true); diff --git a/src/core/settings.h b/src/core/settings.h index a64debd25..386233fdf 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -434,6 +434,7 @@ struct Values { Setting use_asynchronous_gpu_emulation; Setting use_vsync; Setting use_assembly_shaders; + Setting use_asynchronous_shaders; Setting force_30fps_mode; Setting use_fast_gpu_time; diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp index 78915e6db..5a30c75da 100644 --- a/src/core/telemetry_session.cpp +++ b/src/core/telemetry_session.cpp @@ -207,6 +207,8 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) { AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync.GetValue()); AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders.GetValue()); + AddField(field_type, "Renderer_UseAsynchronousShaders", + Settings::values.use_asynchronous_shaders.GetValue()); 
AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode); } diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 21c46a567..3cd896a0f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -98,6 +98,8 @@ add_library(video_core STATIC sampler_cache.cpp sampler_cache.h shader_cache.h + shader_notify.cpp + shader_notify.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp @@ -128,6 +130,8 @@ add_library(video_core STATIC shader/decode/other.cpp shader/ast.cpp shader/ast.h + shader/async_shaders.cpp + shader/async_shaders.h shader/compiler_settings.cpp shader/compiler_settings.h shader/control_flow.cpp diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 758bfe148..8e19c3373 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -20,6 +20,7 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/renderer_base.h" +#include "video_core/shader_notify.h" #include "video_core/video_core.h" namespace Tegra { @@ -36,6 +37,7 @@ GPU::GPU(Core::System& system, std::unique_ptr&& render kepler_compute = std::make_unique(system, rasterizer, *memory_manager); maxwell_dma = std::make_unique(system, *memory_manager); kepler_memory = std::make_unique(system, *memory_manager); + shader_notify = std::make_unique(); } GPU::~GPU() = default; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 2c42483bd..8d04d9fd9 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -33,6 +33,7 @@ class System; namespace VideoCore { class RendererBase; +class ShaderNotify; } // namespace VideoCore namespace Tegra { @@ -207,6 +208,14 @@ public: return *renderer; } + VideoCore::ShaderNotify& ShaderNotify() { + return *shader_notify; + } + + const VideoCore::ShaderNotify& ShaderNotify() const { + return *shader_notify; + } + // Waits for the GPU to finish working virtual void WaitIdle() const = 0; @@ -347,6 +356,8 
@@ private: std::unique_ptr maxwell_dma; /// Inline memory engine std::unique_ptr kepler_memory; + /// Shader build notifier + std::unique_ptr shader_notify; std::array, Service::Nvidia::MaxSyncPoints> syncpoints{}; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index c1f20f0ab..630acb73b 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -233,6 +233,8 @@ Device::Device() GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; + use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); + LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index e1d811966..94d38d7d1 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -104,6 +104,10 @@ public: return use_assembly_shaders; } + bool UseAsynchronousShaders() const { + return use_asynchronous_shaders; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); @@ -127,6 +131,7 @@ private: bool has_fast_buffer_sub_data{}; bool has_nv_viewport_array2{}; bool use_assembly_shaders{}; + bool use_asynchronous_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index e960a0ef1..c3fad563c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -149,7 +149,8 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, 
buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, - screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { + screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker}, + async_shaders{emu_window} { CheckExtensions(); unified_uniform_buffer.Create(); @@ -162,6 +163,23 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind nullptr, 0); } } + + if (device.UseAsynchronousShaders()) { + // Max worker threads we should allow + constexpr auto MAX_THREADS = 2u; + // Amount of threads we should reserve for other parts of yuzu + constexpr auto RESERVED_THREADS = 6u; + // Get the amount of threads we can use(this can return zero) + const auto cpu_thread_count = + std::max(RESERVED_THREADS, std::thread::hardware_concurrency()); + // Deduce how many "extra" threads we have to use. + const auto max_threads_unused = cpu_thread_count - RESERVED_THREADS; + // Always allow at least 1 thread regardless of our settings + const auto max_worker_count = std::max(1u, max_threads_unused); + // Don't use more than MAX_THREADS + const auto worker_count = std::min(max_worker_count, MAX_THREADS); + async_shaders.AllocateWorkers(worker_count); + } } RasterizerOpenGL::~RasterizerOpenGL() { @@ -336,7 +354,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { continue; } - Shader* const shader = shader_cache.GetStageProgram(program); + Shader* shader = shader_cache.GetStageProgram(program, async_shaders); if (device.UseAssemblyShaders()) { // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this @@ -353,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { SetupDrawTextures(stage, shader); SetupDrawImages(stage, shader); - const GLuint program_handle = shader->GetHandle(); + const GLuint program_handle = shader->IsBuilt() ? 
shader->GetHandle() : 0; switch (program) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 4f082592f..a95646936 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -33,6 +33,7 @@ #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/utils.h" +#include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" namespace Core { @@ -91,6 +92,14 @@ public: return num_queued_commands > 0; } + VideoCommon::Shader::AsyncShaders& GetAsyncShaders() { + return async_shaders; + } + + const VideoCommon::Shader::AsyncShaders& GetAsyncShaders() const { + return async_shaders; + } + private: /// Configures the color and depth framebuffer states. void ConfigureFramebuffers(); @@ -242,6 +251,7 @@ private: ScreenInfo& screen_info; ProgramManager& program_manager; StateTracker& state_tracker; + VideoCommon::Shader::AsyncShaders async_shaders; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index f8b322227..b05cb641c 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -177,6 +177,12 @@ public: Release(); } + OGLAssemblyProgram& operator=(OGLAssemblyProgram&& o) noexcept { + Release(); + handle = std::exchange(o.handle, 0); + return *this; + } + /// Deletes the internal OpenGL resource void Release(); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index c6a3bf3a1..f469ed656 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ 
b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -31,6 +31,7 @@ #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" #include "video_core/shader_cache.h" +#include "video_core/shader_notify.h" namespace OpenGL { @@ -140,9 +141,24 @@ std::shared_ptr MakeRegistry(const ShaderDiskCacheEntry& entry) { return registry; } +std::unordered_set GetSupportedFormats() { + GLint num_formats; + glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); + + std::vector formats(num_formats); + glGetIntegerv(GL_PROGRAM_BINARY_FORMATS, formats.data()); + + std::unordered_set supported_formats; + for (const GLint format : formats) { + supported_formats.insert(static_cast(format)); + } + return supported_formats; +} + +} // Anonymous namespace + ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, - const ShaderIR& ir, const Registry& registry, - bool hint_retrievable = false) { + const ShaderIR& ir, const Registry& registry, bool hint_retrievable) { const std::string shader_id = MakeShaderID(unique_identifier, shader_type); LOG_INFO(Render_OpenGL, "{}", shader_id); @@ -181,30 +197,17 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u return program; } -std::unordered_set GetSupportedFormats() { - GLint num_formats; - glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats); - - std::vector formats(num_formats); - glGetIntegerv(GL_PROGRAM_BINARY_FORMATS, formats.data()); - - std::unordered_set supported_formats; - for (const GLint format : formats) { - supported_formats.insert(static_cast(format)); - } - return supported_formats; -} - -} // Anonymous namespace - Shader::Shader(std::shared_ptr registry_, ShaderEntries entries_, - ProgramSharedPtr program_) - : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} { + ProgramSharedPtr program_, bool is_built) + : registry{std::move(registry_)}, entries{std::move(entries_)}, 
program{std::move(program_)}, + is_built(is_built) { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; } - ASSERT(handle != 0); + if (is_built) { + ASSERT(handle != 0); + } } Shader::~Shader() = default; @@ -214,42 +217,82 @@ GLuint Shader::GetHandle() const { return handle; } -std::unique_ptr Shader::CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode code, ProgramCode code_b) { +bool Shader::IsBuilt() const { + return is_built; +} + +void Shader::AsyncOpenGLBuilt(OGLProgram new_program) { + program->source_program = std::move(new_program); + handle = program->source_program.handle; + is_built = true; +} + +void Shader::AsyncGLASMBuilt(OGLAssemblyProgram new_program) { + program->assembly_program = std::move(new_program); + handle = program->assembly_program.handle; + is_built = true; +} + +std::unique_ptr Shader::CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, ProgramCode code, + ProgramCode code_b, VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr) { const auto shader_type = GetShaderType(program_type); const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto registry = std::make_shared(shader_type, params.system.GPU().Maxwell3D()); - const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); - // TODO(Rodrigo): Handle VertexA shaders - // std::optional ir_b; - // if (!code_b.empty()) { - // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); - // } - auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + auto& gpu = params.system.GPU(); + gpu.ShaderNotify().MarkSharderBuilding(); - ShaderDiskCacheEntry entry; - entry.type = shader_type; - entry.code = std::move(code); - entry.code_b = std::move(code_b); - entry.unique_identifier = params.unique_identifier; - entry.bound_buffer = registry->GetBoundBuffer(); - entry.graphics_info = 
registry->GetGraphicsInfo(); - entry.keys = registry->GetKeys(); - entry.bound_samplers = registry->GetBoundSamplers(); - entry.bindless_samplers = registry->GetBindlessSamplers(); - params.disk_cache.SaveEntry(std::move(entry)); + auto registry = std::make_shared(shader_type, gpu.Maxwell3D()); + if (!async_shaders.IsShaderAsync(params.system.GPU()) || + !params.device.UseAsynchronousShaders()) { + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + // TODO(Rodrigo): Handle VertexA shaders + // std::optional ir_b; + // if (!code_b.empty()) { + // ir_b.emplace(code_b, STAGE_MAIN_OFFSET); + // } + auto program = + BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry); + ShaderDiskCacheEntry entry; + entry.type = shader_type; + entry.code = std::move(code); + entry.code_b = std::move(code_b); + entry.unique_identifier = params.unique_identifier; + entry.bound_buffer = registry->GetBoundBuffer(); + entry.graphics_info = registry->GetGraphicsInfo(); + entry.keys = registry->GetKeys(); + entry.bound_samplers = registry->GetBoundSamplers(); + entry.bindless_samplers = registry->GetBindlessSamplers(); + params.disk_cache.SaveEntry(std::move(entry)); - return std::unique_ptr(new Shader( - std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program))); + gpu.ShaderNotify().MarkShaderComplete(); + + return std::unique_ptr(new Shader(std::move(registry), + MakeEntries(params.device, ir, shader_type), + std::move(program), true)); + } else { + // Required for entries + const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry); + auto entries = MakeEntries(params.device, ir, shader_type); + + async_shaders.QueueOpenGLShader(params.device, shader_type, params.unique_identifier, + std::move(code), std::move(code_b), STAGE_MAIN_OFFSET, + COMPILER_SETTINGS, *registry, cpu_addr); + + auto program = std::make_shared(); + return std::unique_ptr( + new Shader(std::move(registry), 
std::move(entries), std::move(program), false)); + } } std::unique_ptr Shader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { const std::size_t size_in_bytes = code.size() * sizeof(u64); - auto& engine = params.system.GPU().KeplerCompute(); + auto& gpu = params.system.GPU(); + gpu.ShaderNotify().MarkSharderBuilding(); + + auto& engine = gpu.KeplerCompute(); auto registry = std::make_shared(ShaderType::Compute, engine); const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry); const u64 uid = params.unique_identifier; @@ -266,6 +309,8 @@ std::unique_ptr Shader::CreateKernelFromMemory(const ShaderParameters& p entry.bindless_samplers = registry->GetBindlessSamplers(); params.disk_cache.SaveEntry(std::move(entry)); + gpu.ShaderNotify().MarkShaderComplete(); + return std::unique_ptr(new Shader(std::move(registry), MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); @@ -436,14 +481,51 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( return program; } -Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { +Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders) { if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) { - return last_shaders[static_cast(program)]; + auto* last_shader = last_shaders[static_cast(program)]; + if (last_shader->IsBuilt()) { + return last_shader; + } } auto& memory_manager{system.GPU().MemoryManager()}; const GPUVAddr address{GetShaderAddress(system, program)}; + if (device.UseAsynchronousShaders() && async_shaders.HasCompletedWork()) { + auto completed_work = async_shaders.GetCompletedWork(); + for (auto& work : completed_work) { + Shader* shader = TryGet(work.cpu_address); + auto& gpu = system.GPU(); + gpu.ShaderNotify().MarkShaderComplete(); + if (shader == nullptr) { + continue; + } + using namespace VideoCommon::Shader; + if (work.backend == 
AsyncShaders::Backend::OpenGL) { + shader->AsyncOpenGLBuilt(std::move(work.program.opengl)); + } else if (work.backend == AsyncShaders::Backend::GLASM) { + shader->AsyncGLASMBuilt(std::move(work.program.glasm)); + } + + ShaderDiskCacheEntry entry; + entry.type = work.shader_type; + entry.code = std::move(work.code); + entry.code_b = std::move(work.code_b); + entry.unique_identifier = work.uid; + + auto& registry = shader->GetRegistry(); + + entry.bound_buffer = registry.GetBoundBuffer(); + entry.graphics_info = registry.GetGraphicsInfo(); + entry.keys = registry.GetKeys(); + entry.bound_samplers = registry.GetBoundSamplers(); + entry.bindless_samplers = registry.GetBindlessSamplers(); + disk_cache.SaveEntry(std::move(entry)); + } + } + // Look up shader in the cache based on address const auto cpu_addr{memory_manager.GpuToCpuAddress(address)}; if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) { @@ -471,7 +553,8 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { std::unique_ptr shader; const auto found = runtime_cache.find(unique_identifier); if (found == runtime_cache.end()) { - shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b)); + shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b), + async_shaders, cpu_addr.value_or(0)); } else { shader = Shader::CreateFromCache(params, found->second); } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 994aaeaf2..7528ac686 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -33,6 +33,10 @@ namespace Core::Frontend { class EmuWindow; } +namespace VideoCommon::Shader { +class AsyncShaders; +} + namespace OpenGL { class Device; @@ -61,6 +65,11 @@ struct ShaderParameters { u64 unique_identifier; }; +ProgramSharedPtr BuildShader(const Device& device, 
Tegra::Engines::ShaderType shader_type, + u64 unique_identifier, const VideoCommon::Shader::ShaderIR& ir, + const VideoCommon::Shader::Registry& registry, + bool hint_retrievable = false); + class Shader final { public: ~Shader(); @@ -68,15 +77,28 @@ public: /// Gets the GL program handle for the shader GLuint GetHandle() const; + bool IsBuilt() const; + /// Gets the shader entries for the shader const ShaderEntries& GetEntries() const { return entries; } - static std::unique_ptr CreateStageFromMemory(const ShaderParameters& params, - Maxwell::ShaderProgram program_type, - ProgramCode program_code, - ProgramCode program_code_b); + const VideoCommon::Shader::Registry& GetRegistry() const { + return *registry; + } + + /// Mark a OpenGL shader as built + void AsyncOpenGLBuilt(OGLProgram new_program); + + /// Mark a GLASM shader as built + void AsyncGLASMBuilt(OGLAssemblyProgram new_program); + + static std::unique_ptr CreateStageFromMemory( + const ShaderParameters& params, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b, + VideoCommon::Shader::AsyncShaders& async_shaders, VAddr cpu_addr); + static std::unique_ptr CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code); @@ -85,12 +107,13 @@ public: private: explicit Shader(std::shared_ptr registry, ShaderEntries entries, - ProgramSharedPtr program); + ProgramSharedPtr program, bool is_built = true); std::shared_ptr registry; ShaderEntries entries; ProgramSharedPtr program; GLuint handle = 0; + bool is_built{}; }; class ShaderCacheOpenGL final : public VideoCommon::ShaderCache { @@ -104,7 +127,8 @@ public: const VideoCore::DiskResourceLoadCallback& callback); /// Gets the current specified shader stage program - Shader* GetStageProgram(Maxwell::ShaderProgram program); + Shader* GetStageProgram(Maxwell::ShaderProgram program, + VideoCommon::Shader::AsyncShaders& async_shaders); /// Gets a compute kernel in the passed address Shader* GetComputeKernel(GPUVAddr 
code_addr);
diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp
new file mode 100644
index 000000000..b7f66d7ee
--- /dev/null
+++ b/src/video_core/shader/async_shaders.cpp
@@ -0,0 +1,181 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_base.h"
+#include "video_core/renderer_opengl/gl_shader_cache.h"
+#include "video_core/shader/async_shaders.h"
+
+namespace VideoCommon::Shader {
+
+AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window) : emu_window(emu_window) {}
+
+AsyncShaders::~AsyncShaders() {
+    KillWorkers();
+}
+
+void AsyncShaders::AllocateWorkers(std::size_t num_workers) {
+    // Nothing to do if the pool already has the requested size or no workers are wanted
+    if (num_workers == worker_threads.size() || num_workers == 0) {
+        return;
+    }
+
+    // If workers already exist, stop and join them before resizing the pool
+    if (!worker_threads.empty()) {
+        FreeWorkers();
+    }
+
+    // FreeWorkers leaves the exit flag set; clear it, otherwise the new workers would return
+    // as soon as they start
+    is_thread_exiting.store(false);
+
+    // Create workers, each one owning its own shared graphics context
+    for (std::size_t i = 0; i < num_workers; i++) {
+        context_list.push_back(emu_window.CreateSharedContext());
+        worker_threads.emplace_back(&AsyncShaders::ShaderCompilerThread, this,
+                                    context_list.back().get());
+    }
+}
+
+void AsyncShaders::FreeWorkers() {
+    // Raise the exit flag while holding the queue mutex so that a worker which has just
+    // evaluated its wait predicate cannot miss the wake-up and sleep forever
+    {
+        std::unique_lock lock{queue_mutex};
+        is_thread_exiting.store(true);
+    }
+    cv.notify_all();
+    for (auto& thread : worker_threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+    // Clear our shared contexts
+    context_list.clear();
+
+    // Clear our worker threads
+    worker_threads.clear();
+}
+
+void AsyncShaders::KillWorkers() {
+    // Workers hold `this` and raw pointers into context_list, so detaching them would let
+    // them touch the queue, the condition variable and their contexts after those have been
+    // destroyed. Wake them up and join them instead.
+    FreeWorkers();
+}
+
+bool AsyncShaders::HasWorkQueued() {
+    // Not synchronized by itself: callers are expected to hold queue_mutex
+    return !pending_queue.empty();
+}
+
+bool AsyncShaders::HasCompletedWork() {
+    std::shared_lock lock{completed_mutex};
+    return !finished_work.empty();
+}
+
+bool AsyncShaders::IsShaderAsync(const Tegra::GPU& gpu) const {
+    const auto& regs = gpu.Maxwell3D().regs;
+
+    // If something is using depth, we can assume that games are not rendering anything which will
+    // be used one time.
+    if (regs.zeta_enable) {
+        return true;
+    }
+
+    // If games are using a small index count, we can assume these are full screen quads. Usually
+    // these shaders are only used once for building textures so we can assume they can't be built
+    // async
+    if (regs.index_array.count <= 6 || regs.vertex_buffer.count <= 6) {
+        return false;
+    }
+
+    return true;
+}
+
+std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() {
+    std::vector<AsyncShaders::Result> results;
+    {
+        std::unique_lock lock{completed_mutex};
+        // Take the whole batch in one step; finished_work is left empty
+        results.swap(finished_work);
+    }
+    return results;
+}
+
+void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device,
+                                     Tegra::Engines::ShaderType shader_type, u64 uid,
+                                     std::vector<u64> code, std::vector<u64> code_b,
+                                     u32 main_offset,
+                                     VideoCommon::Shader::CompilerSettings compiler_settings,
+                                     const VideoCommon::Shader::Registry& registry,
+                                     VAddr cpu_addr) {
+    // GLASM is selected when the device is configured for assembly shaders
+    WorkerParams params{device.UseAssemblyShaders() ? AsyncShaders::Backend::GLASM
+                                                    : AsyncShaders::Backend::OpenGL,
+                        device,
+                        shader_type,
+                        uid,
+                        std::move(code),
+                        std::move(code_b),
+                        main_offset,
+                        compiler_settings,
+                        registry,
+                        cpu_addr};
+    std::unique_lock lock(queue_mutex);
+    pending_queue.push_back(std::move(params));
+    cv.notify_one();
+}
+
+/// Worker loop: sleeps until a shader is queued, builds it on this worker's own graphics context
+/// and publishes the result to finished_work. Returns once the exit flag is raised.
+void AsyncShaders::ShaderCompilerThread(Core::Frontend::GraphicsContext* context) {
+    while (!is_thread_exiting.load(std::memory_order_relaxed)) {
+        std::unique_lock lock{queue_mutex};
+        cv.wait(lock, [this] { return HasWorkQueued() || is_thread_exiting; });
+        if (is_thread_exiting) {
+            return;
+        }
+
+        // queue_mutex has been held since the predicate passed, so the queue cannot have been
+        // drained by another worker: pull work from queue
+        WorkerParams work = std::move(pending_queue.front());
+        pending_queue.pop_front();
+
+        lock.unlock();
+
+        if (work.backend == AsyncShaders::Backend::OpenGL ||
+            work.backend == AsyncShaders::Backend::GLASM) {
+            const ShaderIR ir(work.code, work.main_offset, work.compiler_settings, work.registry);
+            const auto scope = context->Acquire();
+            auto program =
+                OpenGL::BuildShader(work.device, work.shader_type, work.uid, ir, work.registry);
+            Result result{};
+            result.backend = work.backend;
+            result.cpu_address = work.cpu_address;
+            result.uid = work.uid;
+            result.code = std::move(work.code);
+            result.code_b = std::move(work.code_b);
+            result.shader_type = work.shader_type;
+
+            if (work.backend == AsyncShaders::Backend::OpenGL) {
+                result.program.opengl = std::move(program->source_program);
+            } else if (work.backend == AsyncShaders::Backend::GLASM) {
+                result.program.glasm = std::move(program->assembly_program);
+            }
+
+            {
+                std::unique_lock complete_lock(completed_mutex);
+                finished_work.push_back(std::move(result));
+            }
+        }
+    }
+}
+
+} // namespace VideoCommon::Shader
diff --git
a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h
new file mode 100644
index 000000000..2f5ee94ad
--- /dev/null
+++ b/src/video_core/shader/async_shaders.h
@@ -0,0 +1,109 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_shader_decompiler.h"
+
+namespace Core::Frontend {
+class EmuWindow;
+class GraphicsContext;
+} // namespace Core::Frontend
+
+namespace Tegra {
+class GPU;
+}
+
+namespace VideoCommon::Shader {
+
+class AsyncShaders { // pool of worker threads that build shaders off the calling thread
+public:
+    enum class Backend { // which program slot of a Result a finished build fills
+        OpenGL,
+        GLASM,
+    };
+
+    struct ResultPrograms { // only the member matching Result::backend is filled by a worker
+        OpenGL::OGLProgram opengl;
+        OpenGL::OGLAssemblyProgram glasm;
+    };
+
+    struct Result { // one finished build, as returned by GetCompletedWork
+        u64 uid; // unique identifier the build was queued with
+        VAddr cpu_address; // CPU address the build was queued with
+        Backend backend;
+        ResultPrograms program;
+        std::vector code; // shader code handed back from the queued request
+        std::vector code_b;
+        Tegra::Engines::ShaderType shader_type;
+    };
+
+    explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window);
+    ~AsyncShaders();
+
+    /// Start up num_workers shader worker threads; a request for zero workers is ignored
+    void AllocateWorkers(std::size_t num_workers);
+
+    /// Signal all worker threads to exit, join them and release their shared contexts
+    void FreeWorkers();
+
+    /// Stop all worker threads; invoked by the destructor
+    void KillWorkers();
+
+    /// Check to see if any shaders have actually been compiled
+    bool HasCompletedWork();
+
+    /// Deduce if a shader can be built on another thread or MUST be built in sync. We cannot build
+    /// every shader async as some shaders are only built and executed once. We try to "guess" which
+    /// shader would be used only once
+    bool IsShaderAsync(const Tegra::GPU& gpu) const;
+
+    /// Pulls completed compiled shaders, leaving the internal list of finished work empty
+    std::vector GetCompletedWork();
+
+    void QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type,
+                           u64 uid, std::vector code, std::vector code_b, u32 main_offset,
+                           VideoCommon::Shader::CompilerSettings compiler_settings,
+                           const VideoCommon::Shader::Registry& registry, VAddr cpu_addr);
+
+private:
+    void ShaderCompilerThread(Core::Frontend::GraphicsContext* context); // worker entry point
+
+    /// Check our worker queue to see if we have any work queued already
+    bool HasWorkQueued();
+
+    struct WorkerParams { // one queued build request; filled in by QueueOpenGLShader
+        AsyncShaders::Backend backend;
+        OpenGL::Device device;
+        Tegra::Engines::ShaderType shader_type;
+        u64 uid;
+        std::vector code;
+        std::vector code_b;
+        u32 main_offset;
+        VideoCommon::Shader::CompilerSettings compiler_settings;
+        VideoCommon::Shader::Registry registry;
+        VAddr cpu_address;
+    };
+
+    std::condition_variable cv; // signalled when work is queued or the workers must exit
+    std::mutex queue_mutex; // taken around accesses to pending_queue
+    std::shared_mutex completed_mutex; // taken around accesses to finished_work
+    std::atomic is_thread_exiting{}; // set to ask the worker loops to return
+    std::vector> context_list; // one shared graphics context per worker thread
+    std::vector worker_threads;
+    std::deque pending_queue; // requests waiting for a worker
+    std::vector finished_work; // results waiting for GetCompletedWork
+    Core::Frontend::EmuWindow& emu_window; // used to create the workers' shared contexts
+};
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader_notify.cpp b/src/video_core/shader_notify.cpp
new file mode 100644
index 000000000..c3c71657d
--- /dev/null
+++ b/src/video_core/shader_notify.cpp
@@ -0,0 +1,42 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/shader_notify.h"
+
+using namespace std::chrono_literals;
+
+namespace VideoCore {
+namespace {
+constexpr auto UPDATE_TICK = 32ms; // minimum interval between refreshes of the cached count
+}
+
+ShaderNotify::ShaderNotify() = default;
+ShaderNotify::~ShaderNotify() = default;
+
+std::size_t ShaderNotify::GetShadersBuilding() { // throttled view of accurate_count
+    const auto now = std::chrono::high_resolution_clock::now();
+    std::unique_lock lock{mutex}; // exclusive: the cached count and timestamp are written below
+    if (now - last_update > UPDATE_TICK) {
+        last_updated_count = accurate_count;
+        last_update = now; // restart the tick, otherwise every call would refresh the cache
+    }
+    return last_updated_count;
+}
+
+std::size_t ShaderNotify::GetShadersBuildingAccurate() { // live count, read under a shared lock
+    std::shared_lock lock{mutex};
+    return accurate_count;
+}
+
+void ShaderNotify::MarkShaderComplete() { // one build finished
+    std::unique_lock lock{mutex};
+    accurate_count--;
+}
+
+void ShaderNotify::MarkSharderBuilding() { // one build started
+    std::unique_lock lock{mutex};
+    accurate_count++;
+}
+
+} // namespace VideoCore
diff --git a/src/video_core/shader_notify.h b/src/video_core/shader_notify.h
new file mode 100644
index 000000000..a9c92d179
--- /dev/null
+++ b/src/video_core/shader_notify.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include
+#include
+#include "common/common_types.h"
+
+namespace VideoCore {
+class ShaderNotify { // counter of shaders currently being built
+public:
+    ShaderNotify();
+    ~ShaderNotify();
+
+    std::size_t GetShadersBuilding(); // returns last_updated_count, which may lag accurate_count
+    std::size_t GetShadersBuildingAccurate(); // returns the current accurate_count
+
+    void MarkShaderComplete(); // decrements accurate_count
+    void MarkSharderBuilding(); // increments accurate_count
+
+private:
+    std::size_t last_updated_count{}; // value last copied from accurate_count
+    std::size_t accurate_count{}; // builds marked as started minus builds marked complete
+    std::shared_mutex mutex; // locked by every member function that touches accurate_count
+    std::chrono::high_resolution_clock::time_point last_update{}; // read by GetShadersBuilding
+};
+} // namespace VideoCore
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index d25b99a32..805bb954b 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -661,6 +661,8 @@ void Config::ReadRendererValues() {
     ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
     ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
                       false);
+    ReadSettingGlobal(Settings::values.use_asynchronous_shaders,
+                      QStringLiteral("use_asynchronous_shaders"), false);
     ReadSettingGlobal(Settings::values.use_fast_gpu_time, QStringLiteral("use_fast_gpu_time"),
                       true);
     ReadSettingGlobal(Settings::values.force_30fps_mode, QStringLiteral("force_30fps_mode"), false);
@@ -1145,6 +1147,8 @@ void Config::SaveRendererValues() {
     WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
     WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
                        Settings::values.use_assembly_shaders, false);
+    WriteSettingGlobal(QStringLiteral("use_asynchronous_shaders"),
+                       Settings::values.use_asynchronous_shaders, false);
     WriteSettingGlobal(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time,
                        true);
     WriteSettingGlobal(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode,
diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp
index 7c0fa7ec5..ce30188cd 100644
---
a/src/yuzu/configuration/configure_graphics_advanced.cpp +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -24,6 +24,7 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { const bool runtime_lock = !Core::System::GetInstance().IsPoweredOn(); ui->use_vsync->setEnabled(runtime_lock); ui->use_assembly_shaders->setEnabled(runtime_lock); + ui->use_asynchronous_shaders->setEnabled(runtime_lock); ui->force_30fps_mode->setEnabled(runtime_lock); ui->anisotropic_filtering_combobox->setEnabled(runtime_lock); @@ -32,6 +33,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { static_cast(Settings::values.gpu_accuracy.GetValue())); ui->use_vsync->setChecked(Settings::values.use_vsync.GetValue()); ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders.GetValue()); + ui->use_asynchronous_shaders->setChecked( + Settings::values.use_asynchronous_shaders.GetValue()); ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue()); ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode.GetValue()); ui->anisotropic_filtering_combobox->setCurrentIndex( @@ -41,6 +44,10 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { ConfigurationShared::SetPerGameSetting(ui->use_vsync, &Settings::values.use_vsync); ConfigurationShared::SetPerGameSetting(ui->use_assembly_shaders, &Settings::values.use_assembly_shaders); + ConfigurationShared::SetPerGameSetting(ui->use_asynchronous_shaders, + &Settings::values.use_asynchronous_shaders); + ConfigurationShared::SetPerGameSetting(ui->use_asynchronous_shaders, + &Settings::values.use_asynchronous_shaders); ConfigurationShared::SetPerGameSetting(ui->use_fast_gpu_time, &Settings::values.use_fast_gpu_time); ConfigurationShared::SetPerGameSetting(ui->force_30fps_mode, @@ -67,6 +74,14 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { if (Settings::values.use_assembly_shaders.UsingGlobal()) { Settings::values.use_assembly_shaders.SetValue(ui->use_assembly_shaders->isChecked()); } + 
if (Settings::values.use_asynchronous_shaders.UsingGlobal()) { + Settings::values.use_asynchronous_shaders.SetValue( + ui->use_asynchronous_shaders->isChecked()); + } + if (Settings::values.use_asynchronous_shaders.UsingGlobal()) { + Settings::values.use_asynchronous_shaders.SetValue( + ui->use_asynchronous_shaders->isChecked()); + } if (Settings::values.use_fast_gpu_time.UsingGlobal()) { Settings::values.use_fast_gpu_time.SetValue(ui->use_fast_gpu_time->isChecked()); } @@ -83,6 +98,10 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_vsync, ui->use_vsync); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_assembly_shaders, ui->use_assembly_shaders); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_shaders, + ui->use_asynchronous_shaders); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_asynchronous_shaders, + ui->use_asynchronous_shaders); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_fast_gpu_time, ui->use_fast_gpu_time); ConfigurationShared::ApplyPerGameSetting(&Settings::values.force_30fps_mode, @@ -117,6 +136,8 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { ui->gpu_accuracy->setEnabled(Settings::values.gpu_accuracy.UsingGlobal()); ui->use_vsync->setEnabled(Settings::values.use_vsync.UsingGlobal()); ui->use_assembly_shaders->setEnabled(Settings::values.use_assembly_shaders.UsingGlobal()); + ui->use_asynchronous_shaders->setEnabled( + Settings::values.use_asynchronous_shaders.UsingGlobal()); ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal()); ui->force_30fps_mode->setEnabled(Settings::values.force_30fps_mode.UsingGlobal()); ui->anisotropic_filtering_combobox->setEnabled( @@ -128,6 +149,7 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { ConfigurationShared::InsertGlobalItem(ui->gpu_accuracy); ui->use_vsync->setTristate(true); ui->use_assembly_shaders->setTristate(true); + 
ui->use_asynchronous_shaders->setTristate(true); ui->use_fast_gpu_time->setTristate(true); ui->force_30fps_mode->setTristate(true); ConfigurationShared::InsertGlobalItem(ui->anisotropic_filtering_combobox); diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index 0021607ac..71e7dfe5e 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -72,6 +72,16 @@ + + + + Enables asynchronous shader compilation, which may reduce shader stutter. This feature is experimental. + + + Use asynchronous shader building (experimental, OpenGL or Assembly shaders only) + + + diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 9f758605a..6909d65d0 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -94,6 +94,8 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual #include "core/perf_stats.h" #include "core/settings.h" #include "core/telemetry_session.h" +#include "video_core/gpu.h" +#include "video_core/shader_notify.h" #include "yuzu/about_dialog.h" #include "yuzu/bootmanager.h" #include "yuzu/compatdb.h" @@ -498,6 +500,8 @@ void GMainWindow::InitializeWidgets() { message_label->setAlignment(Qt::AlignLeft); statusBar()->addPermanentWidget(message_label, 1); + shader_building_label = new QLabel(); + shader_building_label->setToolTip(tr("The amount of shaders currently being built")); emu_speed_label = new QLabel(); emu_speed_label->setToolTip( tr("Current emulation speed. Values higher or lower than 100% " @@ -510,7 +514,8 @@ void GMainWindow::InitializeWidgets() { tr("Time taken to emulate a Switch frame, not counting framelimiting or v-sync. 
For " "full-speed emulation this should be at most 16.67 ms.")); - for (auto& label : {emu_speed_label, game_fps_label, emu_frametime_label}) { + for (auto& label : + {shader_building_label, emu_speed_label, game_fps_label, emu_frametime_label}) { label->setVisible(false); label->setFrameStyle(QFrame::NoFrame); label->setContentsMargins(4, 0, 4, 0); @@ -1176,6 +1181,7 @@ void GMainWindow::ShutdownGame() { // Disable status bar updates status_bar_update_timer.stop(); + shader_building_label->setVisible(false); emu_speed_label->setVisible(false); game_fps_label->setVisible(false); emu_frametime_label->setVisible(false); @@ -2186,6 +2192,17 @@ void GMainWindow::UpdateStatusBar() { } auto results = Core::System::GetInstance().GetAndResetPerfStats(); + auto& shader_notify = Core::System::GetInstance().GPU().ShaderNotify(); + const auto shaders_building = shader_notify.GetShadersBuilding(); + + if (shaders_building != 0) { + shader_building_label->setText( + tr("Building: %1 shader").arg(shaders_building) + + (shaders_building != 1 ? QString::fromStdString("s") : QString::fromStdString(""))); + shader_building_label->setVisible(true); + } else { + shader_building_label->setVisible(false); + } if (Settings::values.use_frame_limit.GetValue()) { emu_speed_label->setText(tr("Speed: %1% / %2%") @@ -2315,9 +2332,12 @@ void GMainWindow::OnReinitializeKeys(ReinitializeKeyBehavior behavior) { if (behavior == ReinitializeKeyBehavior::Warning) { const auto res = QMessageBox::information( this, tr("Confirm Key Rederivation"), - tr("You are about to force rederive all of your keys. \nIf you do not know what this " - "means or what you are doing, \nthis is a potentially destructive action. \nPlease " - "make sure this is what you want \nand optionally make backups.\n\nThis will delete " + tr("You are about to force rederive all of your keys. \nIf you do not know what " + "this " + "means or what you are doing, \nthis is a potentially destructive action. 
" + "\nPlease " + "make sure this is what you want \nand optionally make backups.\n\nThis will " + "delete " "your autogenerated key files and re-run the key derivation module."), QMessageBox::StandardButtons{QMessageBox::Ok, QMessageBox::Cancel}); @@ -2628,8 +2648,8 @@ int main(int argc, char* argv[]) { #ifdef __APPLE__ // If you start a bundle (binary) on OSX without the Terminal, the working directory is "/". - // But since we require the working directory to be the executable path for the location of the - // user folder in the Qt Frontend, we need to cd into that working directory + // But since we require the working directory to be the executable path for the location of + // the user folder in the Qt Frontend, we need to cd into that working directory const std::string bin_path = FileUtil::GetBundleDirectory() + DIR_SEP + ".."; chdir(bin_path.c_str()); #endif diff --git a/src/yuzu/main.h b/src/yuzu/main.h index adff65fb5..59d9073ae 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -248,6 +248,7 @@ private: // Status bar elements QLabel* message_label = nullptr; + QLabel* shader_building_label = nullptr; QLabel* emu_speed_label = nullptr; QLabel* game_fps_label = nullptr; QLabel* emu_frametime_label = nullptr; diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 7773228c8..c2a2982fb 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -394,6 +394,10 @@ void Config::ReadValues() { static_cast(sdl2_config->GetInteger("Renderer", "use_vsync", 1))); Settings::values.use_assembly_shaders.SetValue( sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false)); + Settings::values.use_asynchronous_shaders.SetValue( + sdl2_config->GetBoolean("Renderer", "use_asynchronous_shaders", false)); + Settings::values.use_asynchronous_shaders.SetValue( + sdl2_config->GetBoolean("Renderer", "use_asynchronous_shaders", false)); Settings::values.use_fast_gpu_time.SetValue( sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true)); 
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index 5bed47fd7..aa9e40380 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -166,6 +166,10 @@ use_vsync = # 0 (default): Off, 1: On use_assembly_shaders = +# Whether to allow asynchronous shader building. +# 0 (default): Off, 1: On +use_asynchronous_shaders = + # Turns on the frame limiter, which will limit frames output to the target game speed # 0: Off, 1: On (default) use_frame_limit =