From 47a7c4f4fed12d30c2c724df8c320cf34b654433 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 17 May 2020 21:06:32 -0300 Subject: [PATCH 1/2] yuzu: Add frontend settings for assembly shaders Add settings for assembly shaders. Currently hidden to prevent users from accidentally enabling them. --- src/core/settings.cpp | 1 + src/core/settings.h | 1 + src/core/telemetry_session.cpp | 1 + src/yuzu/configuration/config.cpp | 4 ++++ src/yuzu/configuration/configure_graphics_advanced.cpp | 6 ++++++ src/yuzu/configuration/configure_graphics_advanced.ui | 10 ++++++++++ src/yuzu_cmd/config.cpp | 2 ++ src/yuzu_cmd/default_ini.h | 4 ++++ 8 files changed, 29 insertions(+) diff --git a/src/core/settings.cpp b/src/core/settings.cpp index da53cde05..4edff9cd8 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -112,6 +112,7 @@ void LogSettings() { LogSetting("Renderer_UseAsynchronousGpuEmulation", Settings::values.use_asynchronous_gpu_emulation); LogSetting("Renderer_UseVsync", Settings::values.use_vsync); + LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders); LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy); LogSetting("Audio_OutputEngine", Settings::values.sink_id); LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching); diff --git a/src/core/settings.h b/src/core/settings.h index c1266b341..78eb33737 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -446,6 +446,7 @@ struct Values { GPUAccuracy gpu_accuracy; bool use_asynchronous_gpu_emulation; bool use_vsync; + bool use_assembly_shaders; bool force_30fps_mode; bool use_fast_gpu_time; diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp index 1c3b03a1c..c781b3cfc 100644 --- a/src/core/telemetry_session.cpp +++ b/src/core/telemetry_session.cpp @@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) { AddField(field_type, 
"Renderer_UseAsynchronousGpuEmulation", Settings::values.use_asynchronous_gpu_emulation); AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync); + AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders); AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode); } diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 27775701d..b08b87426 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -643,6 +643,8 @@ void Config::ReadRendererValues() { Settings::values.use_asynchronous_gpu_emulation = ReadSetting(QStringLiteral("use_asynchronous_gpu_emulation"), false).toBool(); Settings::values.use_vsync = ReadSetting(QStringLiteral("use_vsync"), true).toBool(); + Settings::values.use_assembly_shaders = + ReadSetting(QStringLiteral("use_assembly_shaders"), false).toBool(); Settings::values.use_fast_gpu_time = ReadSetting(QStringLiteral("use_fast_gpu_time"), true).toBool(); Settings::values.force_30fps_mode = @@ -1090,6 +1092,8 @@ void Config::SaveRendererValues() { WriteSetting(QStringLiteral("use_asynchronous_gpu_emulation"), Settings::values.use_asynchronous_gpu_emulation, false); WriteSetting(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); + WriteSetting(QStringLiteral("use_assembly_shaders"), Settings::values.use_assembly_shaders, + false); WriteSetting(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, true); WriteSetting(QStringLiteral("force_30fps_mode"), Settings::values.force_30fps_mode, false); diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp index 5bb2ae555..37aadf7f8 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.cpp +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -12,6 +12,9 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent) ui->setupUi(this); + // TODO: 
Remove this after assembly shaders are fully integrated + ui->use_assembly_shaders->setVisible(false); + SetConfiguration(); } @@ -22,6 +25,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { ui->gpu_accuracy->setCurrentIndex(static_cast(Settings::values.gpu_accuracy)); ui->use_vsync->setEnabled(runtime_lock); ui->use_vsync->setChecked(Settings::values.use_vsync); + ui->use_assembly_shaders->setEnabled(runtime_lock); + ui->use_assembly_shaders->setChecked(Settings::values.use_assembly_shaders); ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time); ui->force_30fps_mode->setEnabled(runtime_lock); ui->force_30fps_mode->setChecked(Settings::values.force_30fps_mode); @@ -33,6 +38,7 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { auto gpu_accuracy = static_cast(ui->gpu_accuracy->currentIndex()); Settings::values.gpu_accuracy = gpu_accuracy; Settings::values.use_vsync = ui->use_vsync->isChecked(); + Settings::values.use_assembly_shaders = ui->use_assembly_shaders->isChecked(); Settings::values.use_fast_gpu_time = ui->use_fast_gpu_time->isChecked(); Settings::values.force_30fps_mode = ui->force_30fps_mode->isChecked(); Settings::values.max_anisotropy = ui->anisotropic_filtering_combobox->currentIndex(); diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index 770b80c50..0021607ac 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -62,6 +62,16 @@ + + + + Enabling this reduces shader stutter. Enables OpenGL assembly shaders on supported Nvidia devices (NV_gpu_program5 is required). This feature is experimental. 
+ + + Use assembly shaders (experimental, Nvidia OpenGL only) + + + diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 2348e6e0d..c20d48c42 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -397,6 +397,8 @@ void Config::ReadValues() { sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false); Settings::values.use_vsync = static_cast(sdl2_config->GetInteger("Renderer", "use_vsync", 1)); + Settings::values.use_assembly_shaders = + sdl2_config->GetBoolean("Renderer", "use_assembly_shaders", false); Settings::values.use_fast_gpu_time = sdl2_config->GetBoolean("Renderer", "use_fast_gpu_time", true); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index ae94b51c4..abc6e6e65 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -134,6 +134,10 @@ max_anisotropy = # 0 (default): Off, 1: On use_vsync = +# Whether to use OpenGL assembly shaders or not. NV_gpu_program5 is required. +# 0 (default): Off, 1: On +use_assembly_shaders = + # Turns on the frame limiter, which will limit frames output to the target game speed # 0: Off, 1: On (default) use_frame_limit = From 420cc13248350ef5c2d19e0b961cb4185cd16a8a Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sun, 17 May 2020 22:32:49 -0300 Subject: [PATCH 2/2] renderer_opengl: Add assembly program code paths Add code required to use OpenGL assembly programs based on NV_gpu_program5. Decompilation for ARB programs is intended to be added in a follow-up commit. This does **not** include ARB decompilation and it's not in a usable state. The intention behind assembly programs is to reduce shader stutter significantly on drivers supporting NV_gpu_program5 (and other required extensions). Currently only Nvidia's proprietary driver supports these extensions. Add a UI option hidden for now to avoid people enabling this option accidentally. 
This code path has some limitations that OpenGL compatibility doesn't have: - NV_shader_storage_buffer_object is limited to 16 entries for a single OpenGL context state (I don't know if this is an intended limitation, a specification issue or I am missing something). This currently causes issues on The Legend of Zelda: Link's Awakening. - NV_parameter_buffer_object can't bind buffers using a non-zero offset. The workaround used is to copy to a temporary buffer (this doesn't happen often so it's not an issue). On the other hand, it has the following advantages: - Shaders build a lot faster. - We have control over how floating point rounding is done over individual instructions (SPIR-V on Vulkan can't do this). - Operations on shared memory can be unsigned and signed. - Transform feedbacks are dynamic state (not yet implemented). - Parameter buffers (uniform buffers) are per stage, matching NVN and hardware's behavior. - The API to bind and create assembly programs makes sense, unlike ARB_separate_shader_objects. 
--- src/video_core/renderer_opengl/gl_device.cpp | 7 ++ src/video_core/renderer_opengl/gl_device.h | 5 + .../renderer_opengl/gl_rasterizer.cpp | 96 ++++++++++++---- .../renderer_opengl/gl_rasterizer.h | 16 ++- .../renderer_opengl/gl_resource_manager.cpp | 9 ++ .../renderer_opengl/gl_resource_manager.h | 16 +++ .../renderer_opengl/gl_shader_cache.cpp | 101 +++++++++++++---- .../renderer_opengl/gl_shader_cache.h | 15 ++- .../renderer_opengl/gl_shader_manager.cpp | 106 ++++++++++++++---- .../renderer_opengl/gl_shader_manager.h | 56 +++++---- .../renderer_opengl/renderer_opengl.cpp | 16 +-- .../renderer_opengl/renderer_opengl.h | 5 +- 12 files changed, 339 insertions(+), 109 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index d83dca25a..466a911db 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -13,6 +13,7 @@ #include "common/logging/log.h" #include "common/scope_exit.h" +#include "core/settings.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -183,10 +184,16 @@ Device::Device() : base_bindings{BuildBaseBindings()} { has_precise_bug = TestPreciseBug(); has_broken_compute = is_intel_proprietary; has_fast_buffer_sub_data = is_nvidia; + use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 && + GLAD_GL_NV_compute_program5; LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + + if (Settings::values.use_assembly_shaders && !use_assembly_shaders) { + LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); + } } Device::Device(std::nullptr_t) { diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 
a55050cb5..e915dbd86 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -88,6 +88,10 @@ public: return has_fast_buffer_sub_data; } + bool UseAssemblyShaders() const { + return use_assembly_shaders; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); @@ -107,6 +111,7 @@ private: bool has_precise_bug{}; bool has_broken_compute{}; bool has_fast_buffer_sub_data{}; + bool use_assembly_shaders{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 69dcf952f..92ca22136 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -94,17 +94,30 @@ void oglEnable(GLenum cap, bool state) { } // Anonymous namespace RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, - StateTracker& state_tracker) - : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker}, + const Device& device, ScreenInfo& info, + ProgramManager& program_manager, StateTracker& state_tracker) + : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device, + state_tracker}, shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE}, fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system}, screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { CheckExtensions(); + + if (device.UseAssemblyShaders()) { + glCreateBuffers(static_cast(staging_cbufs.size()), staging_cbufs.data()); + for (const GLuint cbuf : staging_cbufs) { + glNamedBufferStorage(cbuf, static_cast(Maxwell::MaxConstBufferSize), + nullptr, 0); + } + } } -RasterizerOpenGL::~RasterizerOpenGL() {} 
+RasterizerOpenGL::~RasterizerOpenGL() { + if (device.UseAssemblyShaders()) { + glDeleteBuffers(static_cast(staging_cbufs.size()), staging_cbufs.data()); + } +} void RasterizerOpenGL::CheckExtensions() { if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { @@ -230,6 +243,7 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); auto& gpu = system.GPU().Maxwell3D(); + std::size_t num_ssbos = 0; u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { @@ -261,6 +275,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { Shader shader{shader_cache.GetStageProgram(program)}; + if (device.UseAssemblyShaders()) { + // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this + // all stages share the same bindings. + const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size(); + ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage"); + num_ssbos += num_stage_ssbos; + } + // Stage indices are 0 - 5 const std::size_t stage = index == 0 ? 
0 : index - 1; SetupDrawConstBuffers(stage, shader); @@ -526,6 +548,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { SyncFramebufferSRGB(); buffer_cache.Acquire(); + current_cbuf = 0; std::size_t buffer_size = CalculateVertexArraysSize(); @@ -535,9 +558,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } // Uniform space for the 5 shader stages - buffer_size = Common::AlignUp(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * - Maxwell::MaxShaderStage; + buffer_size = + Common::AlignUp(buffer_size, 4) + + (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers buffer_size += Maxwell::MaxConstBuffers * @@ -558,12 +581,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } // Setup emulation uniform buffer. - GLShader::MaxwellUniformData ubo; - ubo.SetFromRegs(gpu); - const auto [buffer, offset] = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, - static_cast(sizeof(ubo))); + if (!device.UseAssemblyShaders()) { + MaxwellUniformData ubo; + ubo.SetFromRegs(gpu); + const auto [buffer, offset] = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); + glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset, + static_cast(sizeof(ubo))); + } // Setup shaders and their used resources. 
texture_cache.GuardSamplers(true); @@ -635,11 +660,11 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { } buffer_cache.Acquire(); + current_cbuf = 0; auto kernel = shader_cache.GetComputeKernel(code_addr); SetupComputeTextures(kernel); SetupComputeImages(kernel); - program_manager.BindComputeShader(kernel->GetHandle()); const std::size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * @@ -652,6 +677,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { buffer_cache.Unmap(); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -812,14 +838,20 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, } void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) { + static constexpr std::array PARAMETER_LUT = { + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV}; + MICROPROFILE_SCOPE(OpenGL_UBO); const auto& stages = system.GPU().Maxwell3D().state.shader_stages; const auto& shader_stage = stages[stage_index]; - u32 binding = device.GetBaseBindings(stage_index).uniform_buffer; + u32 binding = + device.UseAssemblyShaders() ? 
0 : device.GetBaseBindings(stage_index).uniform_buffer; for (const auto& entry : shader->GetEntries().const_buffers) { const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry); } } @@ -835,16 +867,21 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { buffer.address = config.Address(); buffer.size = config.size; buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(binding++, buffer, entry); + SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry); } } -void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, +void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, + const Tegra::Engines::ConstBufferInfo& buffer, const ConstBufferEntry& entry) { if (!buffer.enabled) { // Set values to zero to unbind buffers - glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0, - sizeof(float)); + if (device.UseAssemblyShaders()) { + glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, + buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float)); + } return; } @@ -853,9 +890,19 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); const auto alignment = device.GetUniformBufferAlignment(); - const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, - device.HasFastBufferSubData()); - glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); + auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, + device.HasFastBufferSubData()); + if (!device.UseAssemblyShaders()) { + glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); + return; + } 
+ if (offset != 0) { + const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; + glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); + cbuf = staging_cbuf; + offset = 0; + } + glBindBufferRangeNV(stage, binding, cbuf, offset, size); } void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { @@ -863,7 +910,8 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad auto& memory_manager{gpu.MemoryManager()}; const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; - u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer; + u32 binding = + device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; for (const auto& entry : shader->GetEntries().global_memory_entries) { const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; const GPUVAddr gpu_addr{memory_manager.Read(addr)}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index b94c65907..87f7fe159 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -56,8 +56,8 @@ struct DrawParameters; class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, - ScreenInfo& info, GLShader::ProgramManager& program_manager, - StateTracker& state_tracker); + const Device& device, ScreenInfo& info, + ProgramManager& program_manager, StateTracker& state_tracker); ~RasterizerOpenGL() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -106,7 +106,7 @@ private: void SetupComputeConstBuffers(const Shader& kernel); /// Configures a constant buffer. 
- void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, + void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, const ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. @@ -224,7 +224,7 @@ private: void SetupShaders(GLenum primitive_mode); - const Device device; + const Device& device; TextureCacheOpenGL texture_cache; ShaderCacheOpenGL shader_cache; @@ -236,7 +236,7 @@ private: Core::System& system; ScreenInfo& screen_info; - GLShader::ProgramManager& program_manager; + ProgramManager& program_manager; StateTracker& state_tracker; static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; @@ -248,6 +248,12 @@ private: std::bitset enabled_transform_feedback_buffers; + static constexpr std::size_t NUM_CONSTANT_BUFFERS = + Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * + Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + std::array staging_cbufs{}; + std::size_t current_cbuf = 0; + /// Number of commands queued to the OpenGL driver. Reseted on flush. 
std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 97803d480..a787e27d2 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -125,6 +125,15 @@ void OGLProgram::Release() { handle = 0; } +void OGLAssemblyProgram::Release() { + if (handle == 0) { + return; + } + MICROPROFILE_SCOPE(OpenGL_ResourceDeletion); + glDeleteProgramsARB(1, &handle); + handle = 0; +} + void OGLPipeline::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index de93f4212..f8b322227 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -167,6 +167,22 @@ public: GLuint handle = 0; }; +class OGLAssemblyProgram : private NonCopyable { +public: + OGLAssemblyProgram() = default; + + OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {} + + ~OGLAssemblyProgram() { + Release(); + } + + /// Deletes the internal OpenGL resource + void Release(); + + GLuint handle = 0; +}; + class OGLPipeline : private NonCopyable { public: OGLPipeline() = default; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 9759a7078..4cd0f36cf 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -97,6 +97,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) { return {}; } +constexpr GLenum AssemblyEnum(ShaderType shader_type) { + switch (shader_type) { + case ShaderType::Vertex: + return GL_VERTEX_PROGRAM_NV; + case ShaderType::TesselationControl: + return GL_TESS_CONTROL_PROGRAM_NV; + case ShaderType::TesselationEval: + return GL_TESS_EVALUATION_PROGRAM_NV; + 
case ShaderType::Geometry: + return GL_GEOMETRY_PROGRAM_NV; + case ShaderType::Fragment: + return GL_FRAGMENT_PROGRAM_NV; + case ShaderType::Compute: + return GL_COMPUTE_PROGRAM_NV; + } + return {}; +} + std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) { return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier); } @@ -120,18 +138,43 @@ std::shared_ptr MakeRegistry(const ShaderDiskCacheEntry& entry) { return registry; } -std::shared_ptr BuildShader(const Device& device, ShaderType shader_type, - u64 unique_identifier, const ShaderIR& ir, - const Registry& registry, bool hint_retrievable = false) { +ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier, + const ShaderIR& ir, const Registry& registry, + bool hint_retrievable = false) { const std::string shader_id = MakeShaderID(unique_identifier, shader_type); LOG_INFO(Render_OpenGL, "{}", shader_id); - const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); - OGLShader shader; - shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); + auto program = std::make_shared(); + + if (device.UseAssemblyShaders()) { + const std::string arb = "Not implemented"; + + GLuint& arb_prog = program->assembly_program.handle; + +// Commented out functions signal OpenGL errors but are compatible with apitrace. +// Use them only to capture and replay on apitrace. 
+#if 0 + glGenProgramsNV(1, &arb_prog); + glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast(arb.size()), + reinterpret_cast(arb.data())); +#else + glGenProgramsARB(1, &arb_prog); + glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB, + static_cast(arb.size()), arb.data()); +#endif + const auto err = reinterpret_cast(glGetString(GL_PROGRAM_ERROR_STRING_NV)); + if (err && *err) { + LOG_CRITICAL(Render_OpenGL, "{}", err); + LOG_INFO(Render_OpenGL, "\n{}", arb); + } + } else { + const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id); + OGLShader shader; + shader.Create(glsl.c_str(), GetGLShaderType(shader_type)); + + program->source_program.Create(true, hint_retrievable, shader.handle); + } - auto program = std::make_shared(); - program->Create(true, hint_retrievable, shader.handle); return program; } @@ -153,15 +196,22 @@ std::unordered_set GetSupportedFormats() { CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, std::shared_ptr registry, - ShaderEntries entries, std::shared_ptr program) + ShaderEntries entries, ProgramSharedPtr program_) : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)}, - size_in_bytes{size_in_bytes}, program{std::move(program)} {} + size_in_bytes{size_in_bytes}, program{std::move(program_)} { + // Assign either the assembly program or source program. We can't have both. 
+ handle = program->assembly_program.handle; + if (handle == 0) { + handle = program->source_program.handle; + } + ASSERT(handle != 0); +} CachedShader::~CachedShader() = default; GLuint CachedShader::GetHandle() const { DEBUG_ASSERT(registry->IsConsistent()); - return program->handle; + return handle; } Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, @@ -239,7 +289,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, return; } - const std::vector gl_cache = disk_cache.LoadPrecompiled(); + std::vector gl_cache; + if (!device.UseAssemblyShaders()) { + // Only load precompiled cache when we are not using assembly shaders + gl_cache = disk_cache.LoadPrecompiled(); + } const auto supported_formats = GetSupportedFormats(); // Track if precompiled cache was altered during loading to know if we have to @@ -278,7 +332,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, auto registry = MakeRegistry(entry); const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry); - std::shared_ptr program; + ProgramSharedPtr program; if (precompiled_entry) { // If the shader is precompiled, attempt to load it with program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats); @@ -332,6 +386,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, return; } + if (device.UseAssemblyShaders()) { + // Don't store precompiled binaries for assembly shaders. 
+ return; + } + // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw // before precompiling them @@ -339,7 +398,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const u64 id = (*transferable)[i].unique_identifier; const auto it = find_precompiled(id); if (it == gl_cache.end()) { - const GLuint program = runtime_cache.at(id).program->handle; + const GLuint program = runtime_cache.at(id).program->source_program.handle; disk_cache.SavePrecompiled(id, program); precompiled_cache_altered = true; } @@ -350,7 +409,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, } } -std::shared_ptr ShaderCacheOpenGL::GeneratePrecompiledProgram( +ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set& supported_formats) { if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { @@ -358,15 +417,15 @@ std::shared_ptr ShaderCacheOpenGL::GeneratePrecompiledProgram( return {}; } - auto program = std::make_shared(); - program->handle = glCreateProgram(); - glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE); - glProgramBinary(program->handle, precompiled_entry.binary_format, - precompiled_entry.binary.data(), + auto program = std::make_shared(); + GLuint& handle = program->source_program.handle; + handle = glCreateProgram(); + glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE); + glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(), static_cast(precompiled_entry.binary.size())); GLint link_status; - glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status); + glGetProgramiv(handle, GL_LINK_STATUS, &link_status); if (link_status == GL_FALSE) { LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing"); return {}; diff --git 
a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 91690b470..b2ae8d7f9 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -43,8 +43,14 @@ struct UnspecializedShader; using Shader = std::shared_ptr; using Maxwell = Tegra::Engines::Maxwell3D::Regs; +struct ProgramHandle { + OGLProgram source_program; + OGLAssemblyProgram assembly_program; +}; +using ProgramSharedPtr = std::shared_ptr; + struct PrecompiledShader { - std::shared_ptr program; + ProgramSharedPtr program; std::shared_ptr registry; ShaderEntries entries; }; @@ -87,12 +93,13 @@ public: private: explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes, std::shared_ptr registry, - ShaderEntries entries, std::shared_ptr program); + ShaderEntries entries, ProgramSharedPtr program); std::shared_ptr registry; ShaderEntries entries; std::size_t size_in_bytes = 0; - std::shared_ptr program; + ProgramSharedPtr program; + GLuint handle = 0; }; class ShaderCacheOpenGL final : public RasterizerCache { @@ -115,7 +122,7 @@ protected: void FlushObjectInner(const Shader& object) override {} private: - std::shared_ptr GeneratePrecompiledProgram( + ProgramSharedPtr GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set& supported_formats); diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 9c7b0adbd..96605db84 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -6,47 +6,107 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_shader_manager.h" -namespace OpenGL::GLShader { +namespace OpenGL { -ProgramManager::ProgramManager() = default; 
+ProgramManager::ProgramManager(const Device& device) { + use_assembly_programs = device.UseAssemblyShaders(); + if (use_assembly_programs) { + glEnable(GL_COMPUTE_PROGRAM_NV); + } else { + graphics_pipeline.Create(); + glBindProgramPipeline(graphics_pipeline.handle); + } +} ProgramManager::~ProgramManager() = default; -void ProgramManager::Create() { - graphics_pipeline.Create(); - glBindProgramPipeline(graphics_pipeline.handle); +void ProgramManager::BindCompute(GLuint program) { + if (use_assembly_programs) { + glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program); + } else { + is_graphics_bound = false; + glUseProgram(program); + } } void ProgramManager::BindGraphicsPipeline() { + if (use_assembly_programs) { + UpdateAssemblyPrograms(); + } else { + UpdateSourcePrograms(); + } +} + +void ProgramManager::BindHostPipeline(GLuint pipeline) { + if (use_assembly_programs) { + if (geometry_enabled) { + geometry_enabled = false; + old_state.geometry = 0; + glDisable(GL_GEOMETRY_PROGRAM_NV); + } + } + glBindProgramPipeline(pipeline); +} + +void ProgramManager::RestoreGuestPipeline() { + if (use_assembly_programs) { + glBindProgramPipeline(0); + } else { + glBindProgramPipeline(graphics_pipeline.handle); + } +} + +void ProgramManager::UpdateAssemblyPrograms() { + const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) { + if (current == old) { + return; + } + if (current == 0) { + if (enabled) { + enabled = false; + glDisable(stage); + } + return; + } + if (!enabled) { + enabled = true; + glEnable(stage); + } + glBindProgramARB(stage, current); + }; + + update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex); + update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry, + old_state.geometry); + update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment, + old_state.fragment); + + old_state = current_state; +} + +void ProgramManager::UpdateSourcePrograms() { if 
(!is_graphics_bound) { is_graphics_bound = true; glUseProgram(0); } - // Avoid updating the pipeline when values have no changed - if (old_state == current_state) { - return; - } - - // Workaround for AMD bug - static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | - GL_FRAGMENT_SHADER_BIT}; const GLuint handle = graphics_pipeline.handle; - glUseProgramStages(handle, all_used_stages, 0); - glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader); - glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader); - glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader); + const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) { + if (current == old) { + return; + } + glUseProgramStages(handle, stage, current); + }; + update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex); + update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry); + update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment); old_state = current_state; } -void ProgramManager::BindComputeShader(GLuint program) { - is_graphics_bound = false; - glUseProgram(program); -} - void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { const auto& regs = maxwell.regs; @@ -54,4 +114,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) { y_direction = regs.screen_y_control.y_negate == 0 ? 
1.0f : -1.0f; } -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index d2e47f2a9..0f03b4f12 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -11,7 +11,9 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" -namespace OpenGL::GLShader { +namespace OpenGL { + +class Device; /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at @@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384, class ProgramManager { public: - explicit ProgramManager(); + explicit ProgramManager(const Device& device); ~ProgramManager(); - void Create(); + /// Binds a compute program + void BindCompute(GLuint program); - /// Updates the graphics pipeline and binds it. + /// Updates bound programs. void BindGraphicsPipeline(); - /// Binds a compute shader. - void BindComputeShader(GLuint program); + /// Binds an OpenGL pipeline object unsynchronized with the guest state. + void BindHostPipeline(GLuint pipeline); + + /// Rewinds BindHostPipeline state changes. 
+ void RestoreGuestPipeline(); void UseVertexShader(GLuint program) { - current_state.vertex_shader = program; + current_state.vertex = program; } void UseGeometryShader(GLuint program) { - current_state.geometry_shader = program; + current_state.geometry = program; } void UseFragmentShader(GLuint program) { - current_state.fragment_shader = program; + current_state.fragment = program; } private: struct PipelineState { - bool operator==(const PipelineState& rhs) const noexcept { - return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader && - geometry_shader == rhs.geometry_shader; - } - - bool operator!=(const PipelineState& rhs) const noexcept { - return !operator==(rhs); - } - - GLuint vertex_shader = 0; - GLuint fragment_shader = 0; - GLuint geometry_shader = 0; + GLuint vertex = 0; + GLuint geometry = 0; + GLuint fragment = 0; }; + /// Update NV_gpu_program5 programs. + void UpdateAssemblyPrograms(); + + /// Update GLSL programs. + void UpdateSourcePrograms(); + OGLPipeline graphics_pipeline; - OGLPipeline compute_pipeline; + PipelineState current_state; PipelineState old_state; + + bool use_assembly_programs = false; + bool is_graphics_bound = true; + + bool vertex_enabled = false; + bool geometry_enabled = false; + bool fragment_enabled = false; }; -} // namespace OpenGL::GLShader +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index b2a179746..6b489e6db 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -316,7 +316,7 @@ public: RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system, Core::Frontend::GraphicsContext& context) : RendererBase{emu_window}, emu_window{emu_window}, system{system}, context{context}, - has_debug_tool{HasDebugTool()} {} + program_manager{device}, has_debug_tool{HasDebugTool()} {} 
RendererOpenGL::~RendererOpenGL() = default; @@ -468,8 +468,9 @@ void RendererOpenGL::InitOpenGLObjects() { vertex_program.Create(true, false, vertex_shader.handle); fragment_program.Create(true, false, fragment_shader.handle); - // Create program pipeline - program_manager.Create(); + pipeline.Create(); + glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle); + glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle); // Generate VBO handle for drawing vertex_buffer.Create(); @@ -508,7 +509,7 @@ void RendererOpenGL::CreateRasterizer() { if (rasterizer) { return; } - rasterizer = std::make_unique(system, emu_window, screen_info, + rasterizer = std::make_unique(system, emu_window, device, screen_info, program_manager, state_tracker); } @@ -620,10 +621,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { state_tracker.NotifyClipControl(); state_tracker.NotifyAlphaTest(); - program_manager.UseVertexShader(vertex_program.handle); - program_manager.UseGeometryShader(0); - program_manager.UseFragmentShader(fragment_program.handle); - program_manager.BindGraphicsPipeline(); + program_manager.BindHostPipeline(pipeline.handle); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { @@ -665,6 +663,8 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glClear(GL_COLOR_BUFFER_BIT); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + + program_manager.RestoreGuestPipeline(); } bool RendererOpenGL::TryPresent(int timeout_ms) { diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 50b647661..61bf507f4 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -9,6 +9,7 @@ #include "common/common_types.h" #include "common/math_util.h" #include "video_core/renderer_base.h" +#include "video_core/renderer_opengl/gl_device.h" #include 
"video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -95,6 +96,7 @@ private: Core::Frontend::EmuWindow& emu_window; Core::System& system; Core::Frontend::GraphicsContext& context; + const Device device; StateTracker state_tracker{system}; @@ -102,13 +104,14 @@ private: OGLBuffer vertex_buffer; OGLProgram vertex_program; OGLProgram fragment_program; + OGLPipeline pipeline; OGLFramebuffer screenshot_framebuffer; /// Display information for Switch screen ScreenInfo screen_info; /// Global dummy shader pipeline - GLShader::ProgramManager program_manager; + ProgramManager program_manager; /// OpenGL framebuffer data std::vector gl_framebuffer_data;