Merge pull request #10916 from ameerj/lolmem
OpenGL: Add Local Memory warmup shader for Nvidia
This commit is contained in:
commit
dafbc86366
|
@ -461,7 +461,7 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I
|
||||||
header += fmt::format("R{},", index);
|
header += fmt::format("R{},", index);
|
||||||
}
|
}
|
||||||
if (program.local_memory_size > 0) {
|
if (program.local_memory_size > 0) {
|
||||||
header += fmt::format("lmem[{}],", program.local_memory_size);
|
header += fmt::format("lmem[{}],", Common::DivCeil(program.local_memory_size, 4U));
|
||||||
}
|
}
|
||||||
if (program.info.uses_fswzadd) {
|
if (program.info.uses_fswzadd) {
|
||||||
header += "FSWZA[4],FSWZB[4],";
|
header += "FSWZA[4],FSWZB[4],";
|
||||||
|
|
|
@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) {
|
||||||
info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
|
info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
|
||||||
info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
|
info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
|
||||||
break;
|
break;
|
||||||
|
case IR::Opcode::LoadLocal:
|
||||||
|
case IR::Opcode::WriteLocal:
|
||||||
|
info.uses_local_memory = true;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -172,6 +172,7 @@ struct Info {
|
||||||
bool stores_indexed_attributes{};
|
bool stores_indexed_attributes{};
|
||||||
|
|
||||||
bool stores_global_memory{};
|
bool stores_global_memory{};
|
||||||
|
bool uses_local_memory{};
|
||||||
|
|
||||||
bool uses_fp16{};
|
bool uses_fp16{};
|
||||||
bool uses_fp64{};
|
bool uses_fp64{};
|
||||||
|
|
|
@ -33,6 +33,7 @@ set(SHADER_FILES
|
||||||
opengl_fidelityfx_fsr.frag
|
opengl_fidelityfx_fsr.frag
|
||||||
opengl_fidelityfx_fsr_easu.frag
|
opengl_fidelityfx_fsr_easu.frag
|
||||||
opengl_fidelityfx_fsr_rcas.frag
|
opengl_fidelityfx_fsr_rcas.frag
|
||||||
|
opengl_lmem_warmup.comp
|
||||||
opengl_present.frag
|
opengl_present.frag
|
||||||
opengl_present.vert
|
opengl_present.vert
|
||||||
opengl_present_scaleforce.frag
|
opengl_present_scaleforce.frag
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
|
||||||
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
// This shader is a workaround for a quirk in NVIDIA OpenGL drivers
|
||||||
|
// Shaders using local memory see a great performance benefit if a shader that was dispatched
|
||||||
|
// before it had more local memory allocated.
|
||||||
|
// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that
|
||||||
|
// subsequent shaders see the performance boost.
|
||||||
|
|
||||||
|
// NOTE: This shader does no actual meaningful work and returns immediately,
|
||||||
|
// it is simply a means to have the driver expect a shader using lots of local memory.
|
||||||
|
|
||||||
|
#version 450

// One invocation per work group. Together with a 1x1x1 dispatch this means
// gl_GlobalInvocationID is (0, 0, 0) for the only invocation that runs.
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

// Only referenced by the code after the early return in main().
layout(location = 0) uniform uint uniform_data;

// Target of the imageStore() at the end of main(), which acts as the shader's side effect.
layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image;

#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler
#define NUM_LMEM_CONSTANTS 1
// Parenthesized so the subtraction stays a single operand wherever the macro is expanded.
#define ARRAY_SIZE (MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS)

// Large array whose only purpose is to make the shader require a lot of local memory.
uint lmem_0[ARRAY_SIZE];
const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0));

void main() {
    const uint global_id = gl_GlobalInvocationID.x;
    if (global_id <= 128) {
        // Since the shader is called with a dispatch of 1x1x1
        // This should always be the case, and this shader will not actually execute
        return;
    }

    // Everything below is expected to be unreachable at runtime; it only exists so that
    // lmem_0 and constant_values are referenced by the compiled program.
    for (uint t = 0; t < uniform_data; t++) {
        const uint offset = (t * uniform_data);
        lmem_0[offset] = t;
    }
    const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x);
    const uint value = lmem_0[offset];
    const uint const_value = constant_values[offset / 4][offset % 4];
    const uvec4 color = uvec4(value + const_value);

    // A "side-effect" is needed so the variables don't get optimized out,
    // but this should never execute so there should be no clobbering of previously bound state.
    imageStore(dest_image, ivec3(gl_GlobalInvocationID), color);
}
|
|
@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac
|
||||||
writes_global_memory = !use_storage_buffers &&
|
writes_global_memory = !use_storage_buffers &&
|
||||||
std::ranges::any_of(info.storage_buffers_descriptors,
|
std::ranges::any_of(info.storage_buffers_descriptors,
|
||||||
[](const auto& desc) { return desc.is_written; });
|
[](const auto& desc) { return desc.is_written; });
|
||||||
|
uses_local_memory = info.uses_local_memory;
|
||||||
if (force_context_flush) {
|
if (force_context_flush) {
|
||||||
std::scoped_lock lock{built_mutex};
|
std::scoped_lock lock{built_mutex};
|
||||||
built_fence.Create();
|
built_fence.Create();
|
||||||
|
|
|
@ -59,6 +59,10 @@ public:
|
||||||
return writes_global_memory;
|
return writes_global_memory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] bool UsesLocalMemory() const noexcept {
|
||||||
|
return uses_local_memory;
|
||||||
|
}
|
||||||
|
|
||||||
void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_,
|
void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_,
|
||||||
Tegra::MemoryManager* gpu_memory_) {
|
Tegra::MemoryManager* gpu_memory_) {
|
||||||
kepler_compute = kepler_compute_;
|
kepler_compute = kepler_compute_;
|
||||||
|
@ -84,6 +88,7 @@ private:
|
||||||
|
|
||||||
bool use_storage_buffers{};
|
bool use_storage_buffers{};
|
||||||
bool writes_global_memory{};
|
bool writes_global_memory{};
|
||||||
|
bool uses_local_memory{};
|
||||||
|
|
||||||
std::mutex built_mutex;
|
std::mutex built_mutex;
|
||||||
std::condition_variable built_condvar;
|
std::condition_variable built_condvar;
|
||||||
|
|
|
@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) {
|
||||||
has_bool_ref_bug = true;
|
has_bool_ref_bug = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
has_lmem_perf_bug = is_nvidia;
|
||||||
|
|
||||||
strict_context_required = emu_window.StrictContextRequired();
|
strict_context_required = emu_window.StrictContextRequired();
|
||||||
// Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation.
|
// Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation.
|
||||||
|
|
|
@ -192,6 +192,10 @@ public:
|
||||||
return supports_conditional_barriers;
|
return supports_conditional_barriers;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool HasLmemPerfBug() const {
|
||||||
|
return has_lmem_perf_bug;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static bool TestVariableAoffi();
|
static bool TestVariableAoffi();
|
||||||
static bool TestPreciseBug();
|
static bool TestPreciseBug();
|
||||||
|
@ -238,6 +242,7 @@ private:
|
||||||
bool can_report_memory{};
|
bool can_report_memory{};
|
||||||
bool strict_context_required{};
|
bool strict_context_required{};
|
||||||
bool supports_conditional_barriers{};
|
bool supports_conditional_barriers{};
|
||||||
|
bool has_lmem_perf_bug{};
|
||||||
|
|
||||||
std::string vendor_name;
|
std::string vendor_name;
|
||||||
};
|
};
|
||||||
|
|
|
@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
|
||||||
|
|
||||||
writes_global_memory |= std::ranges::any_of(
|
writes_global_memory |= std::ranges::any_of(
|
||||||
info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; });
|
info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; });
|
||||||
|
uses_local_memory |= info.uses_local_memory;
|
||||||
}
|
}
|
||||||
ASSERT(num_textures <= MAX_TEXTURES);
|
ASSERT(num_textures <= MAX_TEXTURES);
|
||||||
ASSERT(num_images <= MAX_IMAGES);
|
ASSERT(num_images <= MAX_IMAGES);
|
||||||
|
|
|
@ -98,6 +98,10 @@ public:
|
||||||
return writes_global_memory;
|
return writes_global_memory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] bool UsesLocalMemory() const noexcept {
|
||||||
|
return uses_local_memory;
|
||||||
|
}
|
||||||
|
|
||||||
[[nodiscard]] bool IsBuilt() noexcept;
|
[[nodiscard]] bool IsBuilt() noexcept;
|
||||||
|
|
||||||
template <typename Spec>
|
template <typename Spec>
|
||||||
|
@ -146,6 +150,7 @@ private:
|
||||||
|
|
||||||
bool use_storage_buffers{};
|
bool use_storage_buffers{};
|
||||||
bool writes_global_memory{};
|
bool writes_global_memory{};
|
||||||
|
bool uses_local_memory{};
|
||||||
|
|
||||||
static constexpr std::size_t XFB_ENTRY_STRIDE = 3;
|
static constexpr std::size_t XFB_ENTRY_STRIDE = 3;
|
||||||
GLsizei num_xfb_attribs{};
|
GLsizei num_xfb_attribs{};
|
||||||
|
|
|
@ -222,6 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
|
||||||
gpu.TickWork();
|
gpu.TickWork();
|
||||||
|
|
||||||
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
|
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
|
||||||
|
if (pipeline->UsesLocalMemory()) {
|
||||||
|
program_manager.LocalMemoryWarmup();
|
||||||
|
}
|
||||||
pipeline->SetEngine(maxwell3d, gpu_memory);
|
pipeline->SetEngine(maxwell3d, gpu_memory);
|
||||||
pipeline->Configure(is_indexed);
|
pipeline->Configure(is_indexed);
|
||||||
|
|
||||||
|
@ -371,6 +374,9 @@ void RasterizerOpenGL::DispatchCompute() {
|
||||||
if (!pipeline) {
|
if (!pipeline) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (pipeline->UsesLocalMemory()) {
|
||||||
|
program_manager.LocalMemoryWarmup();
|
||||||
|
}
|
||||||
pipeline->SetEngine(kepler_compute, gpu_memory);
|
pipeline->SetEngine(kepler_compute, gpu_memory);
|
||||||
pipeline->Configure();
|
pipeline->Configure();
|
||||||
const auto& qmd{kepler_compute->launch_description};
|
const auto& qmd{kepler_compute->launch_description};
|
||||||
|
|
|
@ -3,7 +3,9 @@
|
||||||
|
|
||||||
#include <glad/glad.h>
|
#include <glad/glad.h>
|
||||||
|
|
||||||
|
#include "video_core/host_shaders/opengl_lmem_warmup_comp.h"
|
||||||
#include "video_core/renderer_opengl/gl_shader_manager.h"
|
#include "video_core/renderer_opengl/gl_shader_manager.h"
|
||||||
|
#include "video_core/renderer_opengl/gl_shader_util.h"
|
||||||
|
|
||||||
namespace OpenGL {
|
namespace OpenGL {
|
||||||
|
|
||||||
|
@ -17,6 +19,10 @@ ProgramManager::ProgramManager(const Device& device) {
|
||||||
if (device.UseAssemblyShaders()) {
|
if (device.UseAssemblyShaders()) {
|
||||||
glEnable(GL_COMPUTE_PROGRAM_NV);
|
glEnable(GL_COMPUTE_PROGRAM_NV);
|
||||||
}
|
}
|
||||||
|
if (device.HasLmemPerfBug()) {
|
||||||
|
lmem_warmup_program =
|
||||||
|
CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ProgramManager::BindComputeProgram(GLuint program) {
|
void ProgramManager::BindComputeProgram(GLuint program) {
|
||||||
|
@ -98,6 +104,13 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU
|
||||||
|
|
||||||
void ProgramManager::RestoreGuestCompute() {}
|
void ProgramManager::RestoreGuestCompute() {}
|
||||||
|
|
||||||
|
void ProgramManager::LocalMemoryWarmup() {
|
||||||
|
if (lmem_warmup_program.handle != 0) {
|
||||||
|
BindComputeProgram(lmem_warmup_program.handle);
|
||||||
|
glDispatchCompute(1, 1, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void ProgramManager::BindPipeline() {
|
void ProgramManager::BindPipeline() {
|
||||||
if (!is_pipeline_bound) {
|
if (!is_pipeline_bound) {
|
||||||
is_pipeline_bound = true;
|
is_pipeline_bound = true;
|
||||||
|
|
|
@ -30,6 +30,8 @@ public:
|
||||||
|
|
||||||
void RestoreGuestCompute();
|
void RestoreGuestCompute();
|
||||||
|
|
||||||
|
void LocalMemoryWarmup();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void BindPipeline();
|
void BindPipeline();
|
||||||
|
|
||||||
|
@ -44,6 +46,7 @@ private:
|
||||||
u32 current_stage_mask = 0;
|
u32 current_stage_mask = 0;
|
||||||
std::array<GLuint, NUM_STAGES> current_programs{};
|
std::array<GLuint, NUM_STAGES> current_programs{};
|
||||||
GLuint current_assembly_compute_program = 0;
|
GLuint current_assembly_compute_program = 0;
|
||||||
|
OGLProgram lmem_warmup_program;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace OpenGL
|
} // namespace OpenGL
|
||||||
|
|
Reference in New Issue