From a07c88e686fb9b65924876d472a8184f1f1849df Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 22 Aug 2023 12:28:25 +0200
Subject: [PATCH] Query Cache: Simplify Prefix Sum compute shader

---
 src/video_core/host_shaders/CMakeLists.txt    |   4 +-
 .../host_shaders/queries_prefix_scan_sum.comp | 166 +++++++++++-------
 .../queries_prefix_scan_sum_nosubgroups.comp  | 120 +++++++++++++
 .../renderer_vulkan/vk_compute_pass.cpp       |  27 ++-
 .../renderer_vulkan/vk_compute_pass.h         |   4 +-
 .../renderer_vulkan/vk_query_cache.cpp        |   4 +-
 6 files changed, 252 insertions(+), 73 deletions(-)
 create mode 100644 src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp

diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 8218ec4c8..6b912027f 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SHADER_FILES
     present_bicubic.frag
     present_gaussian.frag
     queries_prefix_scan_sum.comp
+    queries_prefix_scan_sum_nosubgroups.comp
     resolve_conditional_render.comp
     smaa_edge_detection.vert
     smaa_edge_detection.frag
@@ -72,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND")
 endif()
 
 set(GLSL_FLAGS "")
+set(SPIR_V_VERSION "spirv1.3")
 set(QUIET_FLAG "--quiet")
 
 set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
@@ -125,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
             OUTPUT
                 ${SPIRV_HEADER_FILE}
             COMMAND
-                ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
+                ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION}
             MAIN_DEPENDENCY
                 ${SOURCE_FILE}
         )
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
index dce1279fe..8f10e248e 100644
--- a/src/video_core/host_shaders/queries_prefix_scan_sum.comp
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp
@@ -1,26 +1,24 @@
-// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
-// SPDX-License-Identifier: MIT
-
-// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
-// Nicholas Haemel. Modified to suit needs and optimize for subgroup
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
 
 #version 460 core
 
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+
 #ifdef VULKAN
 
-#extension GL_KHR_shader_subgroup_arithmetic : enable
 #define HAS_EXTENDED_TYPES 1
 #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
-#define END_PUSH_CONSTANTS                                                                         \
-    }                                                                                              \
-    ;
+#define END_PUSH_CONSTANTS };
 #define UNIFORM(n)
 #define BINDING_INPUT_BUFFER 0
 #define BINDING_OUTPUT_IMAGE 1
 
 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
 
-#extension GL_KHR_shader_subgroup_arithmetic : enable
 #extension GL_NV_gpu_shader5 : enable
 #ifdef GL_NV_gpu_shader5
 #define HAS_EXTENDED_TYPES 1
@@ -43,19 +41,20 @@ END_PUSH_CONSTANTS
 layout(local_size_x = 32) in;
 
 layout(std430, binding = 0) readonly buffer block1 {
-    uvec2 input_data[gl_WorkGroupSize.x];
+    uvec2 input_data[];
 };
 
-layout(std430, binding = 1) writeonly coherent buffer block2 {
-    uvec2 output_data[gl_WorkGroupSize.x];
+layout(std430, binding = 1) coherent buffer block2 {
+    uvec2 output_data[];
 };
 
 layout(std430, binding = 2) coherent buffer block3 {
     uvec2 accumulated_data;
 };
 
-shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+shared uvec2 shared_data[2];
 
+// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
 uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
     uint carry = 0;
     uvec2 result;
@@ -64,61 +63,102 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
     return result;
 }
 
-void main(void) {
-    uint id = gl_LocalInvocationID.x;
-    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
-    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
-    uint work_size = gl_WorkGroupSize.x;
-    uint rd_id;
-    uint wr_id;
-    uint mask;
-    uvec2 input_1 = input_data[id * 2];
-    uvec2 input_2 = input_data[id * 2 + 1];
-    // The number of steps is the log base 2 of the
-    // work group size, which should be a power of 2
-    const uint steps = uint(log2(work_size)) + 1;
-    uint step = 0;
+// Performs a subgroup inclusive prefix sum using Hillis and Steele's algorithm
+uvec2 subgroupInclusiveAddUint64(uvec2 value) {
+    uvec2 result = value;
+    for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
+        if (i <= gl_SubgroupInvocationID) {
+            uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
+            result = AddUint64(result, other);
+        }
+    }
+    return result;
+}
 
-    // Each invocation is responsible for the content of
-    // two elements of the output array
-    shared_data[id * 2] = input_1;
-    shared_data[id * 2 + 1] = input_2;
-    // Synchronize to make sure that everyone has initialized
-    // their elements of shared_data[] with data loaded from
-    // the input arrays
+// Writes the results to the output buffer and to the accumulation buffer
+void WriteResults(uvec2 result) {
+    uint current_global_id = gl_GlobalInvocationID.x;
+    uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0);
+    output_data[current_global_id] = result + base_data;
+    if (max_accumulation_base >= accumulation_limit + 1) {
+        if (current_global_id == accumulation_limit) {
+            accumulated_data = result;
+        }
+        return;
+    }
+    // Awkward case: the accumulation data is reset somewhere in the middle.
+    barrier();
+    groupMemoryBarrier();
+    if (current_global_id == accumulation_limit) {
+        uvec2 value_1 = output_data[max_accumulation_base];
+        accumulated_data = AddUint64(result, -value_1);
+    }
+}
+
+void main() {
+    uint subgroup_inv_id = gl_SubgroupInvocationID;
+    uint subgroup_id = gl_SubgroupID;
+    uint last_subgroup_id = subgroupMax(subgroup_inv_id);
+    uint current_global_id = gl_GlobalInvocationID.x;
+    uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
+    uvec2 data = input_data[current_global_id];
+    // make sure all input data has been loaded
+    subgroupBarrier();
+    subgroupMemoryBarrier();
+
+    uvec2 result = subgroupInclusiveAddUint64(data);
+
+    // If the total work fits within a single subgroup, just write out the results.
+    if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
+        WriteResults(result);
+        return;
+    }
+
+    // We now have more, so let's write each subgroup's last result into shared memory.
+    // Only the last invocation of each subgroup does the write.
+    if (subgroup_inv_id == last_subgroup_id) {
+        shared_data[subgroup_id] = result;
+    }
+    // Wait until every subgroup has stored its result.
     barrier();
     memoryBarrierShared();
-    // For each step...
-    for (step = 0; step < steps; step++) {
-        // Calculate the read and write index in the
-        // shared array
-        mask = (1 << step) - 1;
-        rd_id = ((id >> step) << (step + 1)) + mask;
-        wr_id = rd_id + 1 + (id & mask);
-        // Accumulate the read data into our element
 
-        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
-        // Synchronize again to make sure that everyone
-        // has caught up with us
+    // Case 1: the total work for the grouped results can be calculated in a single subgroup
+    // operation (about 1024 queries).
+    uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x;
+    if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
+        if (subgroup_id != 0) {
+            uvec2 tmp = shared_data[subgroup_inv_id];
+            subgroupBarrier();
+            subgroupMemoryBarrierShared();
+            tmp = subgroupInclusiveAddUint64(tmp);
+            result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1));
+        }
+
+        WriteResults(result);
+        return;
+    }
+
+    // Case 2: our work amount is huge, so let's do it in O(log n) steps.
+    const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0;
+    const uint steps = 1 << (findMSB(total_extra_work) + extra);
+    uint step;
+    // Hillis and Steele's algorithm
+    for (step = 1; step < steps; step *= 2) {
+        if (current_global_id < steps && current_global_id >= step) {
+            uvec2 current = shared_data[current_global_id];
+            uvec2 other = shared_data[current_global_id - step];
+            shared_data[current_global_id] = AddUint64(current, other);
+        }
+        // steps is constant, so this will always execute in every workgroup's thread.
         barrier();
         memoryBarrierShared();
     }
-    // Add the accumulation
-    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
-    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
-    barrier();
-    memoryBarrierShared();
-
-    // Finally write our data back to the output buffer
-    output_data[id * 2] = shared_data[id * 2];
-    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
-    if (id == 0) {
-        if (max_accumulation_base >= accumulation_limit + 1) {
-            accumulated_data = shared_data[accumulation_limit];
-            return;
-        }
-        uvec2 value_1 = shared_data[max_accumulation_base];
-        uvec2 value_2 = shared_data[accumulation_limit];
-        accumulated_data = AddUint64(value_1, -value_2);
+    // Only add results for groups higher than 0
+    if (subgroup_id != 0) {
+        result = AddUint64(result, shared_data[subgroup_id - 1]);
     }
+
+    // Just write the final results. We are done
+    WriteResults(result);
 }
\ No newline at end of file
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
new file mode 100644
index 000000000..8021476ed
--- /dev/null
+++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp
@@ -0,0 +1,120 @@
+// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
+// SPDX-License-Identifier: MIT
+
+// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
+// Nicholas Haemel. Modified to suit needs.
+
+#version 460 core
+
+#ifdef VULKAN
+
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout(location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uint max_accumulation_base;
+UNIFORM(1) uint accumulation_limit;
+END_PUSH_CONSTANTS
+
+layout(local_size_x = 32) in;
+
+layout(std430, binding = 0) readonly buffer block1 {
+    uvec2 input_data[gl_WorkGroupSize.x];
+};
+
+layout(std430, binding = 1) writeonly coherent buffer block2 {
+    uvec2 output_data[gl_WorkGroupSize.x];
+};
+
+layout(std430, binding = 2) coherent buffer block3 {
+    uvec2 accumulated_data;
+};
+
+shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
+
+uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
+    uint carry = 0;
+    uvec2 result;
+    result.x = uaddCarry(value_1.x, value_2.x, carry);
+    result.y = value_1.y + value_2.y + carry;
+    return result;
+}
+
+void main(void) {
+    uint id = gl_LocalInvocationID.x;
+    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
+    uint work_size = gl_WorkGroupSize.x;
+    uint rd_id;
+    uint wr_id;
+    uint mask;
+    uvec2 input_1 = input_data[id * 2];
+    uvec2 input_2 = input_data[id * 2 + 1];
+    // The number of steps is the log base 2 of the
+    // work group size, which should be a power of 2
+    const uint steps = uint(log2(work_size)) + 1;
+    uint step = 0;
+
+    // Each invocation is responsible for the content of
+    // two elements of the output array
+    shared_data[id * 2] = input_1;
+    shared_data[id * 2 + 1] = input_2;
+    // Synchronize to make sure that everyone has initialized
+    // their elements of shared_data[] with data loaded from
+    // the input arrays
+    barrier();
+    memoryBarrierShared();
+    // For each step...
+    for (step = 0; step < steps; step++) {
+        // Calculate the read and write index in the
+        // shared array
+        mask = (1 << step) - 1;
+        rd_id = ((id >> step) << (step + 1)) + mask;
+        wr_id = rd_id + 1 + (id & mask);
+        // Accumulate the read data into our element
+
+        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
+        // Synchronize again to make sure that everyone
+        // has caught up with us
+        barrier();
+        memoryBarrierShared();
+    }
+    // Add the accumulation
+    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
+    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
+    barrier();
+    memoryBarrierShared();
+
+    // Finally write our data back to the output buffer
+    output_data[id * 2] = shared_data[id * 2];
+    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
+    if (id == 0) {
+        if (max_accumulation_base >= accumulation_limit + 1) {
+            accumulated_data = shared_data[accumulation_limit];
+            return;
+        }
+        uvec2 value_1 = shared_data[max_accumulation_base];
+        uvec2 value_2 = shared_data[accumulation_limit];
+        accumulated_data = AddUint64(value_1, -value_2);
+    }
+}
\ No newline at end of file
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index a1af08cda..44ec5a032 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -13,6 +13,7 @@
 #include "common/div_ceil.h"
 #include "video_core/host_shaders/astc_decoder_comp_spv.h"
 #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
+#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h"
 #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
 #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
 #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
@@ -187,7 +188,8 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
                          vk::Span<VkDescriptorSetLayoutBinding> bindings,
                          vk::Span<VkDescriptorUpdateTemplateEntry> templates,
                          const DescriptorBankInfo& bank_info,
-                         vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code)
+                         vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
+                         std::optional<u32> optional_subgroup_size)
     : device{device_} {
     descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
@@ -228,13 +230,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
         .pCode = code.data(),
     });
     device.SaveShader(code);
+    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT,
+        .pNext = nullptr,
+        .requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U,
+    };
+    bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size;
     pipeline = device.GetLogical().CreateComputePipeline({
         .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
         .stage{
             .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
-            .pNext = nullptr,
+            .pNext = use_setup_size ? &subgroup_size_ci : nullptr,
             .flags = 0,
             .stage = VK_SHADER_STAGE_COMPUTE_BIT,
             .module = *module,
@@ -399,10 +407,17 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
 QueriesPrefixScanPass::QueriesPrefixScanPass(
     const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
     ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
-    : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
-                  QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
-                  COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
-                  QUERIES_PREFIX_SCAN_SUM_COMP_SPV),
+    : ComputePass(
+          device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
+          QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
+          COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
+          device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) &&
+                  device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) &&
+                  device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) &&
+                  device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT)
+              ? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV)
+              : std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV),
+          {32}),
       scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
 
 void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h
index e6ff86e9a..68ffb1b82 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <optional>
 #include <span>
 #include <utility>
 
@@ -31,7 +32,8 @@ public:
                          vk::Span<VkDescriptorSetLayoutBinding> bindings,
                          vk::Span<VkDescriptorUpdateTemplateEntry> templates,
                          const DescriptorBankInfo& bank_info,
-                         vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code);
+                         vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
+                         std::optional<u32> optional_subgroup_size = std::nullopt);
     ~ComputePass();
 
 protected:
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index ded190ae0..825e1a72e 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -1376,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
             return true;
         }
     }
-    /*if (!is_in_bc[0] && !is_in_bc[1]) {
+    if (!is_in_bc[0] && !is_in_bc[1]) {
         // Both queries are in query cache, it's best to just flush.
         return true;
-    }*/
+    }
     HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
     return true;
 }