Query Cache: Implement host side sample counting.
This commit is contained in:
parent
2fea1b8407
commit
c8237d5c31
|
@ -41,6 +41,7 @@ set(SHADER_FILES
|
|||
pitch_unswizzle.comp
|
||||
present_bicubic.frag
|
||||
present_gaussian.frag
|
||||
queries_prefix_scan_sum.comp
|
||||
resolve_conditional_render.comp
|
||||
smaa_edge_detection.vert
|
||||
smaa_edge_detection.frag
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
|
||||
// Nicholas Haemel. Modified to suit needs and optimize for subgroup
|
||||
|
||||
#version 460 core
|
||||
|
||||
#ifdef VULKAN
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||
#define HAS_EXTENDED_TYPES 1
|
||||
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
|
||||
#define END_PUSH_CONSTANTS \
|
||||
} \
|
||||
;
|
||||
#define UNIFORM(n)
|
||||
#define BINDING_INPUT_BUFFER 0
|
||||
#define BINDING_OUTPUT_IMAGE 1
|
||||
|
||||
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||
#extension GL_NV_gpu_shader5 : enable
|
||||
#ifdef GL_NV_gpu_shader5
|
||||
#define HAS_EXTENDED_TYPES 1
|
||||
#else
|
||||
#define HAS_EXTENDED_TYPES 0
|
||||
#endif
|
||||
#define BEGIN_PUSH_CONSTANTS
|
||||
#define END_PUSH_CONSTANTS
|
||||
#define UNIFORM(n) layout(location = n) uniform
|
||||
#define BINDING_INPUT_BUFFER 0
|
||||
#define BINDING_OUTPUT_IMAGE 0
|
||||
|
||||
#endif
|
||||
|
||||
BEGIN_PUSH_CONSTANTS
|
||||
UNIFORM(0) uint max_accumulation_base;
|
||||
UNIFORM(1) uint accumulation_limit;
|
||||
END_PUSH_CONSTANTS
|
||||
|
||||
layout(local_size_x = 32) in;
|
||||
|
||||
// Source values for the scan; each element is a 64-bit value stored as uvec2(low, high).
// main() reads two elements per invocation (indices id * 2 and id * 2 + 1), which reaches
// gl_WorkGroupSize.x * 2 entries. The array is therefore left unsized instead of being declared
// with only gl_WorkGroupSize.x elements, which made the upper half of those accesses out of
// bounds.
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

// Scan results, written with the same two-elements-per-invocation indexing as input_data.
layout(std430, binding = 1) writeonly coherent buffer block2 {
    uvec2 output_data[];
};
|
||||
|
||||
layout(std430, binding = 2) coherent buffer block3 {
|
||||
uvec2 accumulated_data;
|
||||
};
|
||||
|
||||
shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
|
||||
|
||||
// Adds two 64-bit unsigned integers, each packed as uvec2(low word, high word).
// The carry out of the low-word addition is folded into the high word; overflow of the
// high word wraps around.
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry = 0;
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}
|
||||
|
||||
// Computes a prefix sum over 64-bit values (each packed as uvec2(low, high)) read from
// input_data, adds the running total stored in accumulated_data to the elements located before
// max_accumulation_base, writes the results to output_data and finally updates
// accumulated_data for the next run.
// Every invocation owns two consecutive elements: id * 2 and id * 2 + 1.
void main(void) {
    uint id = gl_LocalInvocationID.x;
    // Running total carried over from previous runs; only applied to the elements that sit
    // before max_accumulation_base.
    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
    uint work_size = gl_WorkGroupSize.x;
    uint rd_id;
    uint wr_id;
    uint mask;
    uvec2 input_1 = input_data[id * 2];
    uvec2 input_2 = input_data[id * 2 + 1];
    // The number of steps is the log base 2 of the number of elements handled by the work
    // group (twice the work group size, which should be a power of 2)
    const uint steps = uint(log2(work_size)) + 1;
    uint step = 0;

    // Each invocation is responsible for the content of
    // two elements of the output array
    shared_data[id * 2] = input_1;
    shared_data[id * 2 + 1] = input_2;
    // Synchronize to make sure that everyone has initialized
    // their elements of shared_data[] with data loaded from
    // the input arrays
    barrier();
    memoryBarrierShared();
    // For each step...
    for (step = 0; step < steps; step++) {
        // Calculate the read and write index in the
        // shared array
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);
        // Accumulate the read data into our element

        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Synchronize again to make sure that everyone
        // has caught up with us
        barrier();
        memoryBarrierShared();
    }
    // Add the accumulation
    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
    barrier();
    memoryBarrierShared();

    // Finally write our data back to the output buffer
    output_data[id * 2] = shared_data[id * 2];
    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
    // A single invocation publishes the new running total.
    if (id == 0) {
        if (max_accumulation_base >= accumulation_limit + 1) {
            // Every scanned element received the previous total, so the last scanned element
            // already is the new total.
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        // NOTE(review): both indices come straight from push constants and are not clamped to
        // the size of shared_data (gl_WorkGroupSize.x * 2) - confirm the host never exceeds it.
        // NOTE(review): the operands are kept exactly as before (element at
        // max_accumulation_base minus element at accumulation_limit) - confirm this is the
        // intended difference.
        uvec2 value_1 = shared_data[max_accumulation_base];
        uvec2 value_2 = shared_data[accumulation_limit];
        // 64-bit subtraction done as value_1 + (-value_2). The negation must be a real 64-bit
        // two's complement (~value + 1 with the carry propagated into the high word); negating
        // each 32-bit component on its own leaves the high word off by one whenever the low
        // word of value_2 is non-zero.
        uvec2 negated_value_2 = AddUint64(~value_2, uvec2(1, 0));
        accumulated_data = AddUint64(value_1, negated_value_2);
    }
}
|
|
@ -12,6 +12,7 @@
|
|||
#include "common/common_types.h"
|
||||
#include "common/div_ceil.h"
|
||||
#include "video_core/host_shaders/astc_decoder_comp_spv.h"
|
||||
#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
|
||||
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
|
||||
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
|
||||
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
|
||||
|
@ -58,6 +59,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
|
|||
},
|
||||
}};
|
||||
|
||||
/// Descriptor set layout bindings used by QueriesPrefixScanPass: three storage buffers at
/// bindings 0, 1 and 2, one descriptor each, visible to the compute stage only.
/// The binding numbers line up with block1 (input), block2 (output) and block3 (accumulated
/// value) of queries_prefix_scan_sum.comp.
constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
|
||||
{
|
||||
.binding = 0,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.pImmutableSamplers = nullptr,
|
||||
},
|
||||
{
|
||||
.binding = 1,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.pImmutableSamplers = nullptr,
|
||||
},
|
||||
{
|
||||
.binding = 2,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.pImmutableSamplers = nullptr,
|
||||
},
|
||||
}};
|
||||
|
||||
constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
|
||||
.uniform_buffers = 0,
|
||||
.storage_buffers = 2,
|
||||
|
@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
|
|||
.score = 2,
|
||||
};
|
||||
|
||||
/// Descriptor bank description handed to ComputePass by QueriesPrefixScanPass: three storage
/// buffers and no other descriptor kinds.
constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{
|
||||
.uniform_buffers = 0,
|
||||
.storage_buffers = 3,
|
||||
.texture_buffers = 0,
|
||||
.image_buffers = 0,
|
||||
.textures = 0,
|
||||
.images = 0,
|
||||
.score = 3,
|
||||
};
|
||||
|
||||
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
|
||||
{
|
||||
.binding = ASTC_BINDING_INPUT_BUFFER,
|
||||
|
@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
|
|||
.stride = sizeof(DescriptorUpdateEntry),
|
||||
};
|
||||
|
||||
/// Update-template entry for the prefix-scan descriptor set: three storage-buffer descriptors
/// starting at binding 0, read from consecutive DescriptorUpdateEntry records beginning at
/// offset 0.
/// NOTE(review): the layout declares one descriptor per binding, so a count of 3 presumably
/// relies on the update rolling over into bindings 1 and 2 - confirm against the Vulkan
/// consecutive-binding update rules.
constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
|
||||
.dstBinding = 0,
|
||||
.dstArrayElement = 0,
|
||||
.descriptorCount = 3,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||
.offset = 0,
|
||||
.stride = sizeof(DescriptorUpdateEntry),
|
||||
};
|
||||
|
||||
constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
|
||||
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
|
||||
{
|
||||
|
@ -132,6 +176,11 @@ struct AstcPushConstants {
|
|||
u32 block_height;
|
||||
u32 block_height_mask;
|
||||
};
|
||||
|
||||
/// Push constants for queries_prefix_scan_sum.comp; the field order matches the shader's push
/// constant block (max_accumulation_base first, accumulation_limit second).
struct QueriesPrefixScanPushConstants {
|
||||
// Elements with an index below this value get the stored running total added by the shader.
u32 max_accumulation_base;
|
||||
// Index of the element the shader uses when it updates the stored running total.
u32 accumulation_limit;
|
||||
};
|
||||
} // Anonymous namespace
|
||||
|
||||
ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
|
||||
|
@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
|
|||
|
||||
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
|
||||
u32 src_offset, bool compare_to_zero) {
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
|
||||
const size_t compare_size = compare_to_zero ? 8 : 24;
|
||||
|
||||
compute_pass_descriptor_queue.Acquire();
|
||||
|
@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
|
|||
static constexpr VkMemoryBarrier read_barrier{
|
||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||
.pNext = nullptr,
|
||||
.srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
|
||||
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||
};
|
||||
static constexpr VkMemoryBarrier write_barrier{
|
||||
|
@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
|
|||
});
|
||||
}
|
||||
|
||||
/// Sets up the compute pass for the queries prefix-scan shader.
/// Forwards the three-storage-buffer layout, its update template and bank info, a push constant
/// range sized for QueriesPrefixScanPushConstants and the shader SPIR-V to ComputePass, and
/// keeps references to the scheduler and the compute descriptor queue for use by Run().
QueriesPrefixScanPass::QueriesPrefixScanPass(
|
||||
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
|
||||
: ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
|
||||
QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
|
||||
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
|
||||
QUERIES_PREFIX_SCAN_SUM_COMP_SPV),
|
||||
scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
|
||||
|
||||
/// Records a prefix-scan of 64-bit query results on the GPU.
///
/// @param accumulation_buffer    Buffer holding the single u64 running total (shader binding 2).
/// @param dst_buffer             Receives the scanned values (shader binding 1).
/// @param src_buffer             Raw values to scan (shader binding 0).
/// @param number_of_sums         Number of u64 values to scan. Nothing is recorded when zero.
/// @param max_accumulation_limit Passed to the shader as max_accumulation_base; elements with
///                               an index below it get the running total added.
void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
                                VkBuffer src_buffer, size_t number_of_sums,
                                size_t max_accumulation_limit) {
    if (number_of_sums == 0) {
        // Nothing to scan. Bailing out also avoids zero-sized descriptor ranges and the
        // unsigned underflow of number_of_sums - 1 in the push constants below.
        return;
    }
    // Matches local_size_x of queries_prefix_scan_sum.comp.
    static constexpr size_t INVOCATIONS_PER_GROUP = 32;
    const size_t aligned_runs = Common::AlignUp(number_of_sums, INVOCATIONS_PER_GROUP);

    // Queue order defines the binding order: input, output, accumulated total.
    compute_pass_descriptor_queue.Acquire();
    compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64));
    compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64));
    compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
    const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};

    scheduler.RequestOutsideRenderPassOperationContext();
    scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums,
                      aligned_runs](vk::CommandBuffer cmdbuf) {
        // The inputs are produced by transfer writes, while the accumulation buffer is also
        // written by earlier dispatches of this very shader, so both write kinds have to be
        // made visible (same mask as the conditional rendering resolve pass).
        static constexpr VkMemoryBarrier read_barrier{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
        };
        static constexpr VkMemoryBarrier write_barrier{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
                             VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
                             VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
                             VK_ACCESS_UNIFORM_READ_BIT |
                             VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
        };
        const QueriesPrefixScanPushConstants uniforms{
            .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
            .accumulation_limit = static_cast<u32>(number_of_sums - 1),
        };
        const VkDescriptorSet set = descriptor_allocator.Commit();
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);

        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
        // NOTE(review): the shader indexes only by gl_LocalInvocationID, so every work group
        // beyond the first repeats the same elements and also writes the accumulation buffer -
        // confirm whether more than one group is ever intended here.
        cmdbuf.Dispatch(static_cast<u32>(aligned_runs / INVOCATIONS_PER_GROUP), 1, 1);
        // The destination access mask names transfer, vertex input, indirect, uniform and
        // shader reads; those are not valid for the conditional rendering stage alone, so the
        // destination stage has to cover every consumer.
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                               VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, write_barrier);
    });
}
|
||||
|
||||
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
|
||||
DescriptorPool& descriptor_pool_,
|
||||
StagingBufferPool& staging_buffer_pool_,
|
||||
|
|
|
@ -95,6 +95,20 @@ private:
|
|||
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
|
||||
};
|
||||
|
||||
/// Compute pass that runs the queries prefix-scan shader over a buffer of 64-bit values.
class QueriesPrefixScanPass final : public ComputePass {
|
||||
public:
|
||||
explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
|
||||
DescriptorPool& descriptor_pool_,
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
|
||||

||||
/// Records the scan: reads number_of_sums values from src_buffer, writes the scanned values
/// to dst_buffer and uses accumulation_buffer as the running total carried between runs.
/// max_accumulation_limit is forwarded to the shader as max_accumulation_base.
void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
|
||||
size_t number_of_sums, size_t max_accumulation_limit);
|
||||

||||
private:
|
||||
Scheduler& scheduler;
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
|
||||
};
|
||||
|
||||
class ASTCDecoderPass final : public ComputePass {
|
||||
public:
|
||||
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common/bit_util.h"
|
||||
#include "common/common_types.h"
|
||||
#include "core/memory.h"
|
||||
#include "video_core/engines/draw_manager.h"
|
||||
|
@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer {
|
|||
public:
|
||||
explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
|
||||
VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
|
||||
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_)
|
||||
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue,
|
||||
DescriptorPool& descriptor_pool)
|
||||
: BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
|
||||
scheduler{scheduler_}, memory_allocator{memory_allocator_} {
|
||||
BuildResolveBuffer();
|
||||
current_bank = nullptr;
|
||||
current_query = nullptr;
|
||||
ammend_value = 0;
|
||||
acumulation_value = 0;
|
||||
queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>(
|
||||
device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
|
||||
|
||||
const VkBufferCreateInfo buffer_ci = {
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
||||
.pNext = nullptr,
|
||||
.flags = 0,
|
||||
.size = 8,
|
||||
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
|
||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
|
||||
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
||||
.queueFamilyIndexCount = 0,
|
||||
.pQueueFamilyIndices = nullptr,
|
||||
};
|
||||
accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
|
||||
cmdbuf.FillBuffer(buffer, 0, 8, 0);
|
||||
});
|
||||
}
|
||||
|
||||
~SamplesStreamer() = default;
|
||||
|
@ -159,6 +180,8 @@ public:
|
|||
acumulation_value = 0;
|
||||
});
|
||||
rasterizer->SyncOperation(std::move(func));
|
||||
accumulation_since_last_sync = false;
|
||||
last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used);
|
||||
}
|
||||
|
||||
void CloseCounter() override {
|
||||
|
@ -175,7 +198,8 @@ public:
|
|||
}
|
||||
|
||||
for (size_t i = 0; i < sync_values_stash.size(); i++) {
|
||||
runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], *resolve_buffers[i]);
|
||||
runtime.template SyncValues<HostSyncValues>(sync_values_stash[i],
|
||||
*buffers[resolve_buffers[i]]);
|
||||
}
|
||||
|
||||
sync_values_stash.clear();
|
||||
|
@ -189,36 +213,21 @@ public:
|
|||
sync_values_stash.clear();
|
||||
sync_values_stash.emplace_back();
|
||||
std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
|
||||
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
|
||||
sync_values->reserve(num_slots_used);
|
||||
std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
|
||||
size_t this_bank_slot = std::numeric_limits<size_t>::max();
|
||||
size_t resolve_slots_remaining = resolve_slots;
|
||||
size_t resolve_buffer_index = 0;
|
||||
resolve_buffers.clear();
|
||||
size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used);
|
||||
resolve_buffers.push_back(resolve_buffer_index);
|
||||
size_t base_offset = 0;
|
||||
|
||||
ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
|
||||
size_t amount) {
|
||||
size_t bank_id = bank->GetIndex();
|
||||
if (this_bank_slot != bank_id) {
|
||||
this_bank_slot = bank_id;
|
||||
if (resolve_slots_remaining == 0) {
|
||||
resolve_buffer_index++;
|
||||
if (resolve_buffer_index >= resolve_buffers.size()) {
|
||||
BuildResolveBuffer();
|
||||
}
|
||||
resolve_slots_remaining = resolve_slots;
|
||||
sync_values_stash.emplace_back();
|
||||
sync_values = &sync_values_stash.back();
|
||||
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
|
||||
}
|
||||
resolve_slots_remaining--;
|
||||
}
|
||||
auto& resolve_buffer = resolve_buffers[resolve_buffer_index];
|
||||
const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE *
|
||||
(resolve_slots - resolve_slots_remaining - 1);
|
||||
auto& resolve_buffer = buffers[resolve_buffer_index];
|
||||
VkQueryPool query_pool = bank->GetInnerPool();
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([start, amount, base_offset, query_pool,
|
||||
buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
|
||||
size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE;
|
||||
const VkBufferMemoryBarrier copy_query_pool_barrier{
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
|
||||
.pNext = nullptr,
|
||||
|
@ -227,39 +236,60 @@ public:
|
|||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||
.buffer = buffer,
|
||||
.offset = final_offset,
|
||||
.offset = base_offset,
|
||||
.size = amount * SamplesQueryBank::QUERY_SIZE,
|
||||
};
|
||||
|
||||
cmdbuf.CopyQueryPoolResults(
|
||||
query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
|
||||
static_cast<u32>(final_offset), SamplesQueryBank::QUERY_SIZE,
|
||||
static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE,
|
||||
VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
|
||||
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
|
||||
});
|
||||
offsets[bank_id] = {sync_values_stash.size() - 1, base_offset};
|
||||
offsets[bank_id] = {start, base_offset};
|
||||
base_offset += amount * SamplesQueryBank::QUERY_SIZE;
|
||||
});
|
||||
|
||||
// Convert queries
|
||||
bool has_multi_queries = false;
|
||||
for (auto q : pending_sync) {
|
||||
auto* query = GetQuery(q);
|
||||
size_t sync_value_slot = 0;
|
||||
if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
|
||||
continue;
|
||||
}
|
||||
if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
|
||||
continue;
|
||||
}
|
||||
if (query->size_slots > 1) {
|
||||
// This is problematic.
|
||||
// UNIMPLEMENTED();
|
||||
if (accumulation_since_last_sync || query->size_slots > 1) {
|
||||
if (!has_multi_queries) {
|
||||
has_multi_queries = true;
|
||||
sync_values_stash.emplace_back();
|
||||
}
|
||||
sync_value_slot = 1;
|
||||
}
|
||||
query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
|
||||
auto loc_data = offsets[query->start_bank_id];
|
||||
sync_values_stash[loc_data.first].emplace_back(HostSyncValues{
|
||||
sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{
|
||||
.address = query->guest_address,
|
||||
.size = SamplesQueryBank::QUERY_SIZE,
|
||||
.offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE,
|
||||
.offset =
|
||||
loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) *
|
||||
SamplesQueryBank::QUERY_SIZE,
|
||||
});
|
||||
}
|
||||
|
||||
if (has_multi_queries) {
|
||||
size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
|
||||
resolve_buffers.push_back(intermediary_buffer_index);
|
||||
queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
|
||||
*buffers[resolve_buffer_index], num_slots_used,
|
||||
std::min(last_accumulation_checkpoint, num_slots_used));
|
||||
} else {
|
||||
scheduler.RequestOutsideRenderPassOperationContext();
|
||||
scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
|
||||
cmdbuf.FillBuffer(buffer, 0, 8, 0);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -267,6 +297,9 @@ public:
|
|||
std::function<void()> func([this] { ammend_value = acumulation_value; });
|
||||
rasterizer->SyncOperation(std::move(func));
|
||||
AbandonCurrentQuery();
|
||||
num_slots_used = 0;
|
||||
last_accumulation_checkpoint = std::numeric_limits<size_t>::max();
|
||||
accumulation_since_last_sync = has_multi_queries;
|
||||
pending_sync.clear();
|
||||
}
|
||||
|
||||
|
@ -400,6 +433,7 @@ private:
|
|||
void ReserveHostQuery() {
|
||||
size_t new_slot = ReserveBankSlot();
|
||||
current_bank->AddReference(1);
|
||||
num_slots_used++;
|
||||
if (current_query) {
|
||||
size_t bank_id = current_query->start_bank_id;
|
||||
size_t banks_set = current_query->size_banks - 1;
|
||||
|
@ -470,32 +504,50 @@ private:
|
|||
});
|
||||
}
|
||||
|
||||
void BuildResolveBuffer() {
|
||||
template <bool is_resolve>
|
||||
size_t ObtainBuffer(size_t num_needed) {
|
||||
const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed));
|
||||
if constexpr (is_resolve) {
|
||||
if (resolve_table[log_2] != 0) {
|
||||
return resolve_table[log_2] - 1;
|
||||
}
|
||||
} else {
|
||||
if (intermediary_table[log_2] != 0) {
|
||||
return intermediary_table[log_2] - 1;
|
||||
}
|
||||
}
|
||||
const VkBufferCreateInfo buffer_ci = {
|
||||
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
||||
.pNext = nullptr,
|
||||
.flags = 0,
|
||||
.size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots,
|
||||
.size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2),
|
||||
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
|
||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
|
||||
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
||||
.queueFamilyIndexCount = 0,
|
||||
.pQueueFamilyIndices = nullptr,
|
||||
};
|
||||
resolve_buffers.emplace_back(
|
||||
memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
|
||||
buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
|
||||
if constexpr (is_resolve) {
|
||||
resolve_table[log_2] = buffers.size();
|
||||
} else {
|
||||
intermediary_table[log_2] = buffers.size();
|
||||
}
|
||||
return buffers.size() - 1;
|
||||
}
|
||||
|
||||
static constexpr size_t resolve_slots = 8;
|
||||
|
||||
QueryCacheRuntime& runtime;
|
||||
VideoCore::RasterizerInterface* rasterizer;
|
||||
const Device& device;
|
||||
Scheduler& scheduler;
|
||||
const MemoryAllocator& memory_allocator;
|
||||
VideoCommon::BankPool<SamplesQueryBank> bank_pool;
|
||||
std::deque<vk::Buffer> resolve_buffers;
|
||||
std::deque<vk::Buffer> buffers;
|
||||
std::array<size_t, 32> resolve_table{};
|
||||
std::array<size_t, 32> intermediary_table{};
|
||||
vk::Buffer accumulation_buffer;
|
||||
std::deque<std::vector<HostSyncValues>> sync_values_stash;
|
||||
std::vector<size_t> resolve_buffers;
|
||||
|
||||
// syncing queue
|
||||
std::vector<size_t> pending_sync;
|
||||
|
@ -510,10 +562,14 @@ private:
|
|||
SamplesQueryBank* current_bank;
|
||||
VkQueryPool current_query_pool;
|
||||
size_t current_query_id;
|
||||
size_t num_slots_used{};
|
||||
size_t last_accumulation_checkpoint{};
|
||||
bool accumulation_since_last_sync{};
|
||||
VideoCommon::HostQueryBase* current_query;
|
||||
bool has_started{};
|
||||
bool current_unset{};
|
||||
std::mutex flush_guard;
|
||||
|
||||
std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass;
|
||||
};
|
||||
|
||||
// Transform feedback queries
|
||||
|
@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl {
|
|||
memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
|
||||
guest_streamer(0, runtime),
|
||||
sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
|
||||
device, scheduler, memory_allocator),
|
||||
device, scheduler, memory_allocator, compute_pass_descriptor_queue,
|
||||
descriptor_pool),
|
||||
tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
|
||||
scheduler, memory_allocator, staging_pool),
|
||||
primitives_succeeded_streamer(
|
||||
|
@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
|
|||
return true;
|
||||
}
|
||||
}
|
||||
if (!is_in_bc[0] && !is_in_bc[1]) {
|
||||
/*if (!is_in_bc[0] && !is_in_bc[1]) {
|
||||
// Both queries are in query cache, it's best to just flush.
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}*/
|
||||
HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
|
||||
return true;
|
||||
}
|
||||
|
|
Reference in New Issue