From 0f89828073a541eaa2cfd985483f839bd2f97b74 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 9 Feb 2022 15:39:40 +0100 Subject: [PATCH] MacroHLE: Implement DrawIndexedIndirect & DrawArraysIndirect. --- src/video_core/buffer_cache/buffer_cache.h | 160 +++++++++++++++--- src/video_core/dma_pusher.cpp | 3 +- src/video_core/dma_pusher.h | 1 + src/video_core/engines/draw_manager.cpp | 2 +- src/video_core/engines/draw_manager.h | 5 +- src/video_core/engines/maxwell_3d.cpp | 4 + src/video_core/engines/maxwell_3d.h | 12 +- src/video_core/macro/macro_hle.cpp | 45 +++-- src/video_core/rasterizer_interface.h | 2 +- .../renderer_vulkan/vk_buffer_cache.cpp | 4 +- .../renderer_vulkan/vk_rasterizer.cpp | 48 ++++-- .../renderer_vulkan/vk_rasterizer.h | 2 +- .../vulkan_common/vulkan_device.cpp | 5 +- src/video_core/vulkan_common/vulkan_device.h | 1 + .../vulkan_common/vulkan_wrapper.cpp | 6 +- src/video_core/vulkan_common/vulkan_wrapper.h | 24 ++- 16 files changed, 252 insertions(+), 72 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 99abe0edf..557227b37 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -171,7 +171,9 @@ public: bool is_written, bool is_image); [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, - bool synchronize, bool mark_as_written); + bool synchronize = true, + bool mark_as_written = false, + bool discard_downloads = false); void FlushCachedWrites(); @@ -203,6 +205,14 @@ public: /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + current_draw_indirect = current_draw_indirect_; + } + + [[nodiscard]] std::pair GetDrawIndirectCount(); + + [[nodiscard]] std::pair GetDrawIndirectBuffer(); + std::mutex mutex; Runtime& runtime; @@ -275,6 +285,8 @@ private: void BindHostVertexBuffers(); + void BindHostDrawIndirectBuffers(); + void BindHostGraphicsUniformBuffers(size_t stage); void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); @@ -301,6 +313,8 @@ private: void UpdateVertexBuffer(u32 index); + void UpdateDrawIndirect(); + void UpdateUniformBuffers(size_t stage); void UpdateStorageBuffers(size_t stage); @@ -340,6 +354,8 @@ private: bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies); @@ -375,6 +391,8 @@ private: SlotVector slot_buffers; DelayedDestructionRing delayed_destruction_ring; + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; + u32 last_index_count = 0; Binding index_buffer; @@ -383,6 +401,8 @@ private: std::array, NUM_STAGES> storage_buffers; std::array, NUM_STAGES> texture_buffers; std::array transform_feedback_buffers; + Binding count_buffer_binding; + Binding indirect_buffer_binding; std::array compute_uniform_buffers; std::array compute_storage_buffers; @@ -422,6 +442,7 @@ private: std::vector cached_write_buffer_ids; + IntervalSet discarded_ranges; IntervalSet uncommitted_ranges; IntervalSet common_ranges; std::deque committed_ranges; @@ -579,13 +600,17 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am }}; boost::container::small_vector tmp_intervals; + const bool is_high_accuracy = + Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; auto mirror = [&](VAddr base_address, VAddr base_address_end) { const u64 size = base_address_end - base_address; const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - uncommitted_ranges.add(add_interval); tmp_intervals.push_back(add_interval); + if (is_high_accuracy) { + uncommitted_ranges.add(add_interval); + } }; ForEachWrittenRange(*cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. @@ -677,6 +702,9 @@ void BufferCache

::BindHostGeometryBuffers(bool is_indexed) { } BindHostVertexBuffers(); BindHostTransformFeedbackBuffers(); + if (current_draw_indirect) { + BindHostDrawIndirectBuffers(); + } } template @@ -796,7 +824,8 @@ void BufferCache

::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add template std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_addr, u32 size, bool synchronize, - bool mark_as_written) { + bool mark_as_written, + bool discard_downloads) { const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); if (!cpu_addr) { return {&slot_buffers[NULL_BUFFER_ID], 0}; @@ -804,11 +833,17 @@ std::pair BufferCache

::ObtainBuffer(GPUVAddr gpu_ad const BufferId buffer_id = FindBuffer(*cpu_addr, size); Buffer& buffer = slot_buffers[buffer_id]; if (synchronize) { - SynchronizeBuffer(buffer, *cpu_addr, size); + // SynchronizeBuffer(buffer, *cpu_addr, size); + SynchronizeBufferNoModified(buffer, *cpu_addr, size); } if (mark_as_written) { MarkWrittenBuffer(buffer_id, *cpu_addr, size); } + if (discard_downloads) { + IntervalType interval{*cpu_addr, size}; + ClearDownload(interval); + discarded_ranges.subtract(interval); + } return {&buffer, buffer.Offset(*cpu_addr)}; } @@ -827,10 +862,6 @@ bool BufferCache

::HasUncommittedFlushes() const noexcept { template void BufferCache

::AccumulateFlushes() { - if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { - uncommitted_ranges.clear(); - return; - } if (uncommitted_ranges.empty()) { return; } @@ -845,12 +876,15 @@ bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { template void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); + + for (const auto& interval : discarded_ranges) { + common_ranges.subtract(interval); + } + if (committed_ranges.empty()) { return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - const bool is_accuracy_normal = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { @@ -875,9 +909,6 @@ void BufferCache

::CommitAsyncFlushesHigh() { ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { buffer.ForEachDownloadRangeAndClear( cpu_addr, size, [&](u64 range_offset, u64 range_size) { - if (is_accuracy_normal) { - return; - } const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -891,7 +922,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { buffer_id, }); // Align up to avoid cache conflicts - constexpr u64 align = 256ULL; + constexpr u64 align = 8ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); @@ -942,12 +973,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { template void BufferCache

::CommitAsyncFlushes() { - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - CommitAsyncFlushesHigh(); - } else { - uncommitted_ranges.clear(); - committed_ranges.clear(); - } + CommitAsyncFlushesHigh(); } template @@ -1063,6 +1089,19 @@ void BufferCache

::BindHostVertexBuffers() { } } +template +void BufferCache

::BindHostDrawIndirectBuffers() { + const auto bind_buffer = [this](const Binding& binding) { + Buffer& buffer = slot_buffers[binding.buffer_id]; + TouchBuffer(buffer, binding.buffer_id); + SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); + }; + if (current_draw_indirect->include_count) { + bind_buffer(count_buffer_binding); + } + bind_buffer(indirect_buffer_binding); +} + template void BufferCache

::BindHostGraphicsUniformBuffers(size_t stage) { u32 dirty = ~0U; @@ -1294,6 +1333,9 @@ void BufferCache

::DoUpdateGraphicsBuffers(bool is_indexed) { UpdateStorageBuffers(stage); UpdateTextureBuffers(stage); } + if (current_draw_indirect) { + UpdateDrawIndirect(); + } } while (has_deleted_buffers); } @@ -1383,6 +1425,27 @@ void BufferCache

::UpdateVertexBuffer(u32 index) { }; } +template +void BufferCache

::UpdateDrawIndirect() { + const auto update = [this](GPUVAddr gpu_addr, size_t size, Binding& binding) { + const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + binding = NULL_BINDING; + return; + } + binding = Binding{ + .cpu_addr = *cpu_addr, + .size = static_cast(size), + .buffer_id = FindBuffer(*cpu_addr, static_cast(size)), + }; + }; + if (current_draw_indirect->include_count) { + update(current_draw_indirect->count_start_address, sizeof(u32), count_buffer_binding); + } + update(current_draw_indirect->indirect_start_address, current_draw_indirect->buffer_size, + indirect_buffer_binding); +} + template void BufferCache

::UpdateUniformBuffers(size_t stage) { ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) { @@ -1704,6 +1767,51 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s return false; } +template +bool BufferCache

::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + IntervalSet found_sets{}; + auto make_copies = [&] { + for (auto& interval : found_sets) { + const std::size_t sub_size = interval.upper() - interval.lower(); + const VAddr cpu_addr = interval.lower(); + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = cpu_addr - buffer.CpuAddr(), + .size = sub_size, + }); + total_size_bytes += sub_size; + largest_copy = std::max(largest_copy, sub_size); + } + const std::span copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); + }; + buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + const VAddr base_adr = buffer.CpuAddr() + range_offset; + const VAddr end_adr = base_adr + range_size; + const IntervalType add_interval{base_adr, end_adr}; + found_sets.add(add_interval); + }); + if (found_sets.empty()) { + return true; + } + const IntervalType search_interval{cpu_addr, cpu_addr + size}; + auto it = common_ranges.lower_bound(search_interval); + auto it_end = common_ranges.upper_bound(search_interval); + if (it == common_ranges.end()) { + make_copies(); + return false; + } + while (it != it_end) { + found_sets.subtract(*it); + it++; + } + make_copies(); + return false; +} + template void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies) { @@ -1963,4 +2071,16 @@ bool BufferCache

::HasFastUniformBufferBound(size_t stage, u32 binding_index) } } +template +std::pair::Buffer*, u32> BufferCache

::GetDrawIndirectCount() { + auto& buffer = slot_buffers[count_buffer_binding.buffer_id]; + return std::make_pair(&buffer, buffer.Offset(count_buffer_binding.cpu_addr)); +} + +template +std::pair::Buffer*, u32> BufferCache

::GetDrawIndirectBuffer() { + auto& buffer = slot_buffers[indirect_buffer_binding.buffer_id]; + return std::make_pair(&buffer, buffer.Offset(indirect_buffer_binding.cpu_addr)); +} + } // namespace VideoCommon diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index eb1371612..13ff64fa3 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -97,6 +97,7 @@ void DmaPusher::ProcessCommands(std::span commands) { if (dma_state.non_incrementing) { const u32 max_write = static_cast( std::min(index + dma_state.method_count, commands.size()) - index); + dma_state.dma_word_offset = static_cast(index * sizeof(u32)); CallMultiMethod(&command_header.argument, max_write); dma_state.method_count -= max_write; dma_state.is_last_call = true; @@ -175,7 +176,7 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const { dma_state.method_count); } else { auto subchannel = subchannels[dma_state.subchannel]; - subchannel->current_dma_segment = dma_state.dma_get; + subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset; subchannel->CallMultiMethod(dma_state.method, base_start, num_methods, dma_state.method_count); } diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index ca0899ba7..da7728ded 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -157,6 +157,7 @@ private: u32 method_count; ///< Current method count u32 length_pending; ///< Large NI command length pending GPUVAddr dma_get; ///< Currently read segment + u32 dma_word_offset; ///< Current word ofset from address bool non_incrementing; ///< Current command's NI flag bool is_last_call; }; diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp index 4fa77b684..c60f32aad 100644 --- a/src/video_core/engines/draw_manager.cpp +++ b/src/video_core/engines/draw_manager.cpp @@ -216,7 +216,7 @@ void DrawManager::ProcessDrawIndirect(bool draw_indexed) { UpdateTopology(); if (maxwell3d->ShouldExecute()) { - maxwell3d->rasterizer->DrawIndirect(draw_indexed); + maxwell3d->rasterizer->DrawIndirect(); } } } // namespace Tegra::Engines diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h index 0cdb37f83..437990162 100644 --- a/src/video_core/engines/draw_manager.h +++ b/src/video_core/engines/draw_manager.h @@ -33,7 +33,10 @@ public: }; struct IndirectParams { - GPUVAddr start_address; + bool is_indexed; + bool include_count; + GPUVAddr count_start_address; + GPUVAddr indirect_start_address; size_t buffer_size; size_t max_draw_counts; size_t stride; diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 9b182b653..cd6274a9b 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -130,11 +130,15 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool } macro_params.insert(macro_params.end(), base_start, base_start + amount); + for (size_t i = 0; i < amount; i++) { + macro_addresses.push_back(current_dma_segment + i * sizeof(u32)); + } // Call the macro when there are no more parameters in the command buffer if (is_last_call) { CallMacroMethod(executing_macro, macro_params); macro_params.clear(); + macro_addresses.clear(); } } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 22b904319..ac5e87563 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -3066,6 +3066,15 @@ public: std::unique_ptr draw_manager; friend class DrawManager; + + std::vector inline_index_draw_indexes; + std::vector macro_addresses; + + Core::System& system; + MemoryManager& memory_manager; + + /// Handles a write to the CLEAR_BUFFERS register. + void ProcessClearBuffers(u32 layer_count); private: void InitializeRegisterDefaults(); @@ -3126,9 +3135,6 @@ private: /// Returns a query's value or an empty object if the value will be deferred through a cache. std::optional GetQueryResult(); - Core::System& system; - MemoryManager& memory_manager; - VideoCore::RasterizerInterface* rasterizer = nullptr; /// Start offsets of each macro in macro_memory diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index 1cc202cc7..da988cc0d 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -9,6 +9,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/macro/macro.h" #include "video_core/macro/macro_hle.h" +#include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" namespace Tegra { @@ -24,15 +25,14 @@ void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector& parameters[4], parameters[1], parameters[3], parameters[5], instance_count); } -void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { +void HLE_DrawArraysIndirect(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); maxwell3d.draw_manager->DrawArray( static_cast(parameters[0]), parameters[3], parameters[1], parameters[4], instance_count); } -void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { - const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]); +void HLE_DrawIndexedIndirect(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { const u32 element_base = parameters[4]; const u32 base_instance = parameters[5]; maxwell3d.regs.vertex_id_base = element_base; @@ -41,9 +41,18 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& maxwell3d.CallMethod(0x8e4, element_base, true); maxwell3d.CallMethod(0x8e5, base_instance, true); - maxwell3d.draw_manager->DrawIndex( - static_cast(parameters[0]), - parameters[3], parameters[1], element_base, base_instance, instance_count); + auto& params = maxwell3d.draw_manager->GetIndirectParams(); + params.is_indexed = true; + params.include_count = false; + params.count_start_address = 0; + params.indirect_start_address = maxwell3d.macro_addresses[1]; + params.buffer_size = 5 * sizeof(u32); + params.max_draw_counts = 1; + params.stride = 0; + + maxwell3d.draw_manager->DrawIndexedIndirect( + static_cast(parameters[0]), 0, + 1U << 18); maxwell3d.regs.vertex_id_base = 0x0; maxwell3d.CallMethod(0x8e3, 0x640, true); @@ -51,8 +60,9 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector& maxwell3d.CallMethod(0x8e5, 0x0, true); } -// Multidraw Indirect -void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { +// Multidraw Indixed Indirect +void HLE_MultiDrawIndexedIndirect(Engines::Maxwell3D& maxwell3d, + const std::vector& parameters) { const u32 start_indirect = parameters[0]; const u32 end_indirect = parameters[1]; if (start_indirect >= end_indirect) { @@ -66,7 +76,6 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector& // size of each indirect segment const u32 indirect_words = 5 + padding; const u32 stride = indirect_words * sizeof(u32); - const GPUVAddr start_address = maxwell3d.current_dma_segment + 4 * sizeof(u32); const std::size_t draw_count = end_indirect - start_indirect; u32 lowest_first = std::numeric_limits::max(); u32 highest_limit = std::numeric_limits::min(); @@ -80,12 +89,16 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector& const u32 base_vertex = parameters[8]; const u32 base_instance = parameters[9]; + maxwell3d.regs.vertex_id_base = base_vertex; maxwell3d.CallMethod(0x8e3, 0x640, true); maxwell3d.CallMethod(0x8e4, base_vertex, true); maxwell3d.CallMethod(0x8e5, base_instance, true); auto& params = maxwell3d.draw_manager->GetIndirectParams(); - params.start_address = start_address; - params.buffer_size = sizeof(u32) + stride * draw_count; + params.is_indexed = true; + params.include_count = true; + params.count_start_address = maxwell3d.macro_addresses[4]; + params.indirect_start_address = maxwell3d.macro_addresses[5]; + params.buffer_size = stride * draw_count; params.max_draw_counts = draw_count; params.stride = stride; maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; @@ -93,7 +106,7 @@ void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector& } // Multi-layer Clear -void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { +void HLE_MultiLayerClear(Engines::Maxwell3D& maxwell3d, const std::vector& parameters) { ASSERT(parameters.size() == 1); const Engines::Maxwell3D::Regs::ClearSurface clear_params{parameters[0]}; @@ -107,10 +120,10 @@ void HLE_EAD26C3E2109B06B(Engines::Maxwell3D& maxwell3d, const std::vector& constexpr std::array, 5> hle_funcs{{ {0x771BB18C62444DA0, &HLE_771BB18C62444DA0}, - {0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD}, - {0x0217920100488FF7, &HLE_0217920100488FF7}, - {0x3F5E74B9C9A50164, &HLE_3F5E74B9C9A50164}, - {0xEAD26C3E2109B06B, &HLE_EAD26C3E2109B06B}, + {0x0D61FC9FAAC9FCAD, &HLE_DrawArraysIndirect}, + {0x0217920100488FF7, &HLE_DrawIndexedIndirect}, + {0x3F5E74B9C9A50164, &HLE_MultiDrawIndexedIndirect}, + {0xEAD26C3E2109B06B, &HLE_MultiLayerClear}, }}; class HLEMacroImpl final : public CachedMacro { diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index a2a651f34..641b95c7c 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -43,7 +43,7 @@ public: virtual void Draw(bool is_indexed, u32 instance_count) = 0; /// Dispatches an indirect draw invocation - virtual void DrawIndirect(bool is_indexed) {} + virtual void DrawIndirect() {} /// Clear the current framebuffer virtual void Clear(u32 layer_count) = 0; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 6b54d7111..487d8b416 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -56,7 +56,8 @@ vk::Buffer CreateBuffer(const Device& device, u64 size) { VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | + VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; if (device.IsExtTransformFeedbackSupported()) { flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } @@ -516,6 +517,7 @@ void BufferCacheRuntime::ReserveNullBuffer() { if (device.IsExtTransformFeedbackSupported()) { create_info.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; } + create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; null_buffer = device.GetLogical().CreateBuffer(create_info); if (device.HasDebuggingToolAttached()) { null_buffer.SetObjectNameEXT("Null buffer"); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 9b75f33dd..6f1adc97f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -225,25 +225,40 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { }); } -void RasterizerVulkan::DrawIndirect(bool is_indexed) { - PrepareDraw(is_indexed, [this, is_indexed] { - const auto params = maxwell3d->draw_manager->GetIndirectParams(); - const auto [buffer, offset] = buffer_cache.ObtainBuffer( - params.start_address, static_cast(params.buffer_size), true, false); - scheduler.Record([buffer_obj = buffer->Handle(), offset, - max_draw_counts = params.max_draw_counts, stride = params.stride, - is_indexed](vk::CommandBuffer cmdbuf) { - if (is_indexed) { - cmdbuf.DrawIndexedIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset, - static_cast(max_draw_counts), - static_cast(stride)); +void RasterizerVulkan::DrawIndirect() { + const auto& params = maxwell3d->draw_manager->GetIndirectParams(); + buffer_cache.SetDrawIndirect(¶ms); + PrepareDraw(params.is_indexed, [this, ¶ms] { + const auto [buffer, offset] = buffer_cache.GetDrawIndirectBuffer(); + if (params.include_count) { + const auto [draw_buffer, offset_base] = buffer_cache.GetDrawIndirectCount(); + scheduler.Record([draw_buffer_obj = draw_buffer->Handle(), + buffer_obj = buffer->Handle(), offset_base, offset, + params](vk::CommandBuffer cmdbuf) { + if (params.is_indexed) { + cmdbuf.DrawIndexedIndirectCount( + buffer_obj, offset, draw_buffer_obj, offset_base, + static_cast(params.max_draw_counts), static_cast(params.stride)); + } else { + cmdbuf.DrawIndirectCount(buffer_obj, offset, draw_buffer_obj, offset_base, + static_cast(params.max_draw_counts), + static_cast(params.stride)); + } + }); + return; + } + scheduler.Record([buffer_obj = buffer->Handle(), offset, params](vk::CommandBuffer cmdbuf) { + if (params.is_indexed) { + cmdbuf.DrawIndexedIndirect(buffer_obj, offset, + static_cast(params.max_draw_counts), + static_cast(params.stride)); } else { - cmdbuf.DrawIndirectCount(buffer_obj, offset + 4ULL, buffer_obj, offset, - static_cast(max_draw_counts), - static_cast(stride)); + cmdbuf.DrawIndirect(buffer_obj, offset, static_cast(params.max_draw_counts), + static_cast(params.stride)); } }); }); + buffer_cache.SetDrawIndirect(nullptr); } void RasterizerVulkan::Clear(u32 layer_count) { @@ -425,9 +440,6 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex}; - if (!Settings::IsGPULevelHigh()) { - return buffer_cache.IsRegionGpuModified(addr, size); - } return texture_cache.IsRegionGpuModified(addr, size) || buffer_cache.IsRegionGpuModified(addr, size); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index bc43a8a1f..43a210da0 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -65,7 +65,7 @@ public: ~RasterizerVulkan() override; void Draw(bool is_indexed, u32 instance_count) override; - void DrawIndirect(bool is_indexed) override; + void DrawIndirect() override; void Clear(u32 layer_count) override; void DispatchCompute() override; void ResetCounter(VideoCore::QueryType type) override; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 477fc428b..207fae8c9 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -351,7 +351,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR .dualSrcBlend = true, .logicOp = true, .multiDrawIndirect = true, - .drawIndirectFirstInstance = false, + .drawIndirectFirstInstance = true, .depthClamp = true, .depthBiasClamp = true, .fillModeNonSolid = true, @@ -1024,6 +1024,8 @@ void Device::CheckSuitability(bool requires_swapchain) const { std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"), std::make_pair(features.imageCubeArray, "imageCubeArray"), std::make_pair(features.independentBlend, "independentBlend"), + std::make_pair(features.multiDrawIndirect, "multiDrawIndirect"), + std::make_pair(features.drawIndirectFirstInstance, "drawIndirectFirstInstance"), std::make_pair(features.depthClamp, "depthClamp"), std::make_pair(features.samplerAnisotropy, "samplerAnisotropy"), std::make_pair(features.largePoints, "largePoints"), @@ -1117,6 +1119,7 @@ std::vector Device::LoadExtensions(bool requires_surface) { test(khr_spirv_1_4, VK_KHR_SPIRV_1_4_EXTENSION_NAME, true); test(khr_push_descriptor, VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, true); test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + test(khr_draw_indirect_count, VK_KHR_DRAW_INDIRECT_COUNT_EXTENSION_NAME, true); test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); test(has_ext_primitive_topology_list_restart, diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 6a26c4e6e..d0d7c2299 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -451,6 +451,7 @@ private: bool nv_viewport_swizzle{}; ///< Support for VK_NV_viewport_swizzle. bool nv_viewport_array2{}; ///< Support for VK_NV_viewport_array2. bool nv_geometry_shader_passthrough{}; ///< Support for VK_NV_geometry_shader_passthrough. + bool khr_draw_indirect_count{}; ///< Support for VK_KHR_draw_indirect_count. bool khr_uniform_buffer_standard_layout{}; ///< Support for scalar uniform buffer layouts. bool khr_spirv_1_4{}; ///< Support for VK_KHR_spirv_1_4. bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts. diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index c58c4c1c4..f8f8ed9f8 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -94,8 +94,10 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdDispatch); X(vkCmdDraw); X(vkCmdDrawIndexed); - X(vkCmdDrawIndirectCount); - X(vkCmdDrawIndexedIndirectCount); + X(vkCmdDrawIndirect); + X(vkCmdDrawIndexedIndirect); + X(vkCmdDrawIndirectCountKHR); + X(vkCmdDrawIndexedIndirectCountKHR); X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 9bd158dce..493a48573 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -213,8 +213,10 @@ struct DeviceDispatch : InstanceDispatch { PFN_vkCmdDispatch vkCmdDispatch{}; PFN_vkCmdDraw vkCmdDraw{}; PFN_vkCmdDrawIndexed vkCmdDrawIndexed{}; - PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; - PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; + PFN_vkCmdDrawIndirect vkCmdDrawIndirect{}; + PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; + PFN_vkCmdDrawIndirectCountKHR vkCmdDrawIndirectCountKHR{}; + PFN_vkCmdDrawIndexedIndirectCountKHR vkCmdDrawIndexedIndirectCountKHR{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndQuery vkCmdEndQuery{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; @@ -1021,17 +1023,27 @@ public: first_instance); } + void DrawIndirect(VkBuffer src_buffer, VkDeviceSize src_offset, u32 draw_count, + u32 stride) const noexcept { + dld->vkCmdDrawIndirect(handle, src_buffer, src_offset, draw_count, stride); + } + + void DrawIndexedIndirect(VkBuffer src_buffer, VkDeviceSize src_offset, u32 draw_count, + u32 stride) const noexcept { + dld->vkCmdDrawIndexedIndirect(handle, src_buffer, src_offset, draw_count, stride); + } + void DrawIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, VkBuffer count_buffer, VkDeviceSize count_offset, u32 draw_count, u32 stride) const noexcept { - dld->vkCmdDrawIndirectCount(handle, src_buffer, src_offset, count_buffer, count_offset, - draw_count, stride); + dld->vkCmdDrawIndirectCountKHR(handle, src_buffer, src_offset, count_buffer, count_offset, + draw_count, stride); } void DrawIndexedIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, VkBuffer count_buffer, VkDeviceSize count_offset, u32 draw_count, u32 stride) const noexcept { - dld->vkCmdDrawIndexedIndirectCount(handle, src_buffer, src_offset, count_buffer, - count_offset, draw_count, stride); + dld->vkCmdDrawIndexedIndirectCountKHR(handle, src_buffer, src_offset, count_buffer, + count_offset, draw_count, stride); } void ClearAttachments(Span attachments,