yuzu-emu
/
yuzu-android
Archived
1
0
Fork 0

vk_staging_buffer_pool: Add stream buffer for small uploads

This uses a ring buffer similar to OpenGL's stream buffer for small
uploads. This stops us from allocating several small buffers, which reduces
memory fragmentation and improves cache locality.

It uses dedicated allocations when possible.
This commit is contained in:
ReinUsesLisp 2021-01-16 16:20:18 -03:00
parent 8fd518ec40
commit 35df1d1864
15 changed files with 298 additions and 127 deletions

View File

@ -239,8 +239,7 @@ private:
void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
std::span<const BufferCopy> copies); std::span<const BufferCopy> copies);
void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies);
std::span<const BufferCopy> copies);
void DeleteBuffer(BufferId buffer_id); void DeleteBuffer(BufferId buffer_id);
@ -362,11 +361,17 @@ void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
const u8* const mapped_memory = download_staging.mapped_span.data(); const u8* const mapped_memory = download_staging.mapped_span.data();
const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size()); const std::span<BufferCopy> copies_span(copies.data(), copies.data() + copies.size());
for (BufferCopy& copy : copies) {
// Modify copies to have the staging offset in mind
copy.dst_offset += download_staging.offset;
}
runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); runtime.CopyBuffer(download_staging.buffer, buffer, copies_span);
runtime.Finish(); runtime.Finish();
for (const BufferCopy& copy : copies) { for (const BufferCopy& copy : copies) {
const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset;
const u8* copy_mapped_memory = mapped_memory + copy.dst_offset; // Undo the modified offset
const u64 dst_offset = copy.dst_offset - download_staging.offset;
const u8* copy_mapped_memory = mapped_memory + dst_offset;
cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size);
} }
} else { } else {
@ -554,7 +559,9 @@ void BufferCache<P>::PopAsyncFlushes() {
} }
if constexpr (USE_MEMORY_MAPS) { if constexpr (USE_MEMORY_MAPS) {
auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes);
for (const auto [copy, buffer_id] : downloads) { for (auto& [copy, buffer_id] : downloads) {
// Have in mind the staging buffer offset for the copy
copy.dst_offset += download_staging.offset;
const std::array copies{copy}; const std::array copies{copy};
runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies);
} }
@ -562,7 +569,9 @@ void BufferCache<P>::PopAsyncFlushes() {
for (const auto [copy, buffer_id] : downloads) { for (const auto [copy, buffer_id] : downloads) {
const Buffer& buffer = slot_buffers[buffer_id]; const Buffer& buffer = slot_buffers[buffer_id];
const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset;
const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset; // Undo the modified offset
const u64 dst_offset = copy.dst_offset - download_staging.offset;
const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset;
cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size);
} }
} else { } else {
@ -1117,13 +1126,16 @@ void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy,
template <class P> template <class P>
void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
std::span<const BufferCopy> copies) { std::span<BufferCopy> copies) {
auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes);
const std::span<u8> staging_pointer = upload_staging.mapped_span; const std::span<u8> staging_pointer = upload_staging.mapped_span;
for (const BufferCopy& copy : copies) { for (BufferCopy& copy : copies) {
const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
u8* const src_pointer = staging_pointer.data() + copy.src_offset; u8* const src_pointer = staging_pointer.data() + copy.src_offset;
const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset;
cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size);
// Apply the staging offset
copy.src_offset += upload_staging.offset;
} }
runtime.CopyBuffer(buffer, upload_staging.buffer, copies); runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
} }

View File

@ -550,15 +550,14 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
} }
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map,
size_t buffer_offset,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
switch (image.info.type) { switch (image.info.type) {
case ImageType::e2D: case ImageType::e2D:
return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles); return util_shaders.BlockLinearUpload2D(image, map, swizzles);
case ImageType::e3D: case ImageType::e3D:
return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles); return util_shaders.BlockLinearUpload3D(image, map, swizzles);
case ImageType::Linear: case ImageType::Linear:
return util_shaders.PitchUpload(image, map, buffer_offset, swizzles); return util_shaders.PitchUpload(image, map, swizzles);
default: default:
UNREACHABLE(); UNREACHABLE();
break; break;
@ -710,10 +709,10 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
} }
} }
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies) { std::span<const VideoCommon::BufferImageCopy> copies) {
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes);
glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
@ -729,19 +728,19 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
current_image_height = copy.buffer_image_height; current_image_height = copy.buffer_image_height;
glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height);
} }
CopyBufferToImage(copy, buffer_offset); CopyBufferToImage(copy, map.offset);
} }
} }
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferCopy> copies) { std::span<const VideoCommon::BufferCopy> copies) {
for (const VideoCommon::BufferCopy& copy : copies) { for (const VideoCommon::BufferCopy& copy : copies) {
glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset, glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + map.offset,
copy.dst_offset, copy.size); copy.dst_offset, copy.size);
} }
} }
void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, void Image::DownloadMemory(ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies) { std::span<const VideoCommon::BufferImageCopy> copies) {
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
@ -760,7 +759,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
current_image_height = copy.buffer_image_height; current_image_height = copy.buffer_image_height;
glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height);
} }
CopyImageToBuffer(copy, buffer_offset); CopyImageToBuffer(copy, map.offset);
} }
} }

View File

@ -35,6 +35,7 @@ struct ImageBufferMap {
~ImageBufferMap(); ~ImageBufferMap();
std::span<u8> mapped_span; std::span<u8> mapped_span;
size_t offset = 0;
OGLSync* sync; OGLSync* sync;
GLuint buffer; GLuint buffer;
}; };
@ -78,7 +79,7 @@ public:
Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation); Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, void AccelerateImageUpload(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void InsertUploadMemoryBarrier(); void InsertUploadMemoryBarrier();
@ -137,14 +138,12 @@ public:
explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
VAddr cpu_addr); VAddr cpu_addr);
void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies); std::span<const VideoCommon::BufferImageCopy> copies);
void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferCopy> copies);
std::span<const VideoCommon::BufferCopy> copies);
void DownloadMemory(ImageBufferMap& map, size_t buffer_offset, void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies);
std::span<const VideoCommon::BufferImageCopy> copies);
GLuint Handle() const noexcept { GLuint Handle() const noexcept {
return texture.handle; return texture.handle;

View File

@ -63,7 +63,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
UtilShaders::~UtilShaders() = default; UtilShaders::~UtilShaders() = default;
void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
@ -71,13 +71,13 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
const Extent3D num_tiles = swizzle.num_tiles; const Extent3D num_tiles = swizzle.num_tiles;
const size_t input_offset = swizzle.buffer_offset + buffer_offset; const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
@ -100,7 +100,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
program_manager.RestoreGuestCompute(); program_manager.RestoreGuestCompute();
} }
void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8};
@ -108,14 +108,14 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_INPUT_BUFFER = 1;
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
const Extent3D num_tiles = swizzle.num_tiles; const Extent3D num_tiles = swizzle.num_tiles;
const size_t input_offset = swizzle.buffer_offset + buffer_offset; const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
@ -141,7 +141,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
program_manager.RestoreGuestCompute(); program_manager.RestoreGuestCompute();
} }
void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
static constexpr GLuint BINDING_INPUT_BUFFER = 0; static constexpr GLuint BINDING_INPUT_BUFFER = 0;
@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
"Non-power of two images are not implemented"); "Non-power of two images are not implemented");
program_manager.BindHostCompute(pitch_unswizzle_program.handle); program_manager.BindHostCompute(pitch_unswizzle_program.handle);
glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
glUniform2ui(LOC_ORIGIN, 0, 0); glUniform2ui(LOC_ORIGIN, 0, 0);
glUniform2i(LOC_DESTINATION, 0, 0); glUniform2i(LOC_DESTINATION, 0, 0);
glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
@ -167,7 +167,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format); glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format);
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
const Extent3D num_tiles = swizzle.num_tiles; const Extent3D num_tiles = swizzle.num_tiles;
const size_t input_offset = swizzle.buffer_offset + buffer_offset; const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);

View File

@ -24,13 +24,13 @@ public:
explicit UtilShaders(ProgramManager& program_manager); explicit UtilShaders(ProgramManager& program_manager);
~UtilShaders(); ~UtilShaders();
void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void BlockLinearUpload3D(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, void PitchUpload(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void CopyBC4(Image& dst_image, Image& src_image, void CopyBC4(Image& dst_image, Image& src_image,

View File

@ -138,17 +138,18 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
u32 base_vertex, u32 num_indices, VkBuffer buffer, u32 base_vertex, u32 num_indices, VkBuffer buffer,
u32 offset, [[maybe_unused]] u32 size) { u32 offset, [[maybe_unused]] u32 size) {
VkIndexType index_type = MaxwellToVK::IndexFormat(index_format); VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format);
VkDeviceSize vk_offset = offset;
if (topology == PrimitiveTopology::Quads) { if (topology == PrimitiveTopology::Quads) {
index_type = VK_INDEX_TYPE_UINT32; vk_index_type = VK_INDEX_TYPE_UINT32;
std::tie(buffer, offset) = std::tie(buffer, vk_offset) =
quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset); quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
} else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) { } else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
index_type = VK_INDEX_TYPE_UINT16; vk_index_type = VK_INDEX_TYPE_UINT16;
std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset); std::tie(buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset);
} }
scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) { scheduler.Record([buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) {
cmdbuf.BindIndexBuffer(buffer, offset, index_type); cmdbuf.BindIndexBuffer(buffer, vk_offset, vk_index_type);
}); });
} }
@ -251,10 +252,10 @@ void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle
} }
} }
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut, scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset,
size_bytes](vk::CommandBuffer cmdbuf) { dst_buffer = *quad_array_lut, size_bytes](vk::CommandBuffer cmdbuf) {
const VkBufferCopy copy{ const VkBufferCopy copy{
.srcOffset = 0, .srcOffset = src_offset,
.dstOffset = 0, .dstOffset = 0,
.size = size_bytes, .size = size_bytes,
}; };

View File

@ -10,6 +10,7 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "common/div_ceil.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_compute_pass.h"
@ -148,38 +149,33 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
Uint8Pass::~Uint8Pass() = default; Uint8Pass::~Uint8Pass() = default;
std::pair<VkBuffer, u32> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, std::pair<VkBuffer, VkDeviceSize> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
u32 src_offset) { u32 src_offset) {
const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size); update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
num_vertices](vk::CommandBuffer cmdbuf) { num_vertices](vk::CommandBuffer cmdbuf) {
constexpr u32 dispatch_size = 1024; static constexpr u32 DISPATCH_SIZE = 1024;
static constexpr VkMemoryBarrier WRITE_BARRIER{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
};
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
cmdbuf.Dispatch(Common::AlignUp(num_vertices, dispatch_size) / dispatch_size, 1, 1); cmdbuf.Dispatch(Common::DivCeil(num_vertices, DISPATCH_SIZE), 1, 1);
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer = buffer;
barrier.offset = 0;
barrier.size = static_cast<VkDeviceSize>(num_vertices * sizeof(u16));
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
}); });
return {staging.buffer, 0}; return {staging.buffer, staging.offset};
} }
QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@ -194,7 +190,7 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
QuadIndexedPass::~QuadIndexedPass() = default; QuadIndexedPass::~QuadIndexedPass() = default;
std::pair<VkBuffer, u32> QuadIndexedPass::Assemble( std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
VkBuffer src_buffer, u32 src_offset) { VkBuffer src_buffer, u32 src_offset) {
const u32 index_shift = [index_format] { const u32 index_shift = [index_format] {
@ -217,34 +213,29 @@ std::pair<VkBuffer, u32> QuadIndexedPass::Assemble(
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size); update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
static constexpr u32 dispatch_size = 1024; static constexpr u32 DISPATCH_SIZE = 1024;
static constexpr VkMemoryBarrier WRITE_BARRIER{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
};
const std::array push_constants = {base_vertex, index_shift}; const std::array push_constants = {base_vertex, index_shift};
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
&push_constants); &push_constants);
cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1); cmdbuf.Dispatch(Common::DivCeil(num_tri_vertices, DISPATCH_SIZE), 1, 1);
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer = buffer;
barrier.offset = 0;
barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32));
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
}); });
return {staging.buffer, 0}; return {staging.buffer, staging.offset};
} }
} // namespace Vulkan } // namespace Vulkan

View File

@ -50,7 +50,8 @@ public:
/// Assemble uint8 indices into an uint16 index buffer /// Assemble uint8 indices into an uint16 index buffer
/// Returns a pair with the staging buffer, and the offset where the assembled data is /// Returns a pair with the staging buffer, and the offset where the assembled data is
std::pair<VkBuffer, u32> Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset); std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, VkBuffer src_buffer,
u32 src_offset);
private: private:
VKScheduler& scheduler; VKScheduler& scheduler;
@ -66,9 +67,9 @@ public:
VKUpdateDescriptorQueue& update_descriptor_queue_); VKUpdateDescriptorQueue& update_descriptor_queue_);
~QuadIndexedPass(); ~QuadIndexedPass();
std::pair<VkBuffer, u32> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, std::pair<VkBuffer, VkDeviceSize> Assemble(
u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices,
u32 src_offset); u32 base_vertex, VkBuffer src_buffer, u32 src_offset);
private: private:
VKScheduler& scheduler; VKScheduler& scheduler;

View File

@ -8,6 +8,7 @@
#include <fmt/format.h> #include <fmt/format.h>
#include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/bit_util.h" #include "common/bit_util.h"
#include "common/common_types.h" #include "common/common_types.h"
@ -17,14 +18,117 @@
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
namespace {
// Maximum potential alignment of a Vulkan buffer
constexpr VkDeviceSize MAX_ALIGNMENT = 256;
// Maximum size to put elements in the stream buffer
constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024;
// Stream buffer size in bytes
constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS;
constexpr VkMemoryPropertyFlags HOST_FLAGS =
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS;
bool IsStreamHeap(VkMemoryHeap heap) noexcept {
return STREAM_BUFFER_SIZE < (heap.size * 2) / 3;
}
std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
VkMemoryPropertyFlags flags) noexcept {
for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) {
if (((type_mask >> type_index) & 1) == 0) {
// Memory type is incompatible
continue;
}
const VkMemoryType& memory_type = props.memoryTypes[type_index];
if ((memory_type.propertyFlags & flags) != flags) {
// Memory type doesn't have the flags we want
continue;
}
if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) {
// Memory heap is not suitable for streaming
continue;
}
// Success!
return type_index;
}
return std::nullopt;
}
u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask) {
// Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this
std::optional<u32> type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS);
if (type) {
return *type;
}
// Otherwise try without the DEVICE_LOCAL_BIT
type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS);
if (type) {
return *type;
}
// This should never happen, and in case it does, signal it as an out of memory situation
throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
size_t Region(size_t iterator) noexcept {
return iterator / REGION_SIZE;
}
} // Anonymous namespace
// Constructs the staging buffer pool: besides storing the device, allocator and scheduler
// references, the new version of this constructor creates the stream buffer, allocates and
// binds its memory, and maps it.
// NOTE(review): the three signature lines below carry both the old and the new side of the
// diff on the same line; they are kept verbatim.
StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
VKScheduler& scheduler_) VKScheduler& scheduler_)
: device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {} : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {
const vk::Device& dev = device.GetLogical();
// Create a STREAM_BUFFER_SIZE byte buffer usable as a transfer source and as an index buffer
stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = STREAM_BUFFER_SIZE,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
// Object names are only set when a debugging tool is attached
if (device.HasDebuggingToolAttached()) {
stream_buffer.SetObjectNameEXT("Stream Buffer");
}
// Both fields start as VK_FALSE; presumably GetBufferMemoryRequirements fills them in
// through the pointer passed below - TODO confirm against the wrapper
VkMemoryDedicatedRequirements dedicated_reqs{
.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
.pNext = nullptr,
.prefersDedicatedAllocation = VK_FALSE,
.requiresDedicatedAllocation = VK_FALSE,
};
const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs);
// Use a dedicated allocation when it is either preferred or required
const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE ||
dedicated_reqs.requiresDedicatedAllocation == VK_TRUE;
// Only chained into the allocation below when make_dedicated is true
const VkMemoryDedicatedAllocateInfo dedicated_info{
.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
.pNext = nullptr,
.image = nullptr,
.buffer = *stream_buffer,
};
const auto memory_properties = device.GetPhysical().GetMemoryProperties();
// Allocate the size reported by the requirements query, from the memory type selected by
// FindMemoryTypeIndex among the types the buffer is compatible with
stream_memory = dev.AllocateMemory(VkMemoryAllocateInfo{
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = make_dedicated ? &dedicated_info : nullptr,
.allocationSize = requirements.size,
.memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits),
});
if (device.HasDebuggingToolAttached()) {
stream_memory.SetObjectNameEXT("Stream Buffer Memory");
}
// Bind the memory at offset 0 and map the first STREAM_BUFFER_SIZE bytes; the mapping is
// stored in stream_pointer
stream_buffer.BindMemory(*stream_memory, 0);
stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE);
}
StagingBufferPool::~StagingBufferPool() = default; StagingBufferPool::~StagingBufferPool() = default;
StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) { StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) {
if (usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) {
return GetStreamBuffer(size);
}
if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) { if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) {
return *ref; return *ref;
} }
@ -39,6 +143,42 @@ void StagingBufferPool::TickFrame() {
ReleaseCache(MemoryUsage::Download); ReleaseCache(MemoryUsage::Download);
} }
/// Reserves size bytes from the stream ring buffer.
/// Regions consumed since the previous request are stamped with the current scheduler tick, and
/// the ticks stored for the regions about to be reused are waited on before handing them out.
/// @param size Number of bytes requested
/// @return Reference to the stream buffer, the offset of the reservation and its mapped span
StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
    // Stamp the regions that were left behind by the iterator since the last request
    const size_t consumed_end = Region(iterator);
    for (size_t index = Region(used_iterator); index < consumed_end; ++index) {
        sync_ticks[index] = scheduler.CurrentTick();
    }
    used_iterator = iterator;

    // Wait on the regions this request reaches beyond the one holding free_iterator
    const size_t request_end = iterator + size;
    const size_t wait_end = std::min(Region(request_end) + 1, NUM_SYNCS);
    for (size_t index = Region(free_iterator) + 1; index < wait_end; ++index) {
        scheduler.Wait(sync_ticks[index]);
    }
    free_iterator = std::max(free_iterator, request_end);

    if (request_end > STREAM_BUFFER_SIZE) {
        // The request doesn't fit in the tail: stamp the remaining regions and wrap around
        for (size_t index = Region(used_iterator); index < NUM_SYNCS; ++index) {
            sync_ticks[index] = scheduler.CurrentTick();
        }
        used_iterator = 0;
        iterator = 0;
        free_iterator = size;

        // Wait on every region touched by the request at the start of the buffer
        const size_t last_region = Region(size);
        for (size_t index = 0; index <= last_region; ++index) {
            scheduler.Wait(sync_ticks[index]);
        }
    }

    const size_t offset = iterator;
    // Keep the next reservation aligned to MAX_ALIGNMENT
    iterator = Common::AlignUp(offset + size, MAX_ALIGNMENT);
    return StagingBufferRef{
        .buffer = *stream_buffer,
        .offset = static_cast<VkDeviceSize>(offset),
        .mapped_span = std::span<u8>(stream_pointer + offset, size),
    };
}
std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size, std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size,
MemoryUsage usage) { MemoryUsage usage) {
StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)]; StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)];

View File

@ -19,11 +19,14 @@ class VKScheduler;
struct StagingBufferRef { struct StagingBufferRef {
VkBuffer buffer; VkBuffer buffer;
VkDeviceSize offset;
std::span<u8> mapped_span; std::span<u8> mapped_span;
}; };
class StagingBufferPool { class StagingBufferPool {
public: public:
static constexpr size_t NUM_SYNCS = 16;
explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator, explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator,
VKScheduler& scheduler); VKScheduler& scheduler);
~StagingBufferPool(); ~StagingBufferPool();
@ -33,6 +36,11 @@ public:
void TickFrame(); void TickFrame();
private: private:
struct StreamBufferCommit {
size_t upper_bound;
u64 tick;
};
struct StagingBuffer { struct StagingBuffer {
vk::Buffer buffer; vk::Buffer buffer;
MemoryCommit commit; MemoryCommit commit;
@ -42,6 +50,7 @@ private:
StagingBufferRef Ref() const noexcept { StagingBufferRef Ref() const noexcept {
return { return {
.buffer = *buffer, .buffer = *buffer,
.offset = 0,
.mapped_span = mapped_span, .mapped_span = mapped_span,
}; };
} }
@ -56,6 +65,8 @@ private:
static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT; static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT;
using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>; using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>;
StagingBufferRef GetStreamBuffer(size_t size);
std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage); std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage);
StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage); StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage);
@ -70,6 +81,15 @@ private:
MemoryAllocator& memory_allocator; MemoryAllocator& memory_allocator;
VKScheduler& scheduler; VKScheduler& scheduler;
vk::Buffer stream_buffer;
vk::DeviceMemory stream_memory;
u8* stream_pointer = nullptr;
size_t iterator = 0;
size_t used_iterator = 0;
size_t free_iterator = 0;
std::array<u64, NUM_SYNCS> sync_ticks{};
StagingBuffersCache device_local_cache; StagingBuffersCache device_local_cache;
StagingBuffersCache upload_cache; StagingBuffersCache upload_cache;
StagingBuffersCache download_cache; StagingBuffersCache download_cache;

View File

@ -818,11 +818,10 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
} }
} }
void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
std::span<const BufferImageCopy> copies) {
// TODO: Move this to another API // TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext(); scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
const VkBuffer src_buffer = map.buffer; const VkBuffer src_buffer = map.buffer;
const VkImage vk_image = *image; const VkImage vk_image = *image;
const VkImageAspectFlags vk_aspect_mask = aspect_mask; const VkImageAspectFlags vk_aspect_mask = aspect_mask;
@ -833,11 +832,11 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
}); });
} }
void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, void Image::UploadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferCopy> copies) { std::span<const VideoCommon::BufferCopy> copies) {
// TODO: Move this to another API // TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext(); scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferCopies(copies, buffer_offset); std::vector vk_copies = TransformBufferCopies(copies, map.offset);
const VkBuffer src_buffer = map.buffer; const VkBuffer src_buffer = map.buffer;
const VkBuffer dst_buffer = *buffer; const VkBuffer dst_buffer = *buffer;
scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
@ -846,9 +845,8 @@ void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset,
}); });
} }
void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset, void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
std::span<const BufferImageCopy> copies) { std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask);
scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
vk_copies](vk::CommandBuffer cmdbuf) { vk_copies](vk::CommandBuffer cmdbuf) {
const VkImageMemoryBarrier read_barrier{ const VkImageMemoryBarrier read_barrier{

View File

@ -82,7 +82,7 @@ struct TextureCacheRuntime {
return false; return false;
} }
void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t, void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>) { std::span<const VideoCommon::SwizzleParameters>) {
UNREACHABLE(); UNREACHABLE();
} }
@ -100,13 +100,12 @@ public:
explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
VAddr cpu_addr); VAddr cpu_addr);
void UploadMemory(const StagingBufferRef& map, size_t buffer_offset, void UploadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies); std::span<const VideoCommon::BufferImageCopy> copies);
void UploadMemory(const StagingBufferRef& map, size_t buffer_offset, void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferCopy> copies);
std::span<const VideoCommon::BufferCopy> copies);
void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset, void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies); std::span<const VideoCommon::BufferImageCopy> copies);
[[nodiscard]] VkImage Handle() const noexcept { [[nodiscard]] VkImage Handle() const noexcept {

View File

@ -212,7 +212,7 @@ private:
/// Upload data from guest to an image /// Upload data from guest to an image
template <typename StagingBuffer> template <typename StagingBuffer>
void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset); void UploadImageContents(Image& image, StagingBuffer& staging_buffer);
/// Find or create an image view from a guest descriptor /// Find or create an image view from a guest descriptor
[[nodiscard]] ImageViewId FindImageView(const TICEntry& config); [[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
@ -592,7 +592,7 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
Image& image = slot_images[image_id]; Image& image = slot_images[image_id];
auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes); auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FullDownloadCopies(image.info); const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(map, 0, copies); image.DownloadMemory(map, copies);
runtime.Finish(); runtime.Finish();
SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span); SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
} }
@ -750,24 +750,24 @@ void TextureCache<P>::PopAsyncFlushes() {
total_size_bytes += slot_images[image_id].unswizzled_size_bytes; total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
} }
auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
size_t buffer_offset = 0; const size_t original_offset = download_map.offset;
for (const ImageId image_id : download_ids) { for (const ImageId image_id : download_ids) {
Image& image = slot_images[image_id]; Image& image = slot_images[image_id];
const auto copies = FullDownloadCopies(image.info); const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(download_map, buffer_offset, copies); image.DownloadMemory(download_map, copies);
buffer_offset += image.unswizzled_size_bytes; download_map.offset += image.unswizzled_size_bytes;
} }
// Wait for downloads to finish // Wait for downloads to finish
runtime.Finish(); runtime.Finish();
buffer_offset = 0; download_map.offset = original_offset;
const std::span<u8> download_span = download_map.mapped_span; std::span<u8> download_span = download_map.mapped_span;
for (const ImageId image_id : download_ids) { for (const ImageId image_id : download_ids) {
const ImageBase& image = slot_images[image_id]; const ImageBase& image = slot_images[image_id];
const auto copies = FullDownloadCopies(image.info); const auto copies = FullDownloadCopies(image.info);
const std::span<u8> image_download_span = download_span.subspan(buffer_offset); SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, download_span);
SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, image_download_span); download_map.offset += image.unswizzled_size_bytes;
buffer_offset += image.unswizzled_size_bytes; download_span = download_span.subspan(image.unswizzled_size_bytes);
} }
committed_downloads.pop(); committed_downloads.pop();
} }
@ -798,32 +798,32 @@ void TextureCache<P>::RefreshContents(Image& image) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
return; return;
} }
auto map = runtime.UploadStagingBuffer(MapSizeBytes(image)); auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, map, 0); UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier(); runtime.InsertUploadMemoryBarrier();
} }
template <class P> template <class P>
template <typename MapBuffer> template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) { void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
const std::span<u8> mapped_span = map.mapped_span.subspan(buffer_offset); const std::span<u8> mapped_span = staging.mapped_span;
const GPUVAddr gpu_addr = image.gpu_addr; const GPUVAddr gpu_addr = image.gpu_addr;
if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { if (True(image.flags & ImageFlagBits::AcceleratedUpload)) {
gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes()); gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes());
const auto uploads = FullUploadSwizzles(image.info); const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, map, buffer_offset, uploads); runtime.AccelerateImageUpload(image, staging, uploads);
} else if (True(image.flags & ImageFlagBits::Converted)) { } else if (True(image.flags & ImageFlagBits::Converted)) {
std::vector<u8> unswizzled_data(image.unswizzled_size_bytes); std::vector<u8> unswizzled_data(image.unswizzled_size_bytes);
auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data); auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data);
ConvertImage(unswizzled_data, image.info, mapped_span, copies); ConvertImage(unswizzled_data, image.info, mapped_span, copies);
image.UploadMemory(map, buffer_offset, copies); image.UploadMemory(staging, copies);
} else if (image.info.type == ImageType::Buffer) { } else if (image.info.type == ImageType::Buffer) {
const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)}; const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)};
image.UploadMemory(map, buffer_offset, copies); image.UploadMemory(staging, copies);
} else { } else {
const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span); const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span);
image.UploadMemory(map, buffer_offset, copies); image.UploadMemory(staging, copies);
} }
} }

View File

@ -168,7 +168,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkFreeCommandBuffers); X(vkFreeCommandBuffers);
X(vkFreeDescriptorSets); X(vkFreeDescriptorSets);
X(vkFreeMemory); X(vkFreeMemory);
X(vkGetBufferMemoryRequirements); X(vkGetBufferMemoryRequirements2);
X(vkGetDeviceQueue); X(vkGetDeviceQueue);
X(vkGetEventStatus); X(vkGetEventStatus);
X(vkGetFenceStatus); X(vkGetFenceStatus);
@ -786,10 +786,20 @@ DeviceMemory Device::AllocateMemory(const VkMemoryAllocateInfo& ai) const {
return DeviceMemory(memory, handle, *dld); return DeviceMemory(memory, handle, *dld);
} }
VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer) const noexcept { VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer,
VkMemoryRequirements requirements; void* pnext) const noexcept {
dld->vkGetBufferMemoryRequirements(handle, buffer, &requirements); const VkBufferMemoryRequirementsInfo2 info{
return requirements; .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
.pNext = nullptr,
.buffer = buffer,
};
VkMemoryRequirements2 requirements{
.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
.pNext = pnext,
.memoryRequirements{},
};
dld->vkGetBufferMemoryRequirements2(handle, &info, &requirements);
return requirements.memoryRequirements;
} }
VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept { VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept {

View File

@ -283,7 +283,7 @@ struct DeviceDispatch : InstanceDispatch {
PFN_vkFreeCommandBuffers vkFreeCommandBuffers{}; PFN_vkFreeCommandBuffers vkFreeCommandBuffers{};
PFN_vkFreeDescriptorSets vkFreeDescriptorSets{}; PFN_vkFreeDescriptorSets vkFreeDescriptorSets{};
PFN_vkFreeMemory vkFreeMemory{}; PFN_vkFreeMemory vkFreeMemory{};
PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements{}; PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2{};
PFN_vkGetDeviceQueue vkGetDeviceQueue{}; PFN_vkGetDeviceQueue vkGetDeviceQueue{};
PFN_vkGetEventStatus vkGetEventStatus{}; PFN_vkGetEventStatus vkGetEventStatus{};
PFN_vkGetFenceStatus vkGetFenceStatus{}; PFN_vkGetFenceStatus vkGetFenceStatus{};
@ -871,7 +871,8 @@ public:
DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const; DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const;
VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer) const noexcept; VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer,
void* pnext = nullptr) const noexcept;
VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept; VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept;