From 8a099ac99f61871f0492864d7e95a5922e57223d Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 15 Apr 2019 12:43:37 -0400 Subject: [PATCH 1/4] Correct Kepler Memory on Linear Pushes. --- src/video_core/engines/kepler_memory.cpp | 40 +++++++++++++++--------- src/video_core/engines/kepler_memory.h | 24 ++++++++++++-- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index cd51a31d7..3ed28f4a7 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -10,6 +10,8 @@ #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" +#include "video_core/textures/convert.h" +#include "video_core/textures/decoders.h" namespace Tegra::Engines { @@ -27,30 +29,40 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { switch (method_call.method) { case KEPLERMEMORY_REG_INDEX(exec): { - state.write_offset = 0; + ProcessExec(); break; } case KEPLERMEMORY_REG_INDEX(data): { - ProcessData(method_call.argument); + ProcessData(method_call.argument, method_call.IsLastCall()); break; } } } -void KeplerMemory::ProcessData(u32 data) { - ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported"); - ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0); +void KeplerMemory::ProcessExec() { + state.write_offset = 0; + state.copy_size = regs.line_length_in * regs.line_count; + state.inner_buffer.resize(state.copy_size); +} - // We have to invalidate the destination region to evict any outdated surfaces from the cache. - // We do this before actually writing the new data because the destination address might - // contain a dirty surface that will have to be written back to memory. 
- const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)}; - rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32)); - memory_manager.Write(address, data); +void KeplerMemory::ProcessData(u32 data, bool is_last_call) { + const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset); + std::memcpy(&state.inner_buffer[state.write_offset], &data, sub_copy_size); + state.write_offset += sub_copy_size; + if (is_last_call) { + UNIMPLEMENTED_IF_MSG(regs.exec.linear == 0, "Block Linear Copy is not implemented"); + if (regs.exec.linear != 0) { + const GPUVAddr address{regs.dest.Address()}; + const auto host_ptr = memory_manager.GetPointer(address); + // We have to invalidate the destination region to evict any outdated surfaces from the + // cache. We do this before actually writing the new data because the destination + // address might contain a dirty surface that will have to be written back to memory. - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); - - state.write_offset++; + rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), state.copy_size); + std::memcpy(host_ptr, state.inner_buffer.data(), state.copy_size); + system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } + } } } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 78b6c3e45..5f892ddad 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -6,6 +6,7 @@ #include <array> #include <cstddef> +#include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" @@ -51,7 +52,11 @@ public: u32 address_high; u32 address_low; u32 pitch; - u32 block_dimensions; + union { + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + }; u32 width; u32 height; u32 depth; @@ -63,6 +68,18 @@ public: return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
address_low); } + + u32 BlockWidth() const { + return 1U << block_width.Value(); + } + + u32 BlockHeight() const { + return 1U << block_height.Value(); + } + + u32 BlockDepth() const { + return 1U << block_depth.Value(); + } } dest; struct { @@ -81,6 +98,8 @@ public: struct { u32 write_offset = 0; + u32 copy_size = 0; + std::vector<u8> inner_buffer; } state{}; private: @@ -88,7 +107,8 @@ private: VideoCore::RasterizerInterface& rasterizer; MemoryManager& memory_manager; - void ProcessData(u32 data); + void ProcessExec(); + void ProcessData(u32 data, bool is_last_call); }; #define ASSERT_REG_POSITION(field_name, position) \ From bec28d692d21a42f17ae26f0ab6271aca1c233cd Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 15 Apr 2019 21:06:04 -0400 Subject: [PATCH 2/4] Implement Block Linear copies in Kepler Memory. --- src/video_core/engines/kepler_memory.cpp | 19 ++++++++++++++----- src/video_core/textures/decoders.cpp | 21 +++++++++++++++++++++ src/video_core/textures/decoders.h | 3 +++ 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 3ed28f4a7..4df19c1f5 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -10,7 +10,6 @@ #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" -#include "video_core/textures/convert.h" #include "video_core/textures/decoders.h" namespace Tegra::Engines { @@ -47,13 +46,12 @@ void KeplerMemory::ProcessExec() { void KeplerMemory::ProcessData(u32 data, bool is_last_call) { const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset); - std::memcpy(&state.inner_buffer[state.write_offset], &data, sub_copy_size); + std::memcpy(&state.inner_buffer[state.write_offset], &regs.data, sub_copy_size); state.write_offset += sub_copy_size; if (is_last_call) { - UNIMPLEMENTED_IF_MSG(regs.exec.linear == 0, "Block
Linear Copy is not implemented"); + const GPUVAddr address{regs.dest.Address()}; + const auto host_ptr = memory_manager.GetPointer(address); if (regs.exec.linear != 0) { - const GPUVAddr address{regs.dest.Address()}; - const auto host_ptr = memory_manager.GetPointer(address); // We have to invalidate the destination region to evict any outdated surfaces from the // cache. We do this before actually writing the new data because the destination // address might contain a dirty surface that will have to be written back to memory. @@ -61,6 +59,17 @@ void KeplerMemory::ProcessData(u32 data, bool is_last_call) { rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), state.copy_size); std::memcpy(host_ptr, state.inner_buffer.data(), state.copy_size); system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } else { + UNIMPLEMENTED_IF(regs.dest.z != 0); + UNIMPLEMENTED_IF(regs.dest.depth != 1); + UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1); + UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1); + const std::size_t dst_size = Tegra::Texture::CalculateSize( + true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1); + rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), dst_size); + Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, + regs.dest.y, regs.dest.BlockHeight(), state.copy_size, + state.inner_buffer.data(), host_ptr); } } } diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 995d0e068..6e02a6407 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -288,6 +288,27 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 } } +void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + std::size_t copy_size, u8* source_data, u8* swizzle_data) { + const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; + std::size_t count = 0; + for (u32 y = dst_y; y < height && count < copy_size; ++y) { 
+ const u32 gob_address_y = + (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + + ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; + const auto& table = legacy_swizzle_table[y % gob_size_y]; + for (u32 x = dst_x; x < width && count < copy_size; ++x) { + const u32 gob_address = gob_address_y + (x / gob_size_x) * gob_size * block_height; + const u32 swizzled_offset = gob_address + table[x % gob_size_x]; + const u8* source_line = source_data + count; + u8* dest_addr = swizzle_data + swizzled_offset; + count++; + + std::memcpy(dest_addr, source_line, 1); + } + } +} + std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, u32 height) { std::vector<u8> rgba_data; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index e078fa274..21d4b37fc 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -51,4 +51,7 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, u32 offset_x, u32 offset_y); +void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, + std::size_t copy_size, u8* source_data, u8* swizzle_data); + } // namespace Tegra::Texture From 3e96c367bd1729d1a6c8bfd8b532301da85d4b5a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Mon, 15 Apr 2019 22:42:34 -0400 Subject: [PATCH 3/4] Use WriteBlock and ReadBlock.
--- src/video_core/engines/kepler_memory.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 4df19c1f5..7387886a3 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -50,15 +50,8 @@ void KeplerMemory::ProcessData(u32 data, bool is_last_call) { state.write_offset += sub_copy_size; if (is_last_call) { const GPUVAddr address{regs.dest.Address()}; - const auto host_ptr = memory_manager.GetPointer(address); if (regs.exec.linear != 0) { - // We have to invalidate the destination region to evict any outdated surfaces from the - // cache. We do this before actually writing the new data because the destination - // address might contain a dirty surface that will have to be written back to memory. - - rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), state.copy_size); - std::memcpy(host_ptr, state.inner_buffer.data(), state.copy_size); - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size); } else { UNIMPLEMENTED_IF(regs.dest.z != 0); UNIMPLEMENTED_IF(regs.dest.depth != 1); @@ -66,11 +59,14 @@ void KeplerMemory::ProcessData(u32 data, bool is_last_call) { UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1); const std::size_t dst_size = Tegra::Texture::CalculateSize( true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1); - rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), dst_size); + std::vector<u8> tmp_buffer(dst_size); + memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y, regs.dest.BlockHeight(), state.copy_size, - state.inner_buffer.data(), host_ptr); + state.inner_buffer.data(), tmp_buffer.data()); + memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); } +
system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); } } From da91e6e4b6aabe13f7b748930de8afa28d10aa6c Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Tue, 16 Apr 2019 12:00:46 -0400 Subject: [PATCH 4/4] Apply Const correctness to SwizzleKepler and replace u32 for size_t on iterators. --- src/video_core/textures/decoders.cpp | 16 +++++++++------- src/video_core/textures/decoders.h | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 6e02a6407..217805386 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -288,18 +288,20 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 } } -void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, - std::size_t copy_size, u8* source_data, u8* swizzle_data) { +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data) { const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; std::size_t count = 0; - for (u32 y = dst_y; y < height && count < copy_size; ++y) { - const u32 gob_address_y = + for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { + const std::size_t gob_address_y = (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; const auto& table = legacy_swizzle_table[y % gob_size_y]; - for (u32 x = dst_x; x < width && count < copy_size; ++x) { - const u32 gob_address = gob_address_y + (x / gob_size_x) * gob_size * block_height; - const u32 swizzled_offset = gob_address + table[x % gob_size_x]; + for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { + const std::size_t gob_address = + gob_address_y + (x / gob_size_x) * gob_size * block_height; + const std::size_t 
swizzled_offset = gob_address + table[x % gob_size_x]; const u8* source_line = source_data + count; u8* dest_addr = swizzle_data + swizzled_offset; count++; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index 21d4b37fc..e072d8401 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -51,7 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, u32 offset_x, u32 offset_y); -void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height, - std::size_t copy_size, u8* source_data, u8* swizzle_data); +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data); } // namespace Tegra::Texture