From 809126c94a0ed8e7964d5a550abf7b3731d00512 Mon Sep 17 00:00:00 2001 From: Morph <39850852+Morph1984@users.noreply.github.com> Date: Thu, 1 Sep 2022 21:29:22 -0400 Subject: [PATCH] astc: Enable parallel CPU astc decoding Given the issues with GPU accelerated ASTC decoding with NVIDIA's latest drivers, parallelize astc decoding on the CPU. Uses half the available threads in the system for astc decoding. --- src/video_core/textures/astc.cpp | 50 ++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index e3f3d3c5d..b159494c5 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -13,7 +13,9 @@ #include +#include "common/alignment.h" #include "common/common_types.h" +#include "common/thread_worker.h" #include "video_core/textures/astc.h" class InputBitStream { @@ -1650,29 +1652,41 @@ static void DecompressBlock(std::span inBuf, const u32 blockWidth, void Decompress(std::span data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height, std::span output) { - u32 block_index = 0; - std::size_t depth_offset = 0; - for (u32 z = 0; z < depth; z++) { - for (u32 y = 0; y < height; y += block_height) { - for (u32 x = 0; x < width; x += block_width) { - const std::span blockPtr{data.subspan(block_index * 16, 16)}; + const u32 rows = Common::DivideUp(height, block_height); + const u32 cols = Common::DivideUp(width, block_width); - // Blocks can be at most 12x12 - std::array uncompData; - DecompressBlock(blockPtr, block_width, block_height, uncompData); + Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2, + "yuzu:ASTCDecompress"}; - u32 decompWidth = std::min(block_width, width - x); - u32 decompHeight = std::min(block_height, height - y); + for (u32 z = 0; z < depth; ++z) { + const u32 depth_offset = z * height * width * 4; + for (u32 y_index = 0; y_index < rows; ++y_index) { + auto decompress_stride = [data, width, height, depth, block_width, block_height, output, + rows, cols, z, depth_offset, y_index] { + const u32 y = y_index * block_height; + for (u32 x_index = 0; x_index < cols; ++x_index) { + const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index; + const u32 x = x_index * block_width; - const std::span outRow = output.subspan(depth_offset + (y * width + x) * 4); - for (u32 jj = 0; jj < decompHeight; jj++) { - std::memcpy(outRow.data() + jj * width * 4, - uncompData.data() + jj * block_width, decompWidth * 4); + const std::span blockPtr{data.subspan(block_index * 16, 16)}; + + // Blocks can be at most 12x12 + std::array uncompData; + DecompressBlock(blockPtr, block_width, block_height, uncompData); + + u32 decompWidth = std::min(block_width, width - x); + u32 decompHeight = std::min(block_height, height - y); + + const std::span outRow = output.subspan(depth_offset + (y * width + x) * 4); + for (u32 h = 0; h < decompHeight; ++h) { + std::memcpy(outRow.data() + h * width * 4, + uncompData.data() + h * block_width, decompWidth * 4); + } } - ++block_index; - } + }; + workers.QueueWork(std::move(decompress_stride)); } - depth_offset += height * width * 4; + workers.WaitForRequests(); } }