astc: Enable parallel CPU astc decoding
Given the issues with GPU-accelerated ASTC decoding on NVIDIA's latest drivers, parallelize ASTC decoding on the CPU. Uses half of the hardware threads available in the system (and always at least one) for ASTC decoding.
This commit is contained in:
parent
4b07596b83
commit
809126c94a
|
@ -13,7 +13,9 @@
|
||||||
|
|
||||||
#include <boost/container/static_vector.hpp>
|
#include <boost/container/static_vector.hpp>
|
||||||
|
|
||||||
|
#include "common/alignment.h"
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
|
#include "common/thread_worker.h"
|
||||||
#include "video_core/textures/astc.h"
|
#include "video_core/textures/astc.h"
|
||||||
|
|
||||||
class InputBitStream {
|
class InputBitStream {
|
||||||
|
@ -1650,11 +1652,22 @@ static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
|
||||||
|
|
||||||
void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
|
void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
|
||||||
uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
|
uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) {
|
||||||
u32 block_index = 0;
|
const u32 rows = Common::DivideUp(height, block_height);
|
||||||
std::size_t depth_offset = 0;
|
const u32 cols = Common::DivideUp(width, block_width);
|
||||||
for (u32 z = 0; z < depth; z++) {
|
|
||||||
for (u32 y = 0; y < height; y += block_height) {
|
Common::ThreadWorker workers{std::max(std::thread::hardware_concurrency(), 2U) / 2,
|
||||||
for (u32 x = 0; x < width; x += block_width) {
|
"yuzu:ASTCDecompress"};
|
||||||
|
|
||||||
|
for (u32 z = 0; z < depth; ++z) {
|
||||||
|
const u32 depth_offset = z * height * width * 4;
|
||||||
|
for (u32 y_index = 0; y_index < rows; ++y_index) {
|
||||||
|
auto decompress_stride = [data, width, height, depth, block_width, block_height, output,
|
||||||
|
rows, cols, z, depth_offset, y_index] {
|
||||||
|
const u32 y = y_index * block_height;
|
||||||
|
for (u32 x_index = 0; x_index < cols; ++x_index) {
|
||||||
|
const u32 block_index = (z * rows * cols) + (y_index * cols) + x_index;
|
||||||
|
const u32 x = x_index * block_width;
|
||||||
|
|
||||||
const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
|
const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)};
|
||||||
|
|
||||||
// Blocks can be at most 12x12
|
// Blocks can be at most 12x12
|
||||||
|
@ -1665,14 +1678,15 @@ void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height,
|
||||||
u32 decompHeight = std::min(block_height, height - y);
|
u32 decompHeight = std::min(block_height, height - y);
|
||||||
|
|
||||||
const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
|
const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4);
|
||||||
for (u32 jj = 0; jj < decompHeight; jj++) {
|
for (u32 h = 0; h < decompHeight; ++h) {
|
||||||
std::memcpy(outRow.data() + jj * width * 4,
|
std::memcpy(outRow.data() + h * width * 4,
|
||||||
uncompData.data() + jj * block_width, decompWidth * 4);
|
uncompData.data() + h * block_width, decompWidth * 4);
|
||||||
}
|
|
||||||
++block_index;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
depth_offset += height * width * 4;
|
};
|
||||||
|
workers.QueueWork(std::move(decompress_stride));
|
||||||
|
}
|
||||||
|
workers.WaitForRequests();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Reference in New Issue