Optimize AttributeBuffer to OutputVertex conversion (#3283)
Optimize AttributeBuffer to OutputVertex conversion. First I unrolled the inner loop, then I pushed semantics validation outside of the hot loop. I also added overflow slots to avoid conditional branches. Super Mario 3D Land's intro runs at almost full speed when compiled with Clang, and there's a noticeable speed increase in MSVC. GCC hasn't been tested, but I'm confident in its ability to optimize this code.
This commit is contained in:
parent
3f7f2b42c0
commit
41929371dc
|
@ -221,6 +221,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
MICROPROFILE_SCOPE(GPU_Drawing);
|
MICROPROFILE_SCOPE(GPU_Drawing);
|
||||||
immediate_attribute_id = 0;
|
immediate_attribute_id = 0;
|
||||||
|
|
||||||
|
Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
|
||||||
|
|
||||||
auto* shader_engine = Shader::GetEngine();
|
auto* shader_engine = Shader::GetEngine();
|
||||||
shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
|
shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
|
||||||
|
|
||||||
|
@ -289,6 +291,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
|
||||||
// Later, these can be compiled and cached.
|
// Later, these can be compiled and cached.
|
||||||
const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
|
const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
|
||||||
VertexLoader loader(regs.pipeline);
|
VertexLoader loader(regs.pipeline);
|
||||||
|
Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
|
||||||
|
|
||||||
// Load vertices
|
// Load vertices
|
||||||
bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
|
bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
|
||||||
|
|
|
@ -87,6 +87,8 @@ struct RasterizerRegs {
|
||||||
BitField<8, 5, Semantic> map_y;
|
BitField<8, 5, Semantic> map_y;
|
||||||
BitField<16, 5, Semantic> map_z;
|
BitField<16, 5, Semantic> map_z;
|
||||||
BitField<24, 5, Semantic> map_w;
|
BitField<24, 5, Semantic> map_w;
|
||||||
|
|
||||||
|
u32 raw;
|
||||||
} vs_output_attributes[7];
|
} vs_output_attributes[7];
|
||||||
|
|
||||||
INSERT_PADDING_WORDS(0xe);
|
INSERT_PADDING_WORDS(0xe);
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
// Licensed under GPLv2 or any later version
|
// Licensed under GPLv2 or any later version
|
||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#include <cinttypes>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include "common/bit_set.h"
|
#include "common/bit_set.h"
|
||||||
|
@ -21,32 +22,41 @@ namespace Pica {
|
||||||
|
|
||||||
namespace Shader {
|
namespace Shader {
|
||||||
|
|
||||||
|
void OutputVertex::ValidateSemantics(const RasterizerRegs& regs) {
|
||||||
|
unsigned int num_attributes = regs.vs_output_total;
|
||||||
|
ASSERT(num_attributes <= 7);
|
||||||
|
for (size_t attrib = 0; attrib < num_attributes; ++attrib) {
|
||||||
|
u32 output_register_map = regs.vs_output_attributes[attrib].raw;
|
||||||
|
for (size_t comp = 0; comp < 4; ++comp) {
|
||||||
|
u32 semantic = (output_register_map >> (8 * comp)) & 0x1F;
|
||||||
|
ASSERT_MSG(semantic < 24 || semantic == RasterizerRegs::VSOutputAttributes::INVALID,
|
||||||
|
"Invalid/unknown semantic id: %" PRIu32, semantic);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
|
OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
|
||||||
const AttributeBuffer& input) {
|
const AttributeBuffer& input) {
|
||||||
// Setup output data
|
// Setup output data
|
||||||
union {
|
union {
|
||||||
OutputVertex ret{};
|
OutputVertex ret{};
|
||||||
std::array<float24, 24> vertex_slots;
|
// Allow us to overflow OutputVertex to avoid branches, since
|
||||||
|
// RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
|
||||||
|
// would be out of bounds otherwise.
|
||||||
|
std::array<float24, 32> vertex_slots_overflow;
|
||||||
};
|
};
|
||||||
static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes.");
|
|
||||||
|
|
||||||
unsigned int num_attributes = regs.vs_output_total;
|
// Assert that OutputVertex has enough space for 24 semantic registers
|
||||||
ASSERT(num_attributes <= 7);
|
static_assert(sizeof(std::array<float24, 24>) == sizeof(ret),
|
||||||
for (unsigned int i = 0; i < num_attributes; ++i) {
|
"Struct and array have different sizes.");
|
||||||
const auto& output_register_map = regs.vs_output_attributes[i];
|
|
||||||
|
|
||||||
RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = {
|
unsigned int num_attributes = regs.vs_output_total & 7;
|
||||||
output_register_map.map_x, output_register_map.map_y, output_register_map.map_z,
|
for (size_t attrib = 0; attrib < num_attributes; ++attrib) {
|
||||||
output_register_map.map_w};
|
const auto output_register_map = regs.vs_output_attributes[attrib];
|
||||||
|
vertex_slots_overflow[output_register_map.map_x] = input.attr[attrib][0];
|
||||||
for (unsigned comp = 0; comp < 4; ++comp) {
|
vertex_slots_overflow[output_register_map.map_y] = input.attr[attrib][1];
|
||||||
RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp];
|
vertex_slots_overflow[output_register_map.map_z] = input.attr[attrib][2];
|
||||||
if (semantic < vertex_slots.size()) {
|
vertex_slots_overflow[output_register_map.map_w] = input.attr[attrib][3];
|
||||||
vertex_slots[semantic] = input.attr[i][comp];
|
|
||||||
} else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) {
|
|
||||||
LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// The hardware takes the absolute and saturates vertex colors like this, *before* doing
|
// The hardware takes the absolute and saturates vertex colors like this, *before* doing
|
||||||
|
|
|
@ -50,6 +50,7 @@ struct OutputVertex {
|
||||||
INSERT_PADDING_WORDS(1);
|
INSERT_PADDING_WORDS(1);
|
||||||
Math::Vec2<float24> tc2;
|
Math::Vec2<float24> tc2;
|
||||||
|
|
||||||
|
static void ValidateSemantics(const RasterizerRegs& regs);
|
||||||
static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
|
static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
|
||||||
const AttributeBuffer& output);
|
const AttributeBuffer& output);
|
||||||
};
|
};
|
||||||
|
|
Reference in New Issue