OpenGL Cache: Optimize Morton Copy to copy in tiles
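Adds two compile-time lookup tables of function pointers, one per copy direction (Morton-to-GL and GL-to-Morton), each holding a MortonCopy template instantiation for every supported pixel format.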
This commit is contained in:
parent
160ac25527
commit
e9e2d444ef
|
@ -46,6 +46,82 @@ static const std::array<FormatTuple, 4> depth_format_tuples = {{
|
||||||
{GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24S8
|
{GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24S8
|
||||||
}};
|
}};
|
||||||
|
|
||||||
|
static bool FillSurface(const Surface& surface, const u8* fill_data,
|
||||||
|
const MathUtil::Rectangle<u32>& fill_rect) {
|
||||||
|
OpenGLState state = OpenGLState::GetCurState();
|
||||||
|
|
||||||
|
OpenGLState prev_state = state;
|
||||||
|
SCOPE_EXIT({ prev_state.Apply(); });
|
||||||
|
|
||||||
|
OpenGLState::ResetTexture(surface->texture.handle);
|
||||||
|
|
||||||
|
state.scissor.enabled = true;
|
||||||
|
state.scissor.x = static_cast<GLint>(fill_rect.left);
|
||||||
|
state.scissor.y = static_cast<GLint>(std::min(fill_rect.top, fill_rect.bottom));
|
||||||
|
state.scissor.width = static_cast<GLsizei>(fill_rect.GetWidth());
|
||||||
|
state.scissor.height = static_cast<GLsizei>(fill_rect.GetHeight());
|
||||||
|
|
||||||
|
state.draw.draw_framebuffer = transfer_framebuffers[1].handle;
|
||||||
|
state.Apply();
|
||||||
|
|
||||||
|
if (surface->type == SurfaceType::Color || surface->type == SurfaceType::Texture) {
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
|
||||||
|
surface->texture.handle, 0);
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
|
||||||
|
0);
|
||||||
|
|
||||||
|
Pica::Texture::TextureInfo tex_info{};
|
||||||
|
tex_info.format = static_cast<Pica::TexturingRegs::TextureFormat>(surface->pixel_format);
|
||||||
|
Math::Vec4<u8> color = Pica::Texture::LookupTexture(fill_data, 0, 0, tex_info);
|
||||||
|
|
||||||
|
std::array<GLfloat, 4> color_values = {color.x / 255.f, color.y / 255.f, color.z / 255.f,
|
||||||
|
color.w / 255.f};
|
||||||
|
|
||||||
|
state.color_mask.red_enabled = GL_TRUE;
|
||||||
|
state.color_mask.green_enabled = GL_TRUE;
|
||||||
|
state.color_mask.blue_enabled = GL_TRUE;
|
||||||
|
state.color_mask.alpha_enabled = GL_TRUE;
|
||||||
|
state.Apply();
|
||||||
|
glClearBufferfv(GL_COLOR, 0, &color_values[0]);
|
||||||
|
} else if (surface->type == SurfaceType::Depth) {
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
|
||||||
|
surface->texture.handle, 0);
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
|
||||||
|
|
||||||
|
u32 value_32bit = 0;
|
||||||
|
GLfloat value_float;
|
||||||
|
|
||||||
|
if (surface->pixel_format == SurfaceParams::PixelFormat::D16) {
|
||||||
|
std::memcpy(&value_32bit, fill_data, 2);
|
||||||
|
value_float = value_32bit / 65535.0f; // 2^16 - 1
|
||||||
|
} else if (surface->pixel_format == SurfaceParams::PixelFormat::D24) {
|
||||||
|
std::memcpy(&value_32bit, fill_data, 3);
|
||||||
|
value_float = value_32bit / 16777215.0f; // 2^24 - 1
|
||||||
|
}
|
||||||
|
|
||||||
|
state.depth.write_mask = GL_TRUE;
|
||||||
|
state.Apply();
|
||||||
|
glClearBufferfv(GL_DEPTH, 0, &value_float);
|
||||||
|
} else if (surface->type == SurfaceType::DepthStencil) {
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
|
||||||
|
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
|
||||||
|
surface->texture.handle, 0);
|
||||||
|
|
||||||
|
u32 value_32bit;
|
||||||
|
std::memcpy(&value_32bit, fill_data, 4);
|
||||||
|
|
||||||
|
GLfloat value_float = (value_32bit & 0xFFFFFF) / 16777215.0f; // 2^24 - 1
|
||||||
|
GLint value_int = (value_32bit >> 24);
|
||||||
|
|
||||||
|
state.depth.write_mask = GL_TRUE;
|
||||||
|
state.stencil.write_mask = -1;
|
||||||
|
state.Apply();
|
||||||
|
glClearBufferfi(GL_DEPTH_STENCIL, 0, value_float, value_int);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
RasterizerCacheOpenGL::RasterizerCacheOpenGL() {
|
RasterizerCacheOpenGL::RasterizerCacheOpenGL() {
|
||||||
transfer_framebuffers[0].Create();
|
transfer_framebuffers[0].Create();
|
||||||
transfer_framebuffers[1].Create();
|
transfer_framebuffers[1].Create();
|
||||||
|
@ -55,55 +131,131 @@ RasterizerCacheOpenGL::~RasterizerCacheOpenGL() {
|
||||||
FlushAll();
|
FlushAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void MortonCopyPixels(CachedSurface::PixelFormat pixel_format, u32 width, u32 height,
|
template <bool morton_to_gl, PixelFormat format>
|
||||||
u32 bytes_per_pixel, u32 gl_bytes_per_pixel, u8* morton_data,
|
static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* gl_buffer) {
|
||||||
u8* gl_data, bool morton_to_gl) {
|
constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / 8;
|
||||||
using PixelFormat = CachedSurface::PixelFormat;
|
constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
|
||||||
|
for (u32 y = 0; y < 8; ++y) {
|
||||||
u8* data_ptrs[2];
|
for (u32 x = 0; x < 8; ++x) {
|
||||||
u32 depth_stencil_shifts[2] = {24, 8};
|
u8* tile_ptr = tile_buffer + VideoCore::MortonInterleave(x, y) * bytes_per_pixel;
|
||||||
|
u8* gl_ptr = gl_buffer + ((7 - y) * stride + x) * gl_bytes_per_pixel;
|
||||||
if (morton_to_gl) {
|
if (morton_to_gl) {
|
||||||
std::swap(depth_stencil_shifts[0], depth_stencil_shifts[1]);
|
if (format == PixelFormat::D24S8) {
|
||||||
}
|
gl_ptr[0] = tile_ptr[3];
|
||||||
|
std::memcpy(gl_ptr + 1, tile_ptr, 3);
|
||||||
if (pixel_format == PixelFormat::D24S8) {
|
} else {
|
||||||
for (unsigned y = 0; y < height; ++y) {
|
std::memcpy(gl_ptr, tile_ptr, bytes_per_pixel);
|
||||||
for (unsigned x = 0; x < width; ++x) {
|
|
||||||
const u32 coarse_y = y & ~7;
|
|
||||||
u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
|
|
||||||
coarse_y * width * bytes_per_pixel;
|
|
||||||
u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel;
|
|
||||||
|
|
||||||
data_ptrs[morton_to_gl] = morton_data + morton_offset;
|
|
||||||
data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index];
|
|
||||||
|
|
||||||
// Swap depth and stencil value ordering since 3DS does not match OpenGL
|
|
||||||
u32 depth_stencil;
|
|
||||||
memcpy(&depth_stencil, data_ptrs[1], sizeof(u32));
|
|
||||||
depth_stencil = (depth_stencil << depth_stencil_shifts[0]) |
|
|
||||||
(depth_stencil >> depth_stencil_shifts[1]);
|
|
||||||
|
|
||||||
memcpy(data_ptrs[0], &depth_stencil, sizeof(u32));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (unsigned y = 0; y < height; ++y) {
|
if (format == PixelFormat::D24S8) {
|
||||||
for (unsigned x = 0; x < width; ++x) {
|
std::memcpy(tile_ptr, gl_ptr + 1, 3);
|
||||||
const u32 coarse_y = y & ~7;
|
tile_ptr[3] = gl_ptr[0];
|
||||||
u32 morton_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
|
} else {
|
||||||
coarse_y * width * bytes_per_pixel;
|
std::memcpy(tile_ptr, gl_ptr, bytes_per_pixel);
|
||||||
u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel;
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
data_ptrs[morton_to_gl] = morton_data + morton_offset;
|
template <bool morton_to_gl, PixelFormat format>
|
||||||
data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index];
|
static void MortonCopy(u32 stride, u32 height, u8* gl_buffer, PAddr base, PAddr start, PAddr end) {
|
||||||
|
constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / 8;
|
||||||
|
constexpr u32 tile_size = bytes_per_pixel * 64;
|
||||||
|
|
||||||
memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
|
constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
|
||||||
}
|
static_assert(gl_bytes_per_pixel >= bytes_per_pixel, "");
|
||||||
}
|
gl_buffer += gl_bytes_per_pixel - bytes_per_pixel;
|
||||||
|
|
||||||
|
const PAddr aligned_down_start = base + Common::AlignDown(start - base, tile_size);
|
||||||
|
const PAddr aligned_start = base + Common::AlignUp(start - base, tile_size);
|
||||||
|
const PAddr aligned_end = base + Common::AlignDown(end - base, tile_size);
|
||||||
|
|
||||||
|
ASSERT(!morton_to_gl || (aligned_start == start && aligned_end == end));
|
||||||
|
|
||||||
|
const u32 begin_pixel_index = (aligned_down_start - base) / bytes_per_pixel;
|
||||||
|
u32 x = (begin_pixel_index % (stride * 8)) / 8;
|
||||||
|
u32 y = (begin_pixel_index / (stride * 8)) * 8;
|
||||||
|
|
||||||
|
gl_buffer += ((height - 8 - y) * stride + x) * gl_bytes_per_pixel;
|
||||||
|
|
||||||
|
auto glbuf_next_tile = [&] {
|
||||||
|
x = (x + 8) % stride;
|
||||||
|
gl_buffer += 8 * gl_bytes_per_pixel;
|
||||||
|
if (!x) {
|
||||||
|
y += 8;
|
||||||
|
gl_buffer -= stride * 9 * gl_bytes_per_pixel;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
u8* tile_buffer = Memory::GetPhysicalPointer(start);
|
||||||
|
|
||||||
|
if (start < aligned_start && !morton_to_gl) {
|
||||||
|
std::array<u8, tile_size> tmp_buf;
|
||||||
|
MortonCopyTile<morton_to_gl, format>(stride, &tmp_buf[0], gl_buffer);
|
||||||
|
std::memcpy(tile_buffer, &tmp_buf[start - aligned_down_start],
|
||||||
|
std::min(aligned_start, end) - start);
|
||||||
|
|
||||||
|
tile_buffer += aligned_start - start;
|
||||||
|
glbuf_next_tile();
|
||||||
|
}
|
||||||
|
|
||||||
|
u8* const buffer_end = tile_buffer + aligned_end - aligned_start;
|
||||||
|
while (tile_buffer < buffer_end) {
|
||||||
|
MortonCopyTile<morton_to_gl, format>(stride, tile_buffer, gl_buffer);
|
||||||
|
tile_buffer += tile_size;
|
||||||
|
glbuf_next_tile();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (end > std::max(aligned_start, aligned_end) && !morton_to_gl) {
|
||||||
|
std::array<u8, tile_size> tmp_buf;
|
||||||
|
MortonCopyTile<morton_to_gl, format>(stride, &tmp_buf[0], gl_buffer);
|
||||||
|
std::memcpy(tile_buffer, &tmp_buf[0], end - aligned_end);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static constexpr std::array<void (*)(u32, u32, u8*, PAddr, PAddr, PAddr), 18> morton_to_gl_fns = {
|
||||||
|
MortonCopy<true, PixelFormat::RGBA8>, // 0
|
||||||
|
MortonCopy<true, PixelFormat::RGB8>, // 1
|
||||||
|
MortonCopy<true, PixelFormat::RGB5A1>, // 2
|
||||||
|
MortonCopy<true, PixelFormat::RGB565>, // 3
|
||||||
|
MortonCopy<true, PixelFormat::RGBA4>, // 4
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr, // 5 - 13
|
||||||
|
MortonCopy<true, PixelFormat::D16>, // 14
|
||||||
|
nullptr, // 15
|
||||||
|
MortonCopy<true, PixelFormat::D24>, // 16
|
||||||
|
MortonCopy<true, PixelFormat::D24S8> // 17
|
||||||
|
};
|
||||||
|
|
||||||
|
static constexpr std::array<void (*)(u32, u32, u8*, PAddr, PAddr, PAddr), 18> gl_to_morton_fns = {
|
||||||
|
MortonCopy<false, PixelFormat::RGBA8>, // 0
|
||||||
|
MortonCopy<false, PixelFormat::RGB8>, // 1
|
||||||
|
MortonCopy<false, PixelFormat::RGB5A1>, // 2
|
||||||
|
MortonCopy<false, PixelFormat::RGB565>, // 3
|
||||||
|
MortonCopy<false, PixelFormat::RGBA4>, // 4
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr, // 5 - 13
|
||||||
|
MortonCopy<false, PixelFormat::D16>, // 14
|
||||||
|
nullptr, // 15
|
||||||
|
MortonCopy<false, PixelFormat::D24>, // 16
|
||||||
|
MortonCopy<false, PixelFormat::D24S8> // 17
|
||||||
|
};
|
||||||
|
|
||||||
void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex,
|
void RasterizerCacheOpenGL::BlitTextures(GLuint src_tex, GLuint dst_tex,
|
||||||
CachedSurface::SurfaceType type,
|
CachedSurface::SurfaceType type,
|
||||||
const MathUtil::Rectangle<int>& src_rect,
|
const MathUtil::Rectangle<int>& src_rect,
|
||||||
|
|
|
@ -71,8 +71,8 @@ struct CachedSurface {
|
||||||
Invalid = 4,
|
Invalid = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
static unsigned int GetFormatBpp(CachedSurface::PixelFormat format) {
|
static constexpr unsigned int GetFormatBpp(CachedSurface::PixelFormat format) {
|
||||||
static const std::array<unsigned int, 18> bpp_table = {
|
constexpr std::array<unsigned int, 18> bpp_table = {
|
||||||
32, // RGBA8
|
32, // RGBA8
|
||||||
24, // RGB8
|
24, // RGB8
|
||||||
16, // RGB5A1
|
16, // RGB5A1
|
||||||
|
@ -142,7 +142,7 @@ struct CachedSurface {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static SurfaceType GetFormatType(PixelFormat pixel_format) {
|
static constexpr SurfaceType GetFormatType(PixelFormat pixel_format) {
|
||||||
if ((unsigned int)pixel_format < 5) {
|
if ((unsigned int)pixel_format < 5) {
|
||||||
return SurfaceType::Color;
|
return SurfaceType::Color;
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,9 +9,9 @@
|
||||||
namespace VideoCore {
|
namespace VideoCore {
|
||||||
|
|
||||||
// 8x8 Z-Order coordinate from 2D coordinates
|
// 8x8 Z-Order coordinate from 2D coordinates
|
||||||
/// Interleaves the low three bits of x and y into a Z-order index in the range 0..63:
/// x bits land on positions 0, 2 and 4, y bits on positions 1, 3 and 5. Higher coordinate
/// bits are ignored, so the result is the pixel's position inside its 8x8 tile.
static constexpr u32 MortonInterleave(u32 x, u32 y) {
    const u32 x_in_tile = x % 8;
    const u32 y_in_tile = y % 8;
    // Move bit 1 up by one place and bit 2 up by two, leaving a gap after every bit.
    const u32 x_spread = (x_in_tile & 1) | ((x_in_tile & 2) << 1) | ((x_in_tile & 4) << 2);
    const u32 y_spread = (y_in_tile & 1) | ((y_in_tile & 2) << 1) | ((y_in_tile & 4) << 2);
    // The y bits occupy the odd positions, so the two halves never overlap.
    return x_spread + (y_spread << 1);
}
|
||||||
|
|
||||||
|
|
Reference in New Issue