shader_ir: Implement VOTE

Implement VOTE using Nvidia's intrinsics. Documentation about these can be found here https://developer.nvidia.com/reading-between-threads-shader-intrinsics Instead of using portable ARB instructions I opted to use Nvidia intrinsics because these are the closest we have to how Tegra X1 hardware renders. To stub VOTE on non-Nvidia drivers (including nouveau) this commit simulates a GPU with a warp size of one, returning what is meaningful for the instruction being emulated: * anyThreadNV(value) -> value * allThreadsNV(value) -> value * allThreadsEqualNV(value) -> true ballotARB, also known as "uint64_t(activeThreadsNV())", emits VOTE.ANY Rd, PT, PT; on nouveau's compiler. This doesn't match exactly to Nvidia's code VOTE.ALL Rd, PT, PT; Which is emulated with activeThreadsNV() by this commit. In theory this shouldn't really matter since .ANY, .ALL and .EQ affect the predicates (set to PT on those cases) and not the registers.
2019-08-09 23:50:21 -03:00 · 2019-08-09 23:50:21 -03:00 · 4e35177e23
commit 4e35177e23
parent b4a8cfbd00
13 changed files with 164 additions and 1 deletions
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@ -81,6 +81,7 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
    "${VIDEO_CORE}/shader/decode/shift.cpp"
    "${VIDEO_CORE}/shader/decode/video.cpp"
+    "${VIDEO_CORE}/shader/decode/warp.cpp"
    "${VIDEO_CORE}/shader/decode/xmad.cpp"
    "${VIDEO_CORE}/shader/control_flow.cpp"
    "${VIDEO_CORE}/shader/control_flow.h"
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
      "${VIDEO_CORE}/shader/decode/shift.cpp"
      "${VIDEO_CORE}/shader/decode/video.cpp"
+      "${VIDEO_CORE}/shader/decode/warp.cpp"
      "${VIDEO_CORE}/shader/decode/xmad.cpp"
      "${VIDEO_CORE}/shader/control_flow.cpp"
      "${VIDEO_CORE}/shader/control_flow.h"
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -100,6 +100,7 @@ add_library(video_core STATIC
    shader/decode/integer_set.cpp
    shader/decode/half_set.cpp
    shader/decode/video.cpp
+    shader/decode/warp.cpp
    shader/decode/xmad.cpp
    shader/decode/other.cpp
    shader/control_flow.cpp
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {
    Output = 1,
 };

+enum class VoteOperation : u64 {
+    All = 0, // allThreadsNV
+    Any = 1, // anyThreadNV
+    Eq = 2,  // allThreadsEqualNV
+};
+
 union Instruction {
    Instruction& operator=(const Instruction& instr) {
        value = instr.value;
@ -564,6 +570,13 @@ union Instruction {
        BitField<13, 1, u64> trigger;
    } nop;

+    union {
+        BitField<48, 2, VoteOperation> operation;
+        BitField<45, 3, u64> dest_pred;
+        BitField<39, 3, u64> value;
+        BitField<42, 1, u64> negate_value;
+    } vote;
+
    union {
        BitField<8, 8, Register> gpr;
        BitField<20, 24, s64> offset;
@ -1487,6 +1500,7 @@ public:
        SYNC,
        BRK,
        DEPBAR,
+        VOTE,
        BFE_C,
        BFE_R,
        BFE_IMM,
@ -1649,6 +1663,7 @@ public:
        Hfma2,
        Flow,
        Synch,
+        Warp,
        Memory,
        Texture,
        Image,
@ -1775,6 +1790,7 @@ private:
            INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
            INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
            INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
+            INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
            INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
            INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
            INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@ -27,6 +27,8 @@ Device::Device() {
    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
+                          GLAD_GL_NV_shader_thread_shuffle;
    has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
    has_variable_aoffi = TestVariableAoffi();
    has_component_indexing_bug = TestComponentIndexingBug();
@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) {
    uniform_buffer_alignment = 0;
    max_vertex_attributes = 16;
    max_varyings = 15;
+    has_warp_intrinsics = true;
    has_vertex_viewport_layer = true;
    has_variable_aoffi = true;
    has_component_indexing_bug = false;
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@ -30,6 +30,10 @@ public:
        return max_varyings;
    }

+    bool HasWarpIntrinsics() const {
+        return has_warp_intrinsics;
+    }
+
    bool HasVertexViewportLayer() const {
        return has_vertex_viewport_layer;
    }
@ -50,6 +54,7 @@ private:
    std::size_t shader_storage_alignment{};
    u32 max_vertex_attributes{};
    u32 max_varyings{};
+    bool has_warp_intrinsics{};
    bool has_vertex_viewport_layer{};
    bool has_variable_aoffi{};
    bool has_component_indexing_bug{};
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
    const auto texture_buffer_usage{variant.texture_buffer_usage};

    std::string source = "#version 430 core\n"
-                         "#extension GL_ARB_separate_shader_objects : enable\n";
+                         "#extension GL_ARB_separate_shader_objects : enable\n"
+                         "#extension GL_NV_gpu_shader5 : enable\n"
+                         "#extension GL_NV_shader_thread_group : enable\n";
    if (entries.shader_viewport_layer_array) {
        source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
    }
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@ -1735,6 +1735,48 @@ private:
        return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
    }

+    std::string BallotThread(Operation operation) {
+        const std::string value = VisitOperand(operation, 0, Type::Bool);
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia warp intrinsics are not available and its required by a shader");
+            // Stub on non-Nvidia devices by simulating all threads voting the same as the active
+            // one.
+            return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", value);
+        }
+        return fmt::format("utof(ballotThreadNV({}))", value);
+    }
+
+    std::string Vote(Operation operation, const char* func) {
+        const std::string value = VisitOperand(operation, 0, Type::Bool);
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia vote intrinsics are not available and its required by a shader");
+            // Stub with a warp size of one.
+            return value;
+        }
+        return fmt::format("{}({})", func, value);
+    }
+
+    std::string VoteAll(Operation operation) {
+        return Vote(operation, "allThreadsNV");
+    }
+
+    std::string VoteAny(Operation operation) {
+        return Vote(operation, "anyThreadNV");
+    }
+
+    std::string VoteEqual(Operation operation) {
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia vote intrinsics are not available and its required by a shader");
+            // We must return true here since a stub for a theoretical warp size of 1 will always
+            // return an equal result for all its votes.
+            return "true";
+        }
+        return Vote(operation, "allThreadsEqualNV");
+    }
+
    static constexpr std::array operation_decompilers = {
        &GLSLDecompiler::Assign,

@ -1885,6 +1927,11 @@ private:
        &GLSLDecompiler::WorkGroupId<0>,
        &GLSLDecompiler::WorkGroupId<1>,
        &GLSLDecompiler::WorkGroupId<2>,
+
+        &GLSLDecompiler::BallotThread,
+        &GLSLDecompiler::VoteAll,
+        &GLSLDecompiler::VoteAny,
+        &GLSLDecompiler::VoteEqual,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@ -1072,6 +1072,26 @@ private:
        return {};
    }

+    Id BallotThread(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAll(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAny(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteEqual(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
    Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                      const std::string& name) {
        const Id id = OpVariable(type, storage);
@ -1364,6 +1384,11 @@ private:
        &SPIRVDecompiler::WorkGroupId<0>,
        &SPIRVDecompiler::WorkGroupId<1>,
        &SPIRVDecompiler::WorkGroupId<2>,
+
+        &SPIRVDecompiler::BallotThread,
+        &SPIRVDecompiler::VoteAll,
+        &SPIRVDecompiler::VoteAny,
+        &SPIRVDecompiler::VoteEqual,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
        {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
        {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
        {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
+        {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
        {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
        {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
        {OpCode::Type::Image, &ShaderIR::DecodeImage},
--- a/src/video_core/shader/decode/warp.cpp
+++ b/src/video_core/shader/decode/warp.cpp
@ -0,0 +1,55 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+using Tegra::Shader::Pred;
+using Tegra::Shader::VoteOperation;
+
+namespace {
+OperationCode GetOperationCode(VoteOperation vote_op) {
+    switch (vote_op) {
+    case VoteOperation::All:
+        return OperationCode::VoteAll;
+    case VoteOperation::Any:
+        return OperationCode::VoteAny;
+    case VoteOperation::Eq:
+        return OperationCode::VoteEqual;
+    default:
+        UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
+        return OperationCode::VoteAll;
+    }
+}
+} // Anonymous namespace
+
+u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::VOTE: {
+        const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);
+        const Node active = Operation(OperationCode::BallotThread, value);
+        const Node vote = Operation(GetOperationCode(instr.vote.operation), value);
+        SetRegister(bb, instr.gpr0, active);
+        SetPredicate(bb, instr.vote.dest_pred, vote);
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
+        break;
+    }
+
+    return pc;
+}
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@ -168,6 +168,11 @@ enum class OperationCode {
    WorkGroupIdY,       /// () -> uint
    WorkGroupIdZ,       /// () -> uint

+    BallotThread, /// (bool) -> uint
+    VoteAll,      /// (bool) -> bool
+    VoteAny,      /// (bool) -> bool
+    VoteEqual,    /// (bool) -> bool
+
    Amount,
 };

--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@ -167,6 +167,7 @@ private:
    u32 DecodeFfma(NodeBlock& bb, u32 pc);
    u32 DecodeHfma2(NodeBlock& bb, u32 pc);
    u32 DecodeConversion(NodeBlock& bb, u32 pc);
+    u32 DecodeWarp(NodeBlock& bb, u32 pc);
    u32 DecodeMemory(NodeBlock& bb, u32 pc);
    u32 DecodeTexture(NodeBlock& bb, u32 pc);
    u32 DecodeImage(NodeBlock& bb, u32 pc);