From 41929371dcc267a1f367dbfb458d22b9a3cce9e8 Mon Sep 17 00:00:00 2001
From: Dwayne Slater <ds84182@gmail.com>
Date: Tue, 2 Jan 2018 18:32:33 -0500
Subject: [PATCH] Optimize AttributeBuffer to OutputVertex conversion (#3283)

Optimize AttributeBuffer to OutputVertex conversion

First I unrolled the inner loop, then I pushed semantics validation
outside of the hotloop.

I also added overflow slots to avoid conditional branches.

Super Mario 3D Land's intro runs at almost full speed when compiled with
Clang, and theres a noticible speed increase in MSVC. GCC hasn't been
tested but I'm confident in its ability to optimize this code.
---
 src/video_core/command_processor.cpp |  3 ++
 src/video_core/regs_rasterizer.h     |  2 ++
 src/video_core/shader/shader.cpp     | 46 +++++++++++++++++-----------
 src/video_core/shader/shader.h       |  1 +
 4 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index c34a277db..3ccd1297e 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -221,6 +221,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     MICROPROFILE_SCOPE(GPU_Drawing);
                     immediate_attribute_id = 0;
 
+                    Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
+
                     auto* shader_engine = Shader::GetEngine();
                     shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
 
@@ -289,6 +291,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
         // Later, these can be compiled and cached.
         const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
         VertexLoader loader(regs.pipeline);
+        Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
 
         // Load vertices
         bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h
index 4fef00d76..9f68494ce 100644
--- a/src/video_core/regs_rasterizer.h
+++ b/src/video_core/regs_rasterizer.h
@@ -87,6 +87,8 @@ struct RasterizerRegs {
         BitField<8, 5, Semantic> map_y;
         BitField<16, 5, Semantic> map_z;
         BitField<24, 5, Semantic> map_w;
+
+        u32 raw;
     } vs_output_attributes[7];
 
     INSERT_PADDING_WORDS(0xe);
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 2d0ffe821..194c9df11 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <cinttypes>
 #include <cmath>
 #include <cstring>
 #include "common/bit_set.h"
@@ -21,32 +22,41 @@ namespace Pica {
 
 namespace Shader {
 
+void OutputVertex::ValidateSemantics(const RasterizerRegs& regs) {
+    unsigned int num_attributes = regs.vs_output_total;
+    ASSERT(num_attributes <= 7);
+    for (size_t attrib = 0; attrib < num_attributes; ++attrib) {
+        u32 output_register_map = regs.vs_output_attributes[attrib].raw;
+        for (size_t comp = 0; comp < 4; ++comp) {
+            u32 semantic = (output_register_map >> (8 * comp)) & 0x1F;
+            ASSERT_MSG(semantic < 24 || semantic == RasterizerRegs::VSOutputAttributes::INVALID,
+                       "Invalid/unknown semantic id: %" PRIu32, semantic);
+        }
+    }
+}
+
 OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
                                                const AttributeBuffer& input) {
     // Setup output data
     union {
         OutputVertex ret{};
-        std::array<float24, 24> vertex_slots;
+        // Allow us to overflow OutputVertex to avoid branches, since
+        // RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
+        // would be out of bounds otherwise.
+        std::array<float24, 32> vertex_slots_overflow;
     };
-    static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes.");
 
-    unsigned int num_attributes = regs.vs_output_total;
-    ASSERT(num_attributes <= 7);
-    for (unsigned int i = 0; i < num_attributes; ++i) {
-        const auto& output_register_map = regs.vs_output_attributes[i];
+    // Assert that OutputVertex has enough space for 24 semantic registers
+    static_assert(sizeof(std::array<float24, 24>) == sizeof(ret),
+                  "Struct and array have different sizes.");
 
-        RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = {
-            output_register_map.map_x, output_register_map.map_y, output_register_map.map_z,
-            output_register_map.map_w};
-
-        for (unsigned comp = 0; comp < 4; ++comp) {
-            RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp];
-            if (semantic < vertex_slots.size()) {
-                vertex_slots[semantic] = input.attr[i][comp];
-            } else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) {
-                LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic);
-            }
-        }
+    unsigned int num_attributes = regs.vs_output_total & 7;
+    for (size_t attrib = 0; attrib < num_attributes; ++attrib) {
+        const auto output_register_map = regs.vs_output_attributes[attrib];
+        vertex_slots_overflow[output_register_map.map_x] = input.attr[attrib][0];
+        vertex_slots_overflow[output_register_map.map_y] = input.attr[attrib][1];
+        vertex_slots_overflow[output_register_map.map_z] = input.attr[attrib][2];
+        vertex_slots_overflow[output_register_map.map_w] = input.attr[attrib][3];
     }
 
     // The hardware takes the absolute and saturates vertex colors like this, *before* doing
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 8740a1618..21ac01064 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -50,6 +50,7 @@ struct OutputVertex {
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
 
+    static void ValidateSemantics(const RasterizerRegs& regs);
     static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
                                             const AttributeBuffer& output);
 };