From 372bca5945bc455e12bb9d309679e0bb8c00283d Mon Sep 17 00:00:00 2001
From: Ameer J <52414509+ameerj@users.noreply.github.com>
Date: Tue, 19 Dec 2023 15:53:36 -0500
Subject: [PATCH] gl_buffer_cache: Reintroduce NV_vertex_buffer_unified_memory

Work around Nvidia drivers complaining when a buffer is bound as both a vertex buffer and a transform feedback buffer.
---
 .../renderer_opengl/gl_buffer_cache.cpp       | 45 ++++++++++++++++---
 .../renderer_opengl/gl_buffer_cache.h         |  1 +
 src/video_core/renderer_opengl/gl_device.cpp  |  1 +
 src/video_core/renderer_opengl/gl_device.h    |  5 +++
 .../renderer_opengl/renderer_opengl.cpp       | 16 ++++++-
 5 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index a71866b75..b787b6994 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -58,6 +58,9 @@ Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rast
         glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(name.size()), name.data());
     }
     glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW);
+    if (runtime.has_unified_vertex_buffers) {
+        glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address);
+    }
 }
 
 void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept {
@@ -109,6 +112,7 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_,
     : device{device_}, staging_buffer_pool{staging_buffer_pool_},
       has_fast_buffer_sub_data{device.HasFastBufferSubData()},
       use_assembly_shaders{device.UseAssemblyShaders()},
+      has_unified_vertex_buffers{device.HasVertexBufferUnifiedMemory()},
       stream_buffer{has_fast_buffer_sub_data ? std::nullopt : std::make_optional<StreamBuffer>()} {
     GLint gl_max_attributes;
     glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes);
@@ -210,8 +214,14 @@ void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t siz
 }
 
 void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle());
-    index_buffer_offset = offset;
+    if (has_unified_vertex_buffers) {
+        buffer.MakeResident(GL_READ_ONLY);
+        glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset,
+                               static_cast<GLsizeiptr>(Common::AlignUp(size, 4)));
+    } else {
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle());
+        index_buffer_offset = offset;
+    }
 }
 
 void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size,
@@ -219,8 +229,15 @@ void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset,
     if (index >= max_attributes) {
         return;
     }
-    glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset),
-                       static_cast<GLsizei>(stride));
+    if (has_unified_vertex_buffers) {
+        buffer.MakeResident(GL_READ_ONLY);
+        glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index,
+                               buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size));
+    } else {
+        glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset),
+                           static_cast<GLsizei>(stride));
+    }
 }
 
 void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bindings) {
@@ -233,9 +250,23 @@ void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings<Buffer>& bi
                            [](u64 stride) { return static_cast<GLsizei>(stride); });
     const u32 count =
         std::min(static_cast<u32>(bindings.buffers.size()), max_attributes - bindings.min_index);
-    glBindVertexBuffers(bindings.min_index, static_cast<GLsizei>(count), buffer_handles.data(),
-                        reinterpret_cast<const GLintptr*>(bindings.offsets.data()),
-                        buffer_strides.data());
+    if (has_unified_vertex_buffers) {
+        for (u32 index = 0; index < count; ++index) {
+            Buffer& buffer = *bindings.buffers[index];
+            buffer.MakeResident(GL_READ_ONLY);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, bindings.min_index + index,
+                                   buffer.HostGpuAddr() + bindings.offsets[index],
+                                   static_cast<GLsizeiptr>(bindings.sizes[index]));
+        }
+        static constexpr std::array<size_t, 32> ZEROS{};
+        glBindVertexBuffers(bindings.min_index, static_cast<GLsizei>(count),
+                            reinterpret_cast<const GLuint*>(ZEROS.data()),
+                            reinterpret_cast<const GLintptr*>(ZEROS.data()), buffer_strides.data());
+    } else {
+        glBindVertexBuffers(bindings.min_index, static_cast<GLsizei>(count), buffer_handles.data(),
+                            reinterpret_cast<const GLintptr*>(bindings.offsets.data()),
+                            buffer_strides.data());
+    }
 }
 
 void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer,
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 71cd45d35..1e8708f59 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -209,6 +209,7 @@ private:
 
     bool has_fast_buffer_sub_data = false;
     bool use_assembly_shaders = false;
+    bool has_unified_vertex_buffers = false;
 
     bool use_storage_buffers = false;
 
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index a6c93068f..993438a27 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -200,6 +200,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) {
     has_broken_texture_view_formats = is_amd || (!is_linux && is_intel);
     has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
     has_derivative_control = GLAD_GL_ARB_derivative_control;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
     has_debugging_tool_attached = IsDebugToolAttached(extensions);
     has_depth_buffer_float = HasExtension(extensions, "GL_NV_depth_buffer_float");
     has_geometry_shader_passthrough = GLAD_GL_NV_geometry_shader_passthrough;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 96034ea4a..a5a6bbbba 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -72,6 +72,10 @@ public:
         return has_texture_shadow_lod;
     }
 
+    bool HasVertexBufferUnifiedMemory() const {
+        return has_vertex_buffer_unified_memory;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -211,6 +215,7 @@ private:
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
     bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 7a4f0c5c1..2933718b6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -168,6 +168,14 @@ RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
     if (!GLAD_GL_ARB_seamless_cubemap_per_texture && !GLAD_GL_AMD_seamless_cubemap_per_texture) {
         glEnable(GL_TEXTURE_CUBE_MAP_SEAMLESS);
     }
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+        glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 RendererOpenGL::~RendererOpenGL() = default;
@@ -667,7 +675,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     if (Settings::values.scaling_filter.GetValue() != Settings::ScalingFilter::NearestNeighbor) {
         glBindSampler(0, present_sampler.handle);