dolphin/Source/Core/VideoCommon/VertexLoader.h
degasus ec28a80e00 VideoLoader: remove VAT_*_FRACBITS
They were used to reduce the number of flushes, but as we no longer
flush on vertex loader changes (only on native vertex format
changes right now), this optimization is no longer needed.

This will allow us to hard code the frac factors within the
vertex loaders.
2014-12-21 13:47:42 +01:00

225 lines
6.2 KiB
C++

// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.
#pragma once
// Top vertex loaders
// Metroid Prime: P I16-flt N I16-s16 T0 I16-u16 T1 i16-flt
#include <algorithm>
#include <iterator>
#include <memory>
#include <string>
#include <type_traits>
#include <unordered_map>
#include "Common/CommonTypes.h"
#include "Common/x64Emitter.h"
#include "VideoCommon/CPMemory.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/NativeVertexFormat.h"
#include "VideoCommon/VertexLoaderUtils.h"
#if _M_SSE >= 0x401
#include <smmintrin.h>
#include <emmintrin.h>
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
#include <tmmintrin.h>
#endif
#ifdef _M_X86
#define USE_VERTEX_LOADER_JIT
#endif
// Global state used to communicate with the vertex loader functions.
// These are declarations only; the definitions live in another translation unit.
// NOTE(review): judging by the names, tcIndex/colIndex track the texture-coordinate
// and color slot currently being processed, colElements holds a per-color-channel
// element setting, and posScale/tcScale hold the scale factors for positions and
// for each of the eight texture coordinates - confirm against the loader functions.
// NOTE(review): GC_ALIGNED128/GC_ALIGNED64 presumably apply an alignment attribute
// to the wrapped declaration; verify against the macro definitions.
extern int tcIndex;
extern int colIndex;
extern int colElements[2];
GC_ALIGNED128(extern float posScale[4]);
GC_ALIGNED64(extern float tcScale[8][2]);
// Key identifying one vertex loader configuration.
//
// The key is made of five 32-bit words: the low and high halves of the vertex
// descriptor followed by the three VAT group words. A hash over those words is
// computed once at construction time so the key can be used cheaply in hashed
// containers (GetHash) as well as in ordered ones (operator <).
class VertexLoaderUID
{
	u32 vid[5];
	size_t hash;
public:
	// Builds an all-zero key. Both the words and the hash are initialized so that
	// a default-constructed key can be compared, ordered and hashed without
	// reading indeterminate memory.
	VertexLoaderUID()
	{
		std::fill(std::begin(vid), std::end(vid), 0);
		hash = CalculateHash();
	}

	// Builds the key for the given vertex descriptor / VAT pair.
	// NOTE(review): the shift by 32 assumes vtx_desc.Hex is a 64-bit value -
	// confirm against the TVtxDesc definition.
	VertexLoaderUID(const TVtxDesc& vtx_desc, const VAT& vat)
	{
		vid[0] = static_cast<u32>(vtx_desc.Hex & 0xFFFFFFFF);
		vid[1] = static_cast<u32>(vtx_desc.Hex >> 32);
		vid[2] = vat.g0.Hex;
		vid[3] = vat.g1.Hex;
		vid[4] = vat.g2.Hex;
		hash = CalculateHash();
	}

	// Strict weak ordering: lexicographic comparison of the five words.
	bool operator < (const VertexLoaderUID &other) const
	{
		return std::lexicographical_compare(std::begin(vid), std::end(vid),
		                                    std::begin(other.vid), std::end(other.vid));
	}

	// Two keys are equal when all five words match. The hash is compared first
	// because a mismatch there rejects most unequal keys with a single compare.
	bool operator == (const VertexLoaderUID& rh) const
	{
		return hash == rh.hash && std::equal(std::begin(vid), std::end(vid), std::begin(rh.vid));
	}

	// Returns the hash computed at construction time.
	size_t GetHash() const
	{
		return hash;
	}
private:
	// Multiplicative hash over the five words, seeded with all bits set.
	// Unsigned overflow wraps, which is intended.
	size_t CalculateHash() const
	{
		size_t h = static_cast<size_t>(-1);
		for (auto word : vid)
		{
			h = h * 137 + word;
		}
		return h;
	}
};
// Vertex loader built from a vertex descriptor (TVtxDesc) and a vertex attribute
// table entry (VAT). It exposes the raw vertex size, the native component mask and
// the native vertex declaration, and runs vertices from a source DataReader into a
// destination DataReader.
//
// Unless _M_GENERIC is defined the class derives from Gen::X64CodeBlock; generic
// builds get a plain class without the x64 emitter helpers.
//
// NOTE(review): the class declares a destructor and holds raw pointers
// (m_compiledCode, m_native_vertex_format) but declares no copy operations -
// confirm that instances are never copied.
// ARMTODO: This should be done in a better way
#ifndef _M_GENERIC
class VertexLoader : public Gen::X64CodeBlock
#else
class VertexLoader
#endif
{
public:
// Creates a loader for the given vertex descriptor and attribute table.
VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
~VertexLoader();
// Size in bytes of one raw (source) vertex.
int GetVertexSize() const {return m_VertexSize;}
// Native component mask stored in m_native_components.
u32 GetNativeComponents() const { return m_native_components; }
// Native vertex declaration stored in m_native_vtx_decl.
const PortableVertexDeclaration& GetNativeVertexDeclaration() const
{ return m_native_vtx_decl; }
// NOTE(review): presumably prepares loader state before RunVertices is called for
// `count` vertices of the given primitive - confirm against the implementation.
void SetupRunVertices(int primitive, int const count);
// Runs `count` vertices of the given primitive from src to dst.
// NOTE(review): the meaning of the int return value is not visible here (likely
// the number of vertices processed) - confirm against the implementation.
int RunVertices(int primitive, int count, DataReader src, DataReader dst);
// For debugging / profiling
void AppendToString(std::string *dest) const;
// Value of the m_numLoadedVertices counter.
int GetNumLoadedVerts() const { return m_numLoadedVertices; }
NativeVertexFormat* GetNativeVertexFormat();
// Empties the static map of native vertex formats shared by all loaders.
static void ClearNativeVertexFormatCache() { s_native_vertex_map.clear(); }
private:
int m_VertexSize; // number of bytes of a raw GC vertex. Computed by CompileVertexTranslator.
// GC vertex format
TVtxAttr m_VtxAttr; // VAT decoded into easy format
TVtxDesc m_VtxDesc; // Not really used currently - or well it is, but could be easily avoided.
VAT m_vat;
// PC vertex format
u32 m_native_components;
PortableVertexDeclaration m_native_vtx_decl;
// The pipeline-stage table only exists when the vertex loader JIT is disabled.
#ifndef USE_VERTEX_LOADER_JIT
// Pipeline.
TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower.
int m_numPipelineStages;
#endif
const u8 *m_compiledCode;
int m_numLoadedVertices;
// NOTE(review): raw pointer; presumably refers to an entry owned by
// s_native_vertex_map - confirm against GetNativeVertexFormat.
NativeVertexFormat* m_native_vertex_format;
// Native vertex formats keyed by their declaration; the map owns the formats.
static std::unordered_map<PortableVertexDeclaration, std::unique_ptr<NativeVertexFormat>> s_native_vertex_map;
void SetVAT(const VAT& vat);
void CompileVertexTranslator();
void ConvertVertices(int count);
// Declared regardless of USE_VERTEX_LOADER_JIT, unlike m_PipelineStages above.
void WriteCall(TPipelineFunction);
// x64 emitter helpers, unavailable in generic builds.
#ifndef _M_GENERIC
void WriteGetVariable(int bits, Gen::OpArg dest, void *address);
void WriteSetVariable(int bits, void *address, Gen::OpArg dest);
#endif
};
#if _M_SSE >= 0x301
// Byte-shuffle control masks for _mm_shuffle_epi8, used by Vertex_Read_SSSE3.
//
// _mm_set_epi32 lists the 32-bit lanes from highest to lowest, so the last
// argument describes output lane 0. Within a lane each control byte is the index
// of the source byte to copy; a control byte of 0xFF (high bit set) writes zero.
//
// Naming:
//   Swap32      - reverse the byte order of each 32-bit input element.
//   Swap16to32  - reverse the byte order of each 16-bit input element and place it
//                 in its own 32-bit lane.
//   8to32       - place each 8-bit input element in its own 32-bit lane.
//   l / h       - put the element in the low / high bytes of the lane; the other
//                 bytes are zeroed. The "h" variants leave the value in the top
//                 bits so a following arithmetic right shift can sign-extend it.
//   _3 / _2     - fill three / two output lanes; the remaining lanes are zeroed.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L);
static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L);
static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL);
static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL);
static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L);
static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L);
static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
// Reads two or three components of type T from pData, converts them to floats and
// writes two or three floats at g_vertex_manager_write_ptr, which is then advanced
// past the floats written.
//
//   T        - source component type. 4-byte types are treated as floats: they are
//              byte-swapped and stored unscaled. 1- and 2-byte types are widened to
//              32-bit integers (sign-extended when T is signed), converted to float
//              and multiplied by `scale`.
//   threeIn  - true when the source holds three components, false for two.
//   threeOut - true to write three floats, false to write two.
//   scale    - per-lane factor applied to the integer formats only.
//
// The load always fetches 4, 8 or 16 bytes and the three-float store writes a full
// 16 bytes, so both may touch memory beyond the components actually used.
template <typename T, bool threeIn, bool threeOut>
__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale)
{
	const int src_bytes = sizeof(T) * (threeIn ? 3 : 2);

	// Fetch the smallest chunk that covers every input component.
	__m128i raw;
	if (src_bytes <= 4)
		raw = _mm_cvtsi32_si128(*(u32*)pData);
	else if (src_bytes <= 8)
		raw = _mm_loadl_epi64((__m128i*)pData);
	else
		raw = _mm_loadu_si128((__m128i*)pData);

	if (sizeof(T) == 4)
	{
		// Float input: only the byte order changes, no scaling is applied.
		const __m128i swapped = _mm_shuffle_epi8(raw, threeIn ? kMaskSwap32_3 : kMaskSwap32_2);
		if (threeOut)
			_mm_storeu_si128((__m128i*)g_vertex_manager_write_ptr, swapped);
		else
			_mm_storel_epi64((__m128i*)g_vertex_manager_write_ptr, swapped);
	}
	else
	{
		// Integer input: pick the shuffle that byte-swaps each element and spreads
		// it into its own 32-bit lane. Unsigned values land in the low bytes of the
		// lane; signed values land in the high bytes so they can be sign-extended.
		const bool is_16bit = sizeof(T) == 2;
		__m128i shuffle;
		if (std::is_unsigned<T>::value)
		{
			if (is_16bit)
				shuffle = threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2;
			else
				shuffle = threeIn ? kMask8to32l_3 : kMask8to32l_2;
		}
		else
		{
			if (is_16bit)
				shuffle = threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2;
			else
				shuffle = threeIn ? kMask8to32h_3 : kMask8to32h_2;
		}
		__m128i widened = _mm_shuffle_epi8(raw, shuffle);

		// An arithmetic shift moves the value down from the top of the lane while
		// replicating its sign bit.
		if (std::is_signed<T>::value)
			widened = _mm_srai_epi32(widened, 32 - sizeof(T) * 8);

		const __m128 scaled = _mm_mul_ps(_mm_cvtepi32_ps(widened), scale);
		if (threeOut)
			_mm_storeu_ps((float*)g_vertex_manager_write_ptr, scaled);
		else
			_mm_storel_pi((__m64*)g_vertex_manager_write_ptr, scaled);
	}

	g_vertex_manager_write_ptr += sizeof(float) * (threeOut ? 3 : 2);
}
#endif