gdkchan a731ab3a2a Add a new JIT compiler for CPU code (#693)
* Start of the ARMeilleure project

* Refactoring around the old IRAdapter, now renamed to PreAllocator

* Optimize the LowestBitSet method

* Add CLZ support and fix CLS implementation

* Add missing Equals and GetHashCode overrides on some structs, misc small tweaks

* Implement the ByteSwap IR instruction, and some refactoring on the assembler

* Implement the DivideUI IR instruction and fix 64-bits IDIV

* Correct constant operand type on CSINC

* Move division instructions implementation to InstEmitDiv

* Fix destination type for the ConditionalSelect IR instruction

* Implement UMULH and SMULH, with new IR instructions

* Fix some issues with shift instructions

* Fix constant types for BFM instructions

* Fix up new tests using the new V128 struct

* Update tests

* Move DIV tests to a separate file

* Add support for calls, and some instructions that depends on them

* Start adding support for SIMD & FP types, along with some of the related ARM instructions

* Fix some typos and the divide instruction with FP operands

* Fix wrong method call on Clz_V

* Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes

* Implement SIMD logical instructions and more misc. fixes

* Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations

* Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes

* Implement SIMD shift instruction and fix Dup_V

* Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table

* Fix check with tolerance on tester

* Implement FP & SIMD comparison instructions, and some fixes

* Update FCVT (Scalar) encoding on the table to support the Half-float variants

* Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes

* Use old memory access methods, made a start on SIMD memory insts support, some fixes

* Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes

* Fix arguments count with struct return values, other fixes

* More instructions

* Misc. fixes and integrate LDj3SNuD fixes

* Update tests

* Add a faster linear scan allocator, unwinding support on windows, and other changes

* Update Ryujinx.HLE

* Update Ryujinx.Graphics

* Fix V128 return pointer passing, RCX is clobbered

* Update Ryujinx.Tests

* Update ITimeZoneService

* Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks

* Use generic GetFunctionPointerForDelegate method and other tweaks

* Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics

* Remove some unused code on the assembler

* Fix REX.W prefix regression on float conversion instructions, add some sort of profiler

* Add hardware capability detection

* Fix regression on Sha1h and revert Fcm** changes

* Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator

* Fix silly mistake introduced on last commit on CpuId

* Generate inline stack probes when the stack allocation is too large

* Initial support for the System-V ABI

* Support multiple destination operands

* Fix SSE2 VectorInsert8 path, and other fixes

* Change placement of XMM callee save and restore code to match other compilers

* Rename Dest to Destination and Inst to Instruction

* Fix a regression related to calls and the V128 type

* Add an extra space on comments to match code style

* Some refactoring

* Fix vector insert FP32 SSE2 path

* Port over the ARM32 instructions

* Avoid memory protection races on JIT Cache

* Another fix on VectorInsert FP32 (thanks to LDj3SNuD

* Float operands don't need to use the same register when VEX is supported

* Add a new register allocator, higher quality code for hot code (tier up), and other tweaks

* Some nits, small improvements on the pre allocator

* CpuThreadState is gone

* Allow changing CPU emulators with a config entry

* Add runtime identifiers on the ARMeilleure project

* Allow switching between CPUs through a config entry (pt. 2)

* Change win10-x64 to win-x64 on projects

* Update the Ryujinx project to use ARMeilleure

* Ensure that the selected register is valid on the hybrid allocator

* Allow exiting on returns to 0 (should fix test regression)

* Remove register assignments for most used variables on the hybrid allocator

* Do not use fixed registers as spill temp

* Add missing namespace and remove unneeded using

* Address PR feedback

* Fix types, etc

* Enable AssumeStrictAbiCompliance by default

* Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 21:56:22 +03:00

404 lines
14 KiB
C#

using ARMeilleure.Memory;
using Ryujinx.Audio;
using Ryujinx.Audio.Adpcm;
using Ryujinx.Common.Logging;
using Ryujinx.HLE.HOS.Ipc;
using Ryujinx.HLE.HOS.Kernel.Common;
using Ryujinx.HLE.HOS.Kernel.Threading;
using Ryujinx.HLE.Utilities;
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace Ryujinx.HLE.HOS.Services.Aud.AudioRenderer
{
class IAudioRenderer : IpcService, IDisposable
{
// This is the amount of samples that are going to be appended
// each time that RequestUpdateAudioRenderer is called. Ideally,
// this value shouldn't be neither too small (to avoid the player
// starving due to running out of samples) or too large (to avoid
// high latency).
private const int MixBufferSamplesCount = 960;
private KEvent _updateEvent;
private IMemoryManager _memory;
private IAalOutput _audioOut;
private AudioRendererParameter _params;
private MemoryPoolContext[] _memoryPools;
private VoiceContext[] _voices;
private int _track;
private PlayState _playState;
public IAudioRenderer(
Horizon system,
IMemoryManager memory,
IAalOutput audioOut,
AudioRendererParameter Params)
{
_updateEvent = new KEvent(system);
_memory = memory;
_audioOut = audioOut;
_params = Params;
_track = audioOut.OpenTrack(
AudioConsts.HostSampleRate,
AudioConsts.HostChannelsCount,
AudioCallback);
_memoryPools = CreateArray<MemoryPoolContext>(Params.EffectCount + Params.VoiceCount * 4);
_voices = CreateArray<VoiceContext>(Params.VoiceCount);
InitializeAudioOut();
_playState = PlayState.Stopped;
}
[Command(0)]
// GetSampleRate() -> u32
public ResultCode GetSampleRate(ServiceCtx context)
{
context.ResponseData.Write(_params.SampleRate);
return ResultCode.Success;
}
[Command(1)]
// GetSampleCount() -> u32
public ResultCode GetSampleCount(ServiceCtx context)
{
context.ResponseData.Write(_params.SampleCount);
return ResultCode.Success;
}
[Command(2)]
// GetMixBufferCount() -> u32
public ResultCode GetMixBufferCount(ServiceCtx context)
{
context.ResponseData.Write(_params.MixCount);
return ResultCode.Success;
}
[Command(3)]
// GetState() -> u32
public ResultCode GetState(ServiceCtx context)
{
context.ResponseData.Write((int)_playState);
Logger.PrintStub(LogClass.ServiceAudio, new { State = Enum.GetName(typeof(PlayState), _playState) });
return ResultCode.Success;
}
private void AudioCallback()
{
_updateEvent.ReadableEvent.Signal();
}
private static T[] CreateArray<T>(int size) where T : new()
{
T[] output = new T[size];
for (int index = 0; index < size; index++)
{
output[index] = new T();
}
return output;
}
private void InitializeAudioOut()
{
AppendMixedBuffer(0);
AppendMixedBuffer(1);
AppendMixedBuffer(2);
_audioOut.Start(_track);
}
[Command(4)]
// RequestUpdateAudioRenderer(buffer<nn::audio::detail::AudioRendererUpdateDataHeader, 5>)
// -> (buffer<nn::audio::detail::AudioRendererUpdateDataHeader, 6>, buffer<nn::audio::detail::AudioRendererUpdateDataHeader, 6>)
public ResultCode RequestUpdateAudioRenderer(ServiceCtx context)
{
long outputPosition = context.Request.ReceiveBuff[0].Position;
long outputSize = context.Request.ReceiveBuff[0].Size;
MemoryHelper.FillWithZeros(context.Memory, outputPosition, (int)outputSize);
long inputPosition = context.Request.SendBuff[0].Position;
StructReader reader = new StructReader(context.Memory, inputPosition);
StructWriter writer = new StructWriter(context.Memory, outputPosition);
UpdateDataHeader inputHeader = reader.Read<UpdateDataHeader>();
reader.Read<BehaviorIn>(inputHeader.BehaviorSize);
MemoryPoolIn[] memoryPoolsIn = reader.Read<MemoryPoolIn>(inputHeader.MemoryPoolSize);
for (int index = 0; index < memoryPoolsIn.Length; index++)
{
MemoryPoolIn memoryPool = memoryPoolsIn[index];
if (memoryPool.State == MemoryPoolState.RequestAttach)
{
_memoryPools[index].OutStatus.State = MemoryPoolState.Attached;
}
else if (memoryPool.State == MemoryPoolState.RequestDetach)
{
_memoryPools[index].OutStatus.State = MemoryPoolState.Detached;
}
}
reader.Read<VoiceChannelResourceIn>(inputHeader.VoiceResourceSize);
VoiceIn[] voicesIn = reader.Read<VoiceIn>(inputHeader.VoiceSize);
for (int index = 0; index < voicesIn.Length; index++)
{
VoiceIn voice = voicesIn[index];
VoiceContext voiceCtx = _voices[index];
voiceCtx.SetAcquireState(voice.Acquired != 0);
if (voice.Acquired == 0)
{
continue;
}
if (voice.FirstUpdate != 0)
{
voiceCtx.AdpcmCtx = GetAdpcmDecoderContext(
voice.AdpcmCoeffsPosition,
voice.AdpcmCoeffsSize);
voiceCtx.SampleFormat = voice.SampleFormat;
voiceCtx.SampleRate = voice.SampleRate;
voiceCtx.ChannelsCount = voice.ChannelsCount;
voiceCtx.SetBufferIndex(voice.BaseWaveBufferIndex);
}
voiceCtx.WaveBuffers[0] = voice.WaveBuffer0;
voiceCtx.WaveBuffers[1] = voice.WaveBuffer1;
voiceCtx.WaveBuffers[2] = voice.WaveBuffer2;
voiceCtx.WaveBuffers[3] = voice.WaveBuffer3;
voiceCtx.Volume = voice.Volume;
voiceCtx.PlayState = voice.PlayState;
}
UpdateAudio();
UpdateDataHeader outputHeader = new UpdateDataHeader();
int updateHeaderSize = Marshal.SizeOf<UpdateDataHeader>();
outputHeader.Revision = IAudioRendererManager.RevMagic;
outputHeader.BehaviorSize = 0xb0;
outputHeader.MemoryPoolSize = (_params.EffectCount + _params.VoiceCount * 4) * 0x10;
outputHeader.VoiceSize = _params.VoiceCount * 0x10;
outputHeader.EffectSize = _params.EffectCount * 0x10;
outputHeader.SinkSize = _params.SinkCount * 0x20;
outputHeader.PerformanceManagerSize = 0x10;
outputHeader.TotalSize = updateHeaderSize +
outputHeader.BehaviorSize +
outputHeader.MemoryPoolSize +
outputHeader.VoiceSize +
outputHeader.EffectSize +
outputHeader.SinkSize +
outputHeader.PerformanceManagerSize;
writer.Write(outputHeader);
foreach (MemoryPoolContext memoryPool in _memoryPools)
{
writer.Write(memoryPool.OutStatus);
}
foreach (VoiceContext voice in _voices)
{
writer.Write(voice.OutStatus);
}
return ResultCode.Success;
}
[Command(5)]
// Start()
public ResultCode StartAudioRenderer(ServiceCtx context)
{
Logger.PrintStub(LogClass.ServiceAudio);
_playState = PlayState.Playing;
return ResultCode.Success;
}
[Command(6)]
// Stop()
public ResultCode StopAudioRenderer(ServiceCtx context)
{
Logger.PrintStub(LogClass.ServiceAudio);
_playState = PlayState.Stopped;
return ResultCode.Success;
}
[Command(7)]
// QuerySystemEvent() -> handle<copy, event>
public ResultCode QuerySystemEvent(ServiceCtx context)
{
if (context.Process.HandleTable.GenerateHandle(_updateEvent.ReadableEvent, out int handle) != KernelResult.Success)
{
throw new InvalidOperationException("Out of handles!");
}
context.Response.HandleDesc = IpcHandleDesc.MakeCopy(handle);
return ResultCode.Success;
}
private AdpcmDecoderContext GetAdpcmDecoderContext(long position, long size)
{
if (size == 0)
{
return null;
}
AdpcmDecoderContext context = new AdpcmDecoderContext();
context.Coefficients = new short[size >> 1];
for (int offset = 0; offset < size; offset += 2)
{
context.Coefficients[offset >> 1] = _memory.ReadInt16(position + offset);
}
return context;
}
private void UpdateAudio()
{
long[] released = _audioOut.GetReleasedBuffers(_track, 2);
for (int index = 0; index < released.Length; index++)
{
AppendMixedBuffer(released[index]);
}
}
private void AppendMixedBuffer(long tag)
{
int[] mixBuffer = new int[MixBufferSamplesCount * AudioConsts.HostChannelsCount];
foreach (VoiceContext voice in _voices)
{
if (!voice.Playing || voice.CurrentWaveBuffer.Size == 0)
{
continue;
}
int outOffset = 0;
int pendingSamples = MixBufferSamplesCount;
float volume = voice.Volume;
while (pendingSamples > 0)
{
int[] samples = voice.GetBufferData(_memory, pendingSamples, out int returnedSamples);
if (returnedSamples == 0)
{
break;
}
pendingSamples -= returnedSamples;
for (int offset = 0; offset < samples.Length; offset++)
{
mixBuffer[outOffset++] += (int)(samples[offset] * voice.Volume);
}
}
}
_audioOut.AppendBuffer(_track, tag, GetFinalBuffer(mixBuffer));
}
private unsafe static short[] GetFinalBuffer(int[] buffer)
{
short[] output = new short[buffer.Length];
int offset = 0;
// Perform Saturation using SSE2 if supported
if (Sse2.IsSupported)
{
fixed (int* inptr = buffer)
fixed (short* outptr = output)
{
for (; offset + 32 <= buffer.Length; offset += 32)
{
// Unroll the loop a little to ensure the CPU pipeline
// is always full.
Vector128<int> block1A = Sse2.LoadVector128(inptr + offset + 0);
Vector128<int> block1B = Sse2.LoadVector128(inptr + offset + 4);
Vector128<int> block2A = Sse2.LoadVector128(inptr + offset + 8);
Vector128<int> block2B = Sse2.LoadVector128(inptr + offset + 12);
Vector128<int> block3A = Sse2.LoadVector128(inptr + offset + 16);
Vector128<int> block3B = Sse2.LoadVector128(inptr + offset + 20);
Vector128<int> block4A = Sse2.LoadVector128(inptr + offset + 24);
Vector128<int> block4B = Sse2.LoadVector128(inptr + offset + 28);
Vector128<short> output1 = Sse2.PackSignedSaturate(block1A, block1B);
Vector128<short> output2 = Sse2.PackSignedSaturate(block2A, block2B);
Vector128<short> output3 = Sse2.PackSignedSaturate(block3A, block3B);
Vector128<short> output4 = Sse2.PackSignedSaturate(block4A, block4B);
Sse2.Store(outptr + offset + 0, output1);
Sse2.Store(outptr + offset + 8, output2);
Sse2.Store(outptr + offset + 16, output3);
Sse2.Store(outptr + offset + 24, output4);
}
}
}
// Process left overs
for (; offset < buffer.Length; offset++)
{
output[offset] = DspUtils.Saturate(buffer[offset]);
}
return output;
}
public void Dispose()
{
Dispose(true);
}
protected virtual void Dispose(bool disposing)
{
if (disposing)
{
_audioOut.CloseTrack(_track);
}
}
}
}