Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975)

* Implement Jump Table for Native Calls

NOTE: this slows down rejit considerably! Not recommended to be used
without codegen optimisation or AOT.

- Does not work on Linux
- A32 needs an additional commit.

* A32 Support

(WIP)

* Actually write Direct Call pointers to the table

That would help.

* Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame.

A return to the translator can still happen, but only by exceptionally
bubbling up to it.

Also:
- Always translate lowCq as a function. Faster interop with the direct
jumps, and this will be useful in future if we want to do speculative
translation.
- Tail Call Detection: after the decoding stage, detect if we do a tail
call, and avoid translating into it. Detected if a jump is made to an
address outwith the contiguous sequence of blocks surrounding the entry
point. The goal is to reduce code touched by jit and rejit.

* A32 Support

* Use smaller max function size for lowCq, fix exceptional returns

When a return has an unexpected value and there is no code block
following this one, we now return the value rather than continuing.

* CompareAndSwap (buggy)

* Ensure CompareAndSwap does not get optimized away.

* Use CompareAndSwap to make the dynamic table thread safe.

* Tail call for linux, throw on too many arguments.

* Combine CompareAndSwap 128 and 32/64.

They emit different IR instructions since their PreAllocator behaviour
is different, but now they just have one function on EmitterContext.

* Fix issues separating from optimisations.

* Use a stub to find and execute missing functions.

This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly.

For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found.

* Make Jump Tables and Jit Cache dynamically resize

Reserve virtual memory, commit as needed.

* Move TailCallRemover to its own class.

* Multithreaded Translation (based on heuristic)

A poor one, at that. Need to get core count for a better one, which
means a lot of OS specific garbage.

* Better priority management for background threads.

* Bound core limit a bit more

Past a certain point the load is not parallelizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core until optimisations come to improve the situation.

* Fix memory management on linux.

* Temporary solution to some sync problems.

This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread)

* Address feedback minus CompareAndSwap change.

* Use default ReservedRegion granularity.

* Merge CompareAndSwap with its V128 variant.

* We already got the source, no need to do it again.

* Make sure all background translation threads exit.

* Fix CompareAndSwap128

Detection criteria was a bit scuffed.

* Address Comments.
This commit is contained in:
riperiperi 2020-03-12 03:20:55 +00:00 committed by GitHub
parent c26f3774bd
commit d904706fc0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 1094 additions and 136 deletions

View File

@ -136,7 +136,9 @@ namespace ARMeilleure.CodeGen.Optimizations
private static bool HasSideEffects(Node node)
{
return (node is Operation operation) && operation.Instruction == Instruction.Call;
return (node is Operation operation) && (operation.Instruction == Instruction.Call
|| operation.Instruction == Instruction.Tailcall
|| operation.Instruction == Instruction.CompareAndSwap);
}
private static bool IsPropagableCopy(Operation operation)

View File

@ -90,6 +90,7 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Cmpps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fc2, InstructionFlags.Vex));
Add(X86Instruction.Cmpsd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fc2, InstructionFlags.Vex | InstructionFlags.PrefixF2));
Add(X86Instruction.Cmpss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fc2, InstructionFlags.Vex | InstructionFlags.PrefixF3));
Add(X86Instruction.Cmpxchg, new InstructionInfo(0x00000fb1, BadOp, BadOp, BadOp, BadOp, InstructionFlags.None));
Add(X86Instruction.Cmpxchg16b, new InstructionInfo(0x01000fc7, BadOp, BadOp, BadOp, BadOp, InstructionFlags.RexW));
Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex));
@ -117,6 +118,7 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Imul, new InstructionInfo(BadOp, 0x0000006b, 0x00000069, BadOp, 0x00000faf, InstructionFlags.None));
Add(X86Instruction.Imul128, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x050000f7, InstructionFlags.None));
Add(X86Instruction.Insertps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a21, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Jmp, new InstructionInfo(0x040000ff, BadOp, BadOp, BadOp, BadOp, InstructionFlags.None));
Add(X86Instruction.Lea, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x0000008d, InstructionFlags.None));
Add(X86Instruction.Maxpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5f, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Maxps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5f, InstructionFlags.Vex));
@ -328,6 +330,13 @@ namespace ARMeilleure.CodeGen.X86
WriteByte(0x99);
}
/// <summary>
/// Emits a locked compare-and-exchange: writes the lock prefix byte, then encodes
/// <see cref="X86Instruction.Cmpxchg"/> with <paramref name="memOp"/> as the destination.
/// </summary>
/// <param name="memOp">Memory operand the instruction operates on.</param>
/// <param name="src">Source operand; its type is also used as the operation type for the encoding.</param>
public void Cmpxchg(MemoryOperand memOp, Operand src)
{
// The prefix byte must precede the instruction bytes, so it is written first.
WriteByte(LockPrefix);
WriteInstruction(memOp, src, src.Type, X86Instruction.Cmpxchg);
}
public void Cmpxchg16b(MemoryOperand memOp)
{
WriteByte(LockPrefix);
@ -480,6 +489,11 @@ namespace ARMeilleure.CodeGen.X86
}
}
/// <summary>
/// Emits an indirect jump through <paramref name="dest"/> using the
/// <see cref="X86Instruction.Jmp"/> encoding.
/// </summary>
/// <param name="dest">Operand holding the jump target.</param>
public void Jmp(Operand dest)
{
// There is no second operand and no operand type for this instruction.
WriteInstruction(dest, null, OperandType.None, X86Instruction.Jmp);
}
public void Lea(Operand dest, Operand source, OperandType type)
{
WriteInstruction(dest, source, type, X86Instruction.Lea);

View File

@ -34,7 +34,7 @@ namespace ARMeilleure.CodeGen.X86
Add(Instruction.ByteSwap, GenerateByteSwap);
Add(Instruction.Call, GenerateCall);
Add(Instruction.Clobber, GenerateClobber);
Add(Instruction.CompareAndSwap128, GenerateCompareAndSwap128);
Add(Instruction.CompareAndSwap, GenerateCompareAndSwap);
Add(Instruction.CompareEqual, GenerateCompareEqual);
Add(Instruction.CompareGreater, GenerateCompareGreater);
Add(Instruction.CompareGreaterOrEqual, GenerateCompareGreaterOrEqual);
@ -76,6 +76,7 @@ namespace ARMeilleure.CodeGen.X86
Add(Instruction.Store16, GenerateStore16);
Add(Instruction.Store8, GenerateStore8);
Add(Instruction.Subtract, GenerateSubtract);
Add(Instruction.Tailcall, GenerateTailcall);
Add(Instruction.VectorCreateScalar, GenerateVectorCreateScalar);
Add(Instruction.VectorExtract, GenerateVectorExtract);
Add(Instruction.VectorExtract16, GenerateVectorExtract16);
@ -543,14 +544,28 @@ namespace ARMeilleure.CodeGen.X86
// register allocator, we don't need to produce any code.
}
private static void GenerateCompareAndSwap128(CodeGenContext context, Operation operation)
private static void GenerateCompareAndSwap(CodeGenContext context, Operation operation)
{
Operand source = operation.GetSource(0);
Operand src1 = operation.GetSource(0);
MemoryOperand memOp = new MemoryOperand(OperandType.I64, source);
if (operation.SourcesCount == 5) // CompareAndSwap128 has 5 sources, compared to CompareAndSwap64/32's 3.
{
MemoryOperand memOp = new MemoryOperand(OperandType.I64, src1);
context.Assembler.Cmpxchg16b(memOp);
}
else
{
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
EnsureSameType(src2, src3);
MemoryOperand memOp = new MemoryOperand(src3.Type, src1);
context.Assembler.Cmpxchg(memOp, src3);
}
}
private static void GenerateCompareEqual(CodeGenContext context, Operation operation)
{
@ -1083,6 +1098,13 @@ namespace ARMeilleure.CodeGen.X86
}
}
/// <summary>
/// Generates code for a tail call: writes the function epilogue and then jumps to the
/// target held in source 0 of <paramref name="operation"/>.
/// </summary>
/// <param name="context">Code generation context providing the assembler.</param>
/// <param name="operation">Tail call operation; source 0 is the jump target.</param>
private static void GenerateTailcall(CodeGenContext context, Operation operation)
{
// The epilogue is emitted before the jump, so the jump is the last instruction executed here.
WriteEpilogue(context);
context.Assembler.Jmp(operation.GetSource(0));
}
private static void GenerateVectorCreateScalar(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;

View File

@ -1,6 +1,7 @@
using ARMeilleure.CodeGen.RegisterAllocators;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Collections.Generic;
using System.Diagnostics;
@ -101,6 +102,17 @@ namespace ARMeilleure.CodeGen.X86
}
break;
case Instruction.Tailcall:
if (callConv == CallConvName.Windows)
{
HandleTailcallWindowsAbi(block.Operations, stackAlloc, node, operation);
}
else
{
HandleTailcallSystemVAbi(block.Operations, stackAlloc, node, operation);
}
break;
case Instruction.VectorInsert8:
if (!HardwareCapabilities.SupportsSse41)
{
@ -199,7 +211,11 @@ namespace ARMeilleure.CodeGen.X86
switch (operation.Instruction)
{
case Instruction.CompareAndSwap128:
case Instruction.CompareAndSwap:
{
OperandType type = operation.GetSource(1).Type;
if (type == OperandType.V128)
{
// Handle the many restrictions of the compare and exchange (16 bytes) instruction:
// - The expected value should be in RDX:RAX.
@ -225,6 +241,25 @@ namespace ARMeilleure.CodeGen.X86
operation.SetDestinations(new Operand[] { rdx, rax });
operation.SetSources(new Operand[] { operation.GetSource(0), rdx, rax, rcx, rbx });
}
else
{
// Handle the many restrictions of the compare and exchange (32/64) instruction:
// - The expected value should be in (E/R)AX.
// - The value at the memory location is loaded to (E/R)AX.
Operand expected = operation.GetSource(1);
Operand rax = Gpr(X86Register.Rax, expected.Type);
nodes.AddBefore(node, new Operation(Instruction.Copy, rax, expected));
operation.SetSources(new Operand[] { operation.GetSource(0), rax, operation.GetSource(2) });
node = nodes.AddAfter(node, new Operation(Instruction.Copy, dest, rax));
operation.Destination = rax;
}
break;
}
@ -829,6 +864,123 @@ namespace ARMeilleure.CodeGen.X86
return node;
}
/// <summary>
/// Lowers a tail call for the System V ABI: copies every argument into its argument
/// register, moves the target address into the integer return register, and rewrites
/// the operation's sources to reference those fixed registers.
/// </summary>
/// <param name="nodes">Operation list of the current block; copies are inserted before <paramref name="node"/>.</param>
/// <param name="stackAlloc">Stack allocator (unused here; arguments are never spilled for tail calls).</param>
/// <param name="node">List node of the tail call operation.</param>
/// <param name="operation">The tail call operation; source 0 is the target, the remaining sources are arguments.</param>
/// <exception cref="NotImplementedException">
/// Thrown when an argument does not fit in the remaining argument registers.
/// </exception>
private static void HandleTailcallSystemVAbi(IntrusiveList<Node> nodes, StackAllocator stackAlloc, Node node, Operation operation)
{
    List<Operand> sources = new List<Operand>();

    sources.Add(operation.GetSource(0));

    int argsCount = operation.SourcesCount - 1;

    int intMax = CallingConvention.GetIntArgumentsOnRegsCount();
    int vecMax = CallingConvention.GetVecArgumentsOnRegsCount();

    int intCount = 0;
    int vecCount = 0;

    // Handle arguments passed on registers.
    for (int index = 0; index < argsCount; index++)
    {
        Operand source = operation.GetSource(1 + index);

        // The availability check must match the register class that is actually consumed below.
        bool passOnReg;

        if (source.Type.IsInteger())
        {
            // A scalar integer needs a single GPR.
            passOnReg = intCount < intMax;
        }
        else if (source.Type == OperandType.V128)
        {
            // A V128 is split over two GPRs, so two must still be free.
            passOnReg = intCount + 1 < intMax;
        }
        else
        {
            passOnReg = vecCount < vecMax;
        }

        if (!passOnReg)
        {
            throw new NotImplementedException("Spilling is not currently supported for tail calls. (too many arguments)");
        }

        if (source.Type == OperandType.V128)
        {
            // V128 is a struct, we pass each half on a GPR if possible.
            // NOTE(review): unlike the scalar path, these registers are not appended to the
            // operation's sources - confirm that this is intended.
            Operand argReg = Gpr(CallingConvention.GetIntArgumentRegister(intCount++), OperandType.I64);
            Operand argReg2 = Gpr(CallingConvention.GetIntArgumentRegister(intCount++), OperandType.I64);

            nodes.AddBefore(node, new Operation(Instruction.VectorExtract, argReg, source, Const(0)));
            nodes.AddBefore(node, new Operation(Instruction.VectorExtract, argReg2, source, Const(1)));

            continue;
        }

        Operand scalarArgReg = source.Type.IsInteger()
            ? Gpr(CallingConvention.GetIntArgumentRegister(intCount++), source.Type)
            : Xmm(CallingConvention.GetVecArgumentRegister(vecCount++), source.Type);

        Operation copyOp = new Operation(Instruction.Copy, scalarArgReg, source);

        HandleConstantCopy(nodes, nodes.AddBefore(node, copyOp), copyOp);

        sources.Add(scalarArgReg);
    }

    // The target address must be on the return registers, since we
    // don't return anything and it is guaranteed to not be a
    // callee saved register (which would be trashed on the epilogue).
    Operand retReg = Gpr(CallingConvention.GetIntReturnRegister(), OperandType.I64);

    Operation addrCopyOp = new Operation(Instruction.Copy, retReg, operation.GetSource(0));

    nodes.AddBefore(node, addrCopyOp);

    sources[0] = retReg;

    operation.SetSources(sources.ToArray());
}
/// <summary>
/// Lowers a tail call for the Windows ABI: each argument is copied into the argument
/// register matching its position, the target address is copied into the integer return
/// register, and the operation's sources are replaced with those registers.
/// </summary>
/// <param name="nodes">Operation list of the current block; copies are inserted before <paramref name="node"/>.</param>
/// <param name="stackAlloc">Stack allocator (not used by this method).</param>
/// <param name="node">List node of the tail call operation.</param>
/// <param name="operation">The tail call operation; source 0 is the target, the rest are arguments.</param>
/// <exception cref="NotImplementedException">Thrown when there are more arguments than argument registers.</exception>
private static void HandleTailcallWindowsAbi(IntrusiveList<Node> nodes, StackAllocator stackAlloc, Node node, Operation operation)
{
    int argsCount = operation.SourcesCount - 1;

    if (argsCount > CallingConvention.GetArgumentsOnRegsCount())
    {
        throw new NotImplementedException("Spilling is not currently supported for tail calls. (too many arguments)");
    }

    Operand[] newSources = new Operand[1 + argsCount];

    // Slot 0 is reserved for the target; arguments occupy slots 1..argsCount.
    for (int slot = 1; slot <= argsCount; slot++)
    {
        int regIndex = slot - 1;

        Operand argument = operation.GetSource(slot);

        Operand targetReg;

        if (argument.Type.IsInteger())
        {
            targetReg = Gpr(CallingConvention.GetIntArgumentRegister(regIndex), argument.Type);
        }
        else
        {
            targetReg = Xmm(CallingConvention.GetVecArgumentRegister(regIndex), argument.Type);
        }

        Operation argCopy = new Operation(Instruction.Copy, targetReg, argument);

        var argCopyNode = nodes.AddBefore(node, argCopy);

        HandleConstantCopy(nodes, argCopyNode, argCopy);

        newSources[slot] = targetReg;
    }

    // The target address goes on the integer return register: nothing is returned through it,
    // and it is not callee saved, so the epilogue will not overwrite it.
    Operand targetAddrReg = Gpr(CallingConvention.GetIntReturnRegister(), OperandType.I64);

    nodes.AddBefore(node, new Operation(Instruction.Copy, targetAddrReg, operation.GetSource(0)));

    newSources[0] = targetAddrReg;

    operation.SetSources(newSources);
}
private static void HandleLoadArgumentWindowsAbi(
CompilerContext cctx,
IntrusiveList<Node> nodes,

View File

@ -23,6 +23,7 @@ namespace ARMeilleure.CodeGen.X86
Cmpps,
Cmpsd,
Cmpss,
Cmpxchg,
Cmpxchg16b,
Comisd,
Comiss,
@ -50,6 +51,7 @@ namespace ARMeilleure.CodeGen.X86
Imul,
Imul128,
Insertps,
Jmp,
Lea,
Maxpd,
Maxps,

View File

@ -11,6 +11,8 @@ namespace ARMeilleure.Decoders
public Block Next { get; set; }
public Block Branch { get; set; }
public bool TailCall { get; set; }
public List<OpCode> OpCodes { get; private set; }
public Block()

View File

@ -1,3 +1,4 @@
using ARMeilleure.Decoders.Optimizations;
using ARMeilleure.Instructions;
using ARMeilleure.Memory;
using ARMeilleure.State;
@ -15,6 +16,9 @@ namespace ARMeilleure.Decoders
// take too long to compile and use too much memory.
private const int MaxInstsPerFunction = 5000;
// For lower code quality translation, we set a lower limit since we're blocking execution.
private const int MaxInstsPerFunctionLowCq = 500;
private delegate object MakeOp(InstDescriptor inst, ulong address, int opCode);
private static ConcurrentDictionary<Type, MakeOp> _opActivators;
@ -33,7 +37,7 @@ namespace ARMeilleure.Decoders
return new Block[] { block };
}
public static Block[] DecodeFunction(MemoryManager memory, ulong address, ExecutionMode mode)
public static Block[] DecodeFunction(MemoryManager memory, ulong address, ExecutionMode mode, bool highCq)
{
List<Block> blocks = new List<Block>();
@ -43,11 +47,13 @@ namespace ARMeilleure.Decoders
int opsCount = 0;
int instructionLimit = highCq ? MaxInstsPerFunction : MaxInstsPerFunctionLowCq;
Block GetBlock(ulong blkAddress)
{
if (!visited.TryGetValue(blkAddress, out Block block))
{
if (opsCount > MaxInstsPerFunction || !memory.IsMapped((long)blkAddress))
if (opsCount > instructionLimit || !memory.IsMapped((long)blkAddress))
{
return null;
}
@ -121,7 +127,7 @@ namespace ARMeilleure.Decoders
currBlock.Branch = GetBlock((ulong)op.Immediate);
}
if (!IsUnconditionalBranch(lastOp) /*|| isCall*/)
if (!IsUnconditionalBranch(lastOp) || isCall)
{
currBlock.Next = GetBlock(currBlock.EndAddress);
}
@ -140,10 +146,12 @@ namespace ARMeilleure.Decoders
}
}
TailCallRemover.RunPass(address, blocks);
return blocks.ToArray();
}
private static bool BinarySearch(List<Block> blocks, ulong address, out int index)
public static bool BinarySearch(List<Block> blocks, ulong address, out int index)
{
index = 0;

View File

@ -0,0 +1,75 @@
using ARMeilleure.Decoders;
using System;
using System.Collections.Generic;
namespace ARMeilleure.Decoders.Optimizations
{
    /// <summary>
    /// Decoder pass that trims a decoded function down to the contiguous run of blocks
    /// around its entry point and flags branches leaving that run as tail calls.
    /// </summary>
    static class TailCallRemover
    {
        /// <summary>
        /// Finds the contiguous range of blocks surrounding <paramref name="entryAddress"/>
        /// (tolerating small gaps), marks every in-range block whose branch target lies outside
        /// the range as a tail call, and removes all out-of-range blocks from the list.
        /// </summary>
        /// <param name="entryAddress">Guest address of the function entry point.</param>
        /// <param name="blocks">Decoded blocks; the list is modified in place.</param>
        /// <exception cref="InvalidOperationException">No block contains the entry address.</exception>
        public static void RunPass(ulong entryAddress, List<Block> blocks)
        {
            if (!Decoder.BinarySearch(blocks, entryAddress, out int entryIndex))
            {
                throw new InvalidOperationException("Function entry point is not contained in a block.");
            }

            // Gaps of up to this many bytes between neighbouring blocks still count as contiguous.
            const ulong allowance = 4;

            // Extend the range forwards while the next block starts close enough to the current end.
            int lastIndex = entryIndex;

            while (lastIndex + 1 < blocks.Count)
            {
                Block following = blocks[lastIndex + 1];

                if (blocks[lastIndex].EndAddress < following.Address - allowance)
                {
                    break;
                }

                lastIndex++;
            }

            // Extend the range backwards while the previous block ends close enough to the current start.
            int firstIndex = entryIndex;

            while (firstIndex > 0)
            {
                Block preceding = blocks[firstIndex - 1];

                if (blocks[firstIndex].Address > preceding.EndAddress + allowance)
                {
                    break;
                }

                firstIndex--;
            }

            bool coversEverything = firstIndex == 0 && lastIndex == blocks.Count - 1;

            if (coversEverything)
            {
                return;
            }

            ulong rangeStart = blocks[firstIndex].Address;
            ulong rangeEnd = blocks[lastIndex].EndAddress;

            // A branch whose target block does not touch the range becomes a tail call.
            for (int index = firstIndex; index <= lastIndex; index++)
            {
                Block current = blocks[index];
                Block target = current.Branch;

                if (target == null)
                {
                    continue;
                }

                bool touchesRange = target.Address <= rangeEnd && target.EndAddress >= rangeStart;

                if (!touchesRange)
                {
                    current.Branch = null;
                    current.TailCall = true;
                }
            }

            // Drop the trailing blocks first so the leading indices remain valid.
            int trailingCount = blocks.Count - (lastIndex + 1);

            blocks.RemoveRange(lastIndex + 1, trailingCount);
            blocks.RemoveRange(0, firstIndex);
        }
    }
}

View File

@ -3,6 +3,8 @@ using System;
namespace ARMeilleure.Instructions
{
delegate bool _Bool();
delegate double _F64_F64(double a1);
delegate double _F64_F64_Bool(double a1, bool a2);
delegate double _F64_F64_F64(double a1, double a2);

View File

@ -116,12 +116,14 @@ namespace ARMeilleure.Instructions
{
Debug.Assert(value.Type == OperandType.I32);
context.StoreToContext();
if (IsThumb(context.CurrOp))
{
// Make this count as a call, the translator will ignore the low bit for the address.
context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(value, Const(1))));
context.StoreToContext();
bool isReturn = IsA32Return(context);
Operand addr = context.BitwiseOr(value, Const(1));
InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn);
}
else
{
@ -138,18 +140,8 @@ namespace ARMeilleure.Instructions
if (setFlags)
{
// TODO: Load SPSR etc.
Operand isThumb = GetFlag(PState.TFlag);
Operand lblThumb = Label();
context.BranchIfTrue(lblThumb, isThumb);
// Make this count as a call, the translator will ignore the low bit for the address.
context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(context.BitwiseAnd(value, Const(~3)), Const(1))));
context.MarkLabel(lblThumb);
context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(value, Const(1))));
EmitBxWritePc(context, value);
}
else
{

View File

@ -2,6 +2,7 @@ using ARMeilleure.Decoders;
using ARMeilleure.Translation;
using System;
using static ARMeilleure.Instructions.InstEmitFlowHelper;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions
@ -30,7 +31,7 @@ namespace ARMeilleure.Instructions
if (context.CurrBlock.Next == null)
{
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
}
}
@ -48,7 +49,7 @@ namespace ARMeilleure.Instructions
if (context.CurrBlock.Next == null)
{
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
}
}
}

View File

@ -1,6 +1,7 @@
using ARMeilleure.Decoders;
using ARMeilleure.Translation;
using static ARMeilleure.Instructions.InstEmitFlowHelper;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Instructions
@ -29,7 +30,7 @@ namespace ARMeilleure.Instructions
if (context.CurrBlock.Next == null)
{
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
}
}
}

View File

@ -21,7 +21,7 @@ namespace ARMeilleure.Instructions
}
else
{
context.Return(Const(op.Immediate));
EmitTailContinue(context, Const(op.Immediate), context.CurrBlock.TailCall);
}
}
@ -56,7 +56,7 @@ namespace ARMeilleure.Instructions
{
OpCodeBReg op = (OpCodeBReg)context.CurrOp;
EmitVirtualJump(context, GetIntOrZR(context, op.Rn));
EmitVirtualJump(context, GetIntOrZR(context, op.Rn), op.Rn == RegisterAlias.Lr);
}
public static void Cbnz(ArmEmitterContext context) => EmitCb(context, onNotZero: true);
@ -71,7 +71,7 @@ namespace ARMeilleure.Instructions
public static void Ret(ArmEmitterContext context)
{
context.Return(context.BitwiseOr(GetIntOrZR(context, RegisterAlias.Lr), Const(CallFlag)));
context.Return(GetIntOrZR(context, RegisterAlias.Lr));
}
public static void Tbnz(ArmEmitterContext context) => EmitTb(context, onNotZero: true);
@ -96,7 +96,7 @@ namespace ARMeilleure.Instructions
if (context.CurrBlock.Next == null)
{
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
}
}
else
@ -105,11 +105,11 @@ namespace ARMeilleure.Instructions
EmitCondBranch(context, lblTaken, cond);
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
context.MarkLabel(lblTaken);
context.Return(Const(op.Immediate));
EmitTailContinue(context, Const(op.Immediate));
}
}
@ -132,7 +132,7 @@ namespace ARMeilleure.Instructions
if (context.CurrBlock.Next == null)
{
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
}
}
else
@ -148,11 +148,11 @@ namespace ARMeilleure.Instructions
context.BranchIfFalse(lblTaken, value);
}
context.Return(Const(op.Address + 4));
EmitTailContinue(context, Const(op.Address + 4));
context.MarkLabel(lblTaken);
context.Return(Const(op.Immediate));
EmitTailContinue(context, Const(op.Immediate));
}
}
}

View File

@ -21,8 +21,7 @@ namespace ARMeilleure.Instructions
}
else
{
context.StoreToContext();
context.Return(Const(op.Immediate));
EmitTailContinue(context, Const(op.Immediate));
}
}
@ -57,7 +56,7 @@ namespace ARMeilleure.Instructions
SetFlag(context, PState.TFlag, Const(isThumb ? 0 : 1));
}
InstEmitFlowHelper.EmitCall(context, (ulong)op.Immediate);
EmitCall(context, (ulong)op.Immediate);
}
public static void Blxr(ArmEmitterContext context)
@ -66,9 +65,8 @@ namespace ARMeilleure.Instructions
uint pc = op.GetPc();
Operand addr = GetIntA32(context, op.Rm);
Operand addr = context.Copy(GetIntA32(context, op.Rm));
Operand bitOne = context.BitwiseAnd(addr, Const(1));
addr = context.BitwiseOr(addr, Const((int)CallFlag)); // Set call flag.
bool isThumb = IsThumb(context.CurrOp);
@ -80,16 +78,14 @@ namespace ARMeilleure.Instructions
SetFlag(context, PState.TFlag, bitOne);
context.Return(addr); // Call.
EmitVirtualCall(context, addr);
}
public static void Bx(ArmEmitterContext context)
{
IOpCode32BReg op = (IOpCode32BReg)context.CurrOp;
context.StoreToContext();
EmitBxWritePc(context, GetIntA32(context, op.Rm));
EmitBxWritePc(context, GetIntA32(context, op.Rm), op.Rm);
}
}
}

View File

@ -2,6 +2,7 @@ using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.State;
using ARMeilleure.Translation;
using System;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
@ -142,7 +143,29 @@ namespace ARMeilleure.Instructions
public static void EmitCall(ArmEmitterContext context, ulong immediate)
{
context.Return(Const(immediate | CallFlag));
EmitJumpTableBranch(context, Const(immediate));
}
/// <summary>
/// Emits a transfer to the host function at <paramref name="funcAddr"/>, passing
/// <paramref name="nativeContextPtr"/> as its only argument. State is stored to the
/// context first. A jump is emitted as a tail call; a call reloads state afterwards and
/// checks the returned address against the next instruction.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="nativeContextPtr">Operand holding the native context pointer.</param>
/// <param name="funcAddr">Operand holding the host function address.</param>
/// <param name="isJump">True to emit a tail call instead of a call.</param>
private static void EmitNativeCall(ArmEmitterContext context, Operand nativeContextPtr, Operand funcAddr, bool isJump = false)
{
    context.StoreToContext();

    if (isJump)
    {
        context.Tailcall(funcAddr, nativeContextPtr);

        return;
    }

    Operand calleeResult = context.Call(funcAddr, OperandType.I64, nativeContextPtr);

    context.LoadFromContext();

    EmitContinueOrReturnCheck(context, calleeResult);
}
/// <summary>
/// Convenience overload: loads argument 0 of the current function as an I64 and passes it
/// as the native context pointer to the main <c>EmitNativeCall</c> overload.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="funcAddr">Operand holding the host function address.</param>
/// <param name="isJump">True to emit a tail call instead of a call.</param>
private static void EmitNativeCall(ArmEmitterContext context, Operand funcAddr, bool isJump = false)
{
EmitNativeCall(context, context.LoadArgument(OperandType.I64, 0), funcAddr, isJump);
}
public static void EmitVirtualCall(ArmEmitterContext context, Operand target)
@ -150,37 +173,45 @@ namespace ARMeilleure.Instructions
EmitVirtualCallOrJump(context, target, isJump: false);
}
public static void EmitVirtualJump(ArmEmitterContext context, Operand target)
public static void EmitVirtualJump(ArmEmitterContext context, Operand target, bool isReturn)
{
EmitVirtualCallOrJump(context, target, isJump: true);
EmitVirtualCallOrJump(context, target, isJump: true, isReturn: isReturn);
}
private static void EmitVirtualCallOrJump(ArmEmitterContext context, Operand target, bool isJump)
private static void EmitVirtualCallOrJump(ArmEmitterContext context, Operand target, bool isJump, bool isReturn = false)
{
context.Return(context.BitwiseOr(target, Const(target.Type, (long)CallFlag)));
}
private static void EmitContinueOrReturnCheck(ArmEmitterContext context, Operand retVal)
if (isReturn)
{
// Note: The return value of the called method will be placed
// at the Stack, the return value is always a Int64 with the
// return address of the function. We check if the address is
// correct, if it isn't we keep returning until we reach the dispatcher.
ulong nextAddr = GetNextOpAddress(context.CurrOp);
if (context.CurrBlock.Next != null)
{
Operand lblContinue = Label();
context.BranchIfTrue(lblContinue, context.ICompareEqual(retVal, Const(nextAddr)));
context.Return(Const(nextAddr));
context.MarkLabel(lblContinue);
context.Return(target);
}
else
{
context.Return(Const(nextAddr));
EmitJumpTableBranch(context, target, isJump);
}
}
/// <summary>
/// Emits the check that follows a call. If the address returned by the callee (with the
/// call flag masked off) equals the address of the next instruction, execution continues
/// here; otherwise the returned address is returned to this function's caller.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="returnAddress">Int64 operand holding the address the callee returned.</param>
private static void EmitContinueOrReturnCheck(ArmEmitterContext context, Operand returnAddress)
{
    Operand expectedAddr = Const(GetNextOpAddress(context.CurrOp));

    Operand lblResume = Label();

    // The call flag has to be cleared from the returned address before it can be compared.
    Operand returnedNoFlag = context.BitwiseAnd(returnAddress, Const(~CallFlag));
    Operand isExpected = context.ICompareEqual(returnedNoFlag, expectedAddr);

    context.BranchIfTrue(lblResume, isExpected);

    // Unexpected address: hand the original value back up so the caller can deal with it.
    context.Return(returnAddress);

    context.MarkLabel(lblResume);

    if (context.CurrBlock.Next != null)
    {
        // Code for the next instruction follows directly; nothing more to emit.
        return;
    }

    // Nothing follows this instruction in the function, so look up the next block and jump to it.
    EmitTailContinue(context, expectedAddr);
}
@ -188,5 +219,134 @@ namespace ARMeilleure.Instructions
{
return op.Address + (ulong)op.OpCodeSizeInBytes;
}
/// <summary>
/// Emits code that continues execution at <paramref name="address"/> by asking
/// <c>NativeInterface.GetFunctionAddress</c> for the host function and tail calling it.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="address">Operand holding the guest address to continue at.</param>
/// <param name="allowRejit">When true, bit 0 of the address is set before the lookup.</param>
public static void EmitTailContinue(ArmEmitterContext context, Operand address, bool allowRejit = false)
{
    // Kept as a switch in case returning to managed code (rather than tail calling) is needed later, e.g. for debugging.
    bool useTailContinue = true;

    if (!useTailContinue)
    {
        context.Return(address);

        return;
    }

    Operand lookupAddress = address;

    if (allowRejit)
    {
        lookupAddress = context.BitwiseOr(lookupAddress, Const(1L));
    }

    Operand hostFunction = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), lookupAddress);

    EmitNativeCall(context, hostFunction, true);
}
/// <summary>
/// Stores <paramref name="guestAddress"/> into the native context at the call address offset,
/// then emits a native call or jump to <paramref name="funcAddr"/>.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="funcAddr">Operand holding the host function address.</param>
/// <param name="guestAddress">Value written to the call address slot of the native context.</param>
/// <param name="isJump">True to emit a tail call instead of a call.</param>
private static void EmitNativeCallWithGuestAddress(ArmEmitterContext context, Operand funcAddr, Operand guestAddress, bool isJump)
{
// Argument 0 of the current function is used as the native context pointer.
Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0);
context.Store(context.Add(nativeContextPtr, Const(NativeContext.GetCallAddressOffset())), guestAddress);
EmitNativeCall(context, nativeContextPtr, funcAddr, isJump);
}
/// <summary>
/// Emits the slow path for a branch or call: sets the call flag on <paramref name="address"/>,
/// calls <c>NativeInterface.GetFunctionAddress</c> with it, and emits a native call or jump
/// to the address that call returns.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="address">Operand holding the guest target address.</param>
/// <param name="isJump">True to emit a tail call instead of a call.</param>
private static void EmitBranchFallback(ArmEmitterContext context, Operand address, bool isJump)
{
address = context.BitwiseOr(address, Const(address.Type, (long)CallFlag)); // Set call flag.
Operand fallbackAddr = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), address);
EmitNativeCall(context, fallbackAddr, isJump);
}
/// <summary>
/// Emits an unrolled probe over the dynamic table entries starting at
/// <paramref name="tableAddress"/>. Each entry is probed with a compare-and-swap on its first
/// 8 bytes (the guest address); on a match the host function pointer stored 8 bytes into the
/// entry is called or jumped to. If no entry matches, the branch fallback is emitted.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="tableAddress">Operand holding the address of the first table entry to probe.</param>
/// <param name="address">Operand holding the guest target address.</param>
/// <param name="isJump">True to emit tail calls instead of calls.</param>
public static void EmitDynamicTableCall(ArmEmitterContext context, Operand tableAddress, Operand address, bool isJump)
{
// Loop over elements of the dynamic table. Unrolled loop.
Operand endLabel = Label();
Operand fallbackLabel = Label();
// Emits the probe for the entry currently referenced by tableAddress; on a miss, control goes to entrySkipLabel.
Action<Operand> emitTableEntry = (Operand entrySkipLabel) =>
{
// Try to take this entry in the table if its guest address equals 0.
Operand gotResult = context.CompareAndSwap(tableAddress, Const(0L), address);
// Is the address ours? (either taken via CompareAndSwap (0), or what was already here)
context.BranchIfFalse(entrySkipLabel, context.BitwiseOr(context.ICompareEqual(gotResult, address), context.ICompareEqual(gotResult, Const(0L))));
// It's ours, so what function is it pointing to?
Operand targetFunctionPtr = context.Add(tableAddress, Const(8L));
Operand targetFunction = context.Load(OperandType.I64, targetFunctionPtr);
// Call the function.
// We pass in the entry address as the guest address, as the entry may need to be updated by the indirect call stub.
EmitNativeCallWithGuestAddress(context, targetFunction, tableAddress, isJump);
context.Branch(endLabel);
};
// Currently this uses a size of 1, as higher values inflate code size for no real benefit.
for (int i = 0; i < JumpTable.DynamicTableElems; i++)
{
if (i == JumpTable.DynamicTableElems - 1)
{
emitTableEntry(fallbackLabel); // If this is the last entry, avoid emitting the additional label and add.
}
else
{
Operand nextLabel = Label();
emitTableEntry(nextLabel);
context.MarkLabel(nextLabel);
tableAddress = context.Add(tableAddress, Const((long)JumpTable.JumpTableStride)); // Move to the next table entry.
}
}
// Reached when the last probed entry did not match.
context.MarkLabel(fallbackLabel);
EmitBranchFallback(context, address, isJump);
context.MarkLabel(endLabel);
}
/// <summary>
/// Emits a branch or call to <paramref name="address"/>, choosing between three strategies:
/// the translator fallback when not compiling in highCq mode, a dynamic table probe for
/// non-constant targets, and a direct jump table entry for constant targets.
/// </summary>
/// <param name="context">Emitter context.</param>
/// <param name="address">Operand holding the guest target; I32 operands are zero-extended to I64 first.</param>
/// <param name="isJump">True to emit a jump (tail call) instead of a call.</param>
public static void EmitJumpTableBranch(ArmEmitterContext context, Operand address, bool isJump = false)
{
    if (address.Type == OperandType.I32)
    {
        address = context.ZeroExtend32(OperandType.I64, address);
    }

    // TODO: Constant folding. Indirect calls are slower in the best case and emit more code so we want to avoid them when possible.

    if (!context.HighCq)
    {
        // In lowCq mode no indirect calls or jumps are emitted, which avoids spending
        // jump table and indirect table space. The translator is asked for the function address instead.
        EmitBranchFallback(context, address, isJump);

        return;
    }

    if (address.Kind != OperandKind.Constant)
    {
        // Virtual branch/call - the first addresses used are kept on a small table for fast lookup.
        int dynamicEntry = context.JumpTable.ReserveDynamicEntry(isJump);

        int dynamicOffset = dynamicEntry * JumpTable.JumpTableStride * JumpTable.DynamicTableElems;

        Operand dynamicEntryPtr = Const(context.JumpTable.DynamicPointer.ToInt64() + dynamicOffset);

        EmitDynamicTableCall(context, dynamicEntryPtr, address, isJump);

        return;
    }

    long guestTarget = (long)address.Value;

    int directEntry = context.JumpTable.ReserveTableEntry(context.BaseAddress & (~3L), guestTarget, isJump);

    // Skip the first 8 bytes of the entry to point directly at the host address.
    int directOffset = directEntry * JumpTable.JumpTableStride + 8;

    // TODO: Relocatable jump table ptr for AOT. Would prefer a solution to patch this constant into functions as they are loaded rather than calculate at runtime.
    Operand hostAddrPtr = Const(context.JumpTable.JumpPointer.ToInt64() + directOffset);

    Operand hostAddr = context.Load(OperandType.I64, hostAddrPtr);

    // Call the function directly. If it's not present yet, this will call the direct call stub.
    EmitNativeCallWithGuestAddress(context, hostAddr, address, isJump);
}
}
}

View File

@ -144,22 +144,34 @@ namespace ARMeilleure.Instructions
}
}
public static void EmitBxWritePc(ArmEmitterContext context, Operand pc)
/// <summary>
/// Heuristically decides whether the current A32 instruction, which writes the PC,
/// is a function return.
/// </summary>
/// <param name="context">Emitter context whose current opcode is inspected.</param>
/// <returns>True when the instruction looks like a return, false otherwise.</returns>
public static bool IsA32Return(ArmEmitterContext context)
{
    var current = context.CurrOp;

    // Setting PC using LDM is nearly always a return.
    if (current is IOpCode32MemMult)
    {
        return true;
    }

    // ALU forms: a return when the source register is LR.
    if (current is OpCode32AluRsImm rsImm)
    {
        return rsImm.Rm == RegisterAlias.Aarch32Lr;
    }

    if (current is OpCode32AluRsReg rsReg)
    {
        return rsReg.Rm == RegisterAlias.Aarch32Lr;
    }

    if (current is OpCode32AluReg reg)
    {
        return reg.Rm == RegisterAlias.Aarch32Lr;
    }

    // Setting PC to an address stored on the stack is nearly always a return.
    if (current is OpCode32Mem mem)
    {
        return mem.Rn == RegisterAlias.Aarch32Sp && mem.WBack && !mem.Index;
    }

    return false;
}
// Emits an interworking write to the PC. Bit 0 of pc is stored into the Thumb flag, and the
// target is handed to InstEmitFlowHelper.EmitVirtualJump. The jump is flagged as a return when
// sourceRegister is LR or when IsA32Return(context) reports a return-like instruction.
// NOTE(review): the BranchIfTrue / Return / MarkLabel statements between the computation of
// addr and the EmitVirtualJump call look like the superseded return-to-translator path;
// confirm whether they are still meant to be emitted alongside the virtual jump.
public static void EmitBxWritePc(ArmEmitterContext context, Operand pc, int sourceRegister = 0)
{
bool isReturn = sourceRegister == RegisterAlias.Aarch32Lr || IsA32Return(context);
// Bit 0 of the target selects the instruction set (stored in the T flag).
Operand mode = context.BitwiseAnd(pc, Const(1));
SetFlag(context, PState.TFlag, mode);
Operand lblArmMode = Label();
// When bit 0 is set the address is tagged with CallFlag, otherwise the low two bits are cleared.
Operand addr = context.ConditionalSelect(mode, context.BitwiseOr(pc, Const((int)InstEmitFlowHelper.CallFlag)), context.BitwiseAnd(pc, Const(~3)));
context.BranchIfTrue(lblArmMode, mode);
// Make this count as a call, the translator will ignore the low bit for the address.
context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(pc, Const((int)InstEmitFlowHelper.CallFlag))));
context.MarkLabel(lblArmMode);
context.Return(context.ZeroExtend32(OperandType.I64, context.BitwiseOr(context.BitwiseAnd(pc, Const(~3)), Const((int)InstEmitFlowHelper.CallFlag))));
InstEmitFlowHelper.EmitVirtualJump(context, addr, isReturn);
}
public static Operand GetIntOrZR(ArmEmitterContext context, int regIndex)

View File

@ -51,7 +51,7 @@ namespace ARMeilleure.Instructions
EmitReadInt(context, address, rt, size);
}
if (!isSimd)
if (!isSimd && !(context.CurrOp is OpCode32 && rt == State.RegisterAlias.Aarch32Pc))
{
Operand value = GetInt(context, rt);

View File

@ -1,6 +1,8 @@
using ARMeilleure.Memory;
using ARMeilleure.State;
using ARMeilleure.Translation;
using System;
using System.Runtime.InteropServices;
namespace ARMeilleure.Instructions
{
@ -12,15 +14,17 @@ namespace ARMeilleure.Instructions
{
public ExecutionContext Context { get; }
public MemoryManager Memory { get; }
public Translator Translator { get; }
public ulong ExclusiveAddress { get; set; }
public ulong ExclusiveValueLow { get; set; }
public ulong ExclusiveValueHigh { get; set; }
public ThreadContext(ExecutionContext context, MemoryManager memory)
/// <summary>
/// Creates the per-thread state used by the native interface.
/// </summary>
/// <param name="context">Guest execution context of the thread.</param>
/// <param name="memory">Memory manager used by the thread.</param>
/// <param name="translator">Translator used to look up or translate guest functions.</param>
public ThreadContext(ExecutionContext context, MemoryManager memory, Translator translator)
{
    // Same value that ClearExclusive() writes.
    ExclusiveAddress = ulong.MaxValue;

    Translator = translator;
    Memory = memory;
    Context = context;
}
@ -29,9 +33,9 @@ namespace ARMeilleure.Instructions
[ThreadStatic]
private static ThreadContext _context;
public static void RegisterThread(ExecutionContext context, MemoryManager memory)
// Creates the ThreadContext for the calling thread and stores it in the thread-static _context.
// NOTE(review): _context is assigned twice below; the two-argument constructor call looks like
// the superseded form of the statement — confirm only the three-argument call should remain.
public static void RegisterThread(ExecutionContext context, MemoryManager memory, Translator translator)
{
_context = new ThreadContext(context, memory);
_context = new ThreadContext(context, memory, translator);
}
public static void UnregisterThread()
@ -381,18 +385,39 @@ namespace ARMeilleure.Instructions
return address & ~((4UL << ErgSizeLog2) - 1);
}
/// <summary>
/// Looks up (translating if needed) the function at the given guest address, using the
/// calling thread's translator and current execution mode.
/// </summary>
/// <param name="address">Guest address of the function.</param>
/// <returns>Host pointer of the translated function.</returns>
public static ulong GetFunctionAddress(ulong address)
{
    Translator translator = _context.Translator;
    ExecutionMode mode = GetContext().ExecutionMode;

    TranslatedFunction translated = translator.GetOrTranslate(address, mode);
    IntPtr hostPointer = translated.GetPointer();

    return (ulong)hostPointer.ToInt64();
}
/// <summary>
/// Looks up (translating if needed) the function at the given guest address and, when the
/// result is highCq, stores its host pointer into the table entry at <paramref name="entryAddress"/>.
/// </summary>
/// <param name="address">Guest address of the function.</param>
/// <param name="entryAddress">Host address of the table entry; the host pointer is written at offset 8.</param>
/// <returns>Host pointer of the translated function.</returns>
public static ulong GetIndirectFunctionAddress(ulong address, ulong entryAddress)
{
    Translator translator = _context.Translator;
    ExecutionMode mode = GetContext().ExecutionMode;

    TranslatedFunction translated = translator.GetOrTranslate(address, mode);
    ulong hostPointer = (ulong)translated.GetPointer().ToInt64();

    if (!translated.HighCq)
    {
        // Leave the table entry untouched for lowCq functions.
        return hostPointer;
    }

    // Rewrite the host function address in the table to point to the highCq function.
    Marshal.WriteInt64((IntPtr)entryAddress, 8, (long)hostPointer);

    return hostPointer;
}
/// <summary>
/// Resets the calling thread's exclusive address to <see cref="ulong.MaxValue"/>.
/// </summary>
public static void ClearExclusive() => _context.ExclusiveAddress = ulong.MaxValue;
public static void CheckSynchronization()
// Runs the interrupt check of the calling thread's ExecutionContext while the statistics timer
// is paused, then returns that context's Running flag.
// NOTE(review): CheckInterrupt is invoked twice below (once through GetContext() and once through
// the local); the first call looks like the superseded statement — confirm.
public static bool CheckSynchronization()
{
Statistics.PauseTimer();
GetContext().CheckInterrupt();
ExecutionContext context = GetContext();
context.CheckInterrupt();
Statistics.ResumeTimer();
// Reports whether the context is still marked as running.
return context.Running;
}
public static ExecutionContext GetContext()

View File

@ -12,7 +12,7 @@ namespace ARMeilleure.IntermediateRepresentation
BranchIfTrue,
ByteSwap,
Call,
CompareAndSwap128,
CompareAndSwap,
CompareEqual,
CompareGreater,
CompareGreaterOrEqual,
@ -52,6 +52,7 @@ namespace ARMeilleure.IntermediateRepresentation
Store16,
Store8,
Subtract,
Tailcall,
VectorCreateScalar,
VectorExtract,
VectorExtract16,

View File

@ -44,6 +44,25 @@ namespace ARMeilleure.Memory
}
}
/// <summary>
/// Commits a range of memory by dispatching to the platform specific implementation.
/// </summary>
/// <param name="address">Start of the range to commit.</param>
/// <param name="size">Size of the range in bytes.</param>
/// <returns>The result reported by the platform implementation.</returns>
/// <exception cref="PlatformNotSupportedException">The OS is not Windows, Linux or OSX.</exception>
public static bool Commit(IntPtr address, ulong size)
{
    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return MemoryManagementWindows.Commit(address, new IntPtr((long)size));
    }

    bool isUnix = RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ||
                  RuntimeInformation.IsOSPlatform(OSPlatform.OSX);

    if (!isUnix)
    {
        throw new PlatformNotSupportedException();
    }

    return MemoryManagementUnix.Commit(address, size);
}
public static void Reprotect(IntPtr address, ulong size, MemoryProtection permission)
{
bool result;
@ -70,6 +89,25 @@ namespace ARMeilleure.Memory
}
}
/// <summary>
/// Reserves a range of address space by dispatching to the platform specific implementation.
/// </summary>
/// <param name="size">Size of the range in bytes.</param>
/// <returns>Pointer returned by the platform implementation.</returns>
/// <exception cref="PlatformNotSupportedException">The OS is not Windows, Linux or OSX.</exception>
public static IntPtr Reserve(ulong size)
{
    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return MemoryManagementWindows.Reserve(new IntPtr((long)size));
    }

    bool isUnix = RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ||
                  RuntimeInformation.IsOSPlatform(OSPlatform.OSX);

    if (!isUnix)
    {
        throw new PlatformNotSupportedException();
    }

    return MemoryManagementUnix.Reserve(size);
}
public static bool Free(IntPtr address)
{
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))

View File

@ -30,6 +30,11 @@ namespace ARMeilleure.Memory
return ptr;
}
/// <summary>
/// Makes a range readable and writable through mprotect.
/// </summary>
/// <param name="address">Start of the range.</param>
/// <param name="size">Size of the range in bytes.</param>
/// <returns>True when mprotect returned 0.</returns>
public static bool Commit(IntPtr address, ulong size)
{
    const MmapProts readWrite = MmapProts.PROT_READ | MmapProts.PROT_WRITE;

    int result = Syscall.mprotect(address, size, readWrite);

    return result == 0;
}
public static bool Reprotect(IntPtr address, ulong size, Memory.MemoryProtection protection)
{
MmapProts prot = GetProtection(protection);
@ -37,6 +42,24 @@ namespace ARMeilleure.Memory
return Syscall.mprotect(address, size, prot) == 0;
}
/// <summary>
/// Reserves a private anonymous mapping with no access permissions (PROT_NONE).
/// The mapping is one page larger than the requested size.
/// </summary>
/// <param name="size">Requested size in bytes.</param>
/// <returns>Base address of the mapping.</returns>
/// <exception cref="OutOfMemoryException">The mapping could not be created.</exception>
public static IntPtr Reserve(ulong size)
{
    ulong pageSize = (ulong)Syscall.sysconf(SysconfName._SC_PAGESIZE);

    const MmapProts prot = MmapProts.PROT_NONE;
    const MmapFlags flags = MmapFlags.MAP_PRIVATE | MmapFlags.MAP_ANONYMOUS;

    IntPtr ptr = Syscall.mmap(IntPtr.Zero, size + pageSize, prot, flags, -1, 0);

    // mmap reports failure with MAP_FAILED ((void*)-1), not with a null pointer.
    // The null check is kept so callers never receive a zero base address either.
    if (ptr == new IntPtr(-1) || ptr == IntPtr.Zero)
    {
        throw new OutOfMemoryException();
    }

    return ptr;
}
private static MmapProts GetProtection(Memory.MemoryProtection protection)
{
switch (protection)

View File

@ -89,6 +89,15 @@ namespace ARMeilleure.Memory
return ptr;
}
/// <summary>
/// Commits a range as read/write memory through VirtualAlloc.
/// </summary>
/// <param name="location">Start of the range.</param>
/// <param name="size">Size of the range in bytes.</param>
/// <returns>True when VirtualAlloc returned a non-null pointer.</returns>
public static bool Commit(IntPtr location, IntPtr size)
{
    IntPtr committed = VirtualAlloc(location, size, AllocationType.Commit, MemoryProtection.ReadWrite);

    bool succeeded = committed != IntPtr.Zero;

    return succeeded;
}
public static bool Reprotect(IntPtr address, IntPtr size, Memory.MemoryProtection protection)
{
MemoryProtection prot = GetProtection(protection);
@ -96,6 +105,20 @@ namespace ARMeilleure.Memory
return VirtualProtect(address, size, prot, out _);
}
/// <summary>
/// Reserves a range of address space through VirtualAlloc, letting the system pick the base address.
/// </summary>
/// <param name="size">Size of the range in bytes.</param>
/// <returns>Base address of the reservation.</returns>
/// <exception cref="OutOfMemoryException">VirtualAlloc returned a null pointer.</exception>
public static IntPtr Reserve(IntPtr size)
{
    IntPtr reserved = VirtualAlloc(IntPtr.Zero, size, AllocationType.Reserve, MemoryProtection.ReadWrite);

    if (reserved != IntPtr.Zero)
    {
        return reserved;
    }

    throw new OutOfMemoryException();
}
private static MemoryProtection GetProtection(Memory.MemoryProtection protection)
{
switch (protection)

View File

@ -53,7 +53,7 @@ namespace ARMeilleure.Memory
Operand expected = context.LoadArgument(OperandType.V128, 1);
Operand desired = context.LoadArgument(OperandType.V128, 2);
Operand result = context.CompareAndSwap128(address, expected, desired);
Operand result = context.CompareAndSwap(address, expected, desired);
context.Return(result);

View File

@ -0,0 +1,53 @@
using System;
using System.Collections.Generic;
using System.Text;
namespace ARMeilleure.Memory
{
/// <summary>
/// A block of reserved address space that is committed on demand, in multiples of a granularity.
/// </summary>
class ReservedRegion
{
    private const int DefaultGranularity = 65536; // Mapping granularity in Windows.

    /// <summary>Base address of the reserved range.</summary>
    public IntPtr Pointer { get; }

    // Private lock object, so callers holding a reference to the region can not interfere
    // with (or deadlock against) the expansion lock.
    private readonly object _expandLock = new object();

    private readonly ulong _maxSize;
    private readonly ulong _sizeGranularity;
    private ulong _currentSize;

    /// <summary>
    /// Reserves <paramref name="maxSize"/> bytes of address space. Nothing is committed yet.
    /// </summary>
    /// <param name="maxSize">Maximum size the region can grow to, in bytes.</param>
    /// <param name="granularity">Commit granularity in bytes; 0 selects the default (64 KiB).</param>
    public ReservedRegion(ulong maxSize, ulong granularity = 0)
    {
        if (granularity == 0)
        {
            granularity = DefaultGranularity;
        }

        Pointer = MemoryManagement.Reserve(maxSize);
        _maxSize = maxSize;
        _sizeGranularity = granularity;
        _currentSize = 0;
    }

    /// <summary>
    /// Ensures that at least the first <paramref name="desiredSize"/> bytes of the region are committed.
    /// </summary>
    /// <param name="desiredSize">Number of bytes, from the start of the region, that must be usable.</param>
    /// <exception cref="OutOfMemoryException">
    /// <paramref name="desiredSize"/> exceeds the reserved size, or the commit failed.
    /// </exception>
    public void ExpandIfNeeded(ulong desiredSize)
    {
        if (desiredSize > _maxSize)
        {
            throw new OutOfMemoryException();
        }

        if (desiredSize > _currentSize)
        {
            // Lock, and then check again. We only want to commit once.
            lock (_expandLock)
            {
                // Strictly greater: when another thread already grew the region to exactly
                // desiredSize there is nothing left to commit (a zero sized commit is not valid).
                if (desiredSize > _currentSize)
                {
                    ulong overflowBytes = desiredSize - _currentSize;
                    ulong moreToCommit = (((_sizeGranularity - 1) + overflowBytes) / _sizeGranularity) * _sizeGranularity; // Round up.

                    // Never commit past the end of the reservation. The remainder is always
                    // enough for overflowBytes because desiredSize <= _maxSize.
                    moreToCommit = Math.Min(moreToCommit, _maxSize - _currentSize);

                    IntPtr commitStart = new IntPtr(Pointer.ToInt64() + (long)_currentSize);

                    if (!MemoryManagement.Commit(commitStart, moreToCommit))
                    {
                        // Continuing would hand out memory that faults on first access.
                        throw new OutOfMemoryException();
                    }

                    _currentSize += moreToCommit;
                }
            }
        }
    }
}
}

View File

@ -5,7 +5,7 @@ namespace ARMeilleure.State
{
public class ExecutionContext
{
private const int MinCountForCheck = 40000;
private const int MinCountForCheck = 4000;
private NativeContext _nativeContext;
@ -57,7 +57,7 @@ namespace ARMeilleure.State
}
}
public bool Running { get; set; }
internal bool Running { get; private set; }
public event EventHandler<EventArgs> Interrupt;
public event EventHandler<InstExceptionEventArgs> Break;
@ -126,6 +126,12 @@ namespace ARMeilleure.State
Undefined?.Invoke(this, new InstUndefinedEventArgs(address, opCode));
}
// Marks this context as no longer running and resets the native context counter to 0.
// NOTE(review): presumably the zeroed counter makes translated code reach its next
// synchronization check promptly, where the cleared Running flag is observed — confirm.
public void StopRunning()
{
Running = false;
_nativeContext.SetCounter(0);
}
public void Dispose()
{
_nativeContext.Dispose();

View File

@ -10,7 +10,7 @@ namespace ARMeilleure.State
private const int IntSize = 8;
private const int VecSize = 16;
private const int FlagSize = 4;
private const int ExtraSize = 4;
private const int ExtraSize = 8;
private const int TotalSize = RegisterConsts.IntRegsCount * IntSize +
RegisterConsts.VecRegsCount * VecSize +
@ -183,6 +183,14 @@ namespace ARMeilleure.State
RegisterConsts.FpFlagsCount * FlagSize;
}
/// <summary>
/// Gets the byte offset of the call address slot inside the native context.
/// </summary>
/// <returns>Offset in bytes from the start of the native context.</returns>
public static int GetCallAddressOffset()
{
    int intRegsBytes = RegisterConsts.IntRegsCount * IntSize;
    int vecRegsBytes = RegisterConsts.VecRegsCount * VecSize;
    int flagsBytes = RegisterConsts.FlagsCount * FlagSize;
    int fpFlagsBytes = RegisterConsts.FpFlagsCount * FlagSize;

    // NOTE(review): the trailing 4 presumably skips a 4 byte field stored right after the
    // flags — confirm against the native context layout.
    return intRegsBytes + vecRegsBytes + flagsBytes + fpFlagsBytes + 4;
}
public void Dispose()
{
MemoryManagement.Free(BasePtr);

View File

@ -41,9 +41,18 @@ namespace ARMeilleure.Translation
public Aarch32Mode Mode { get; }
public ArmEmitterContext(MemoryManager memory, Aarch32Mode mode)
public JumpTable JumpTable { get; }
public long BaseAddress { get; }
public bool HighCq { get; }
public ArmEmitterContext(MemoryManager memory, JumpTable jumpTable, long baseAddress, bool highCq, Aarch32Mode mode)
{
Memory = memory;
JumpTable = jumpTable;
BaseAddress = baseAddress;
HighCq = highCq;
Mode = mode;
_labels = new Dictionary<ulong, Operand>();

View File

@ -0,0 +1,131 @@
using ARMeilleure.Instructions;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.State;
using System;
using System.Runtime.InteropServices;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
namespace ARMeilleure.Translation
{
/// <summary>
/// Holds the compiled stubs that jump table entries point to while the host address
/// of their target function is not known yet.
/// </summary>
static class DirectCallStubs
{
    private delegate long GuestFunction(IntPtr nativeContextPtr);

    private static GuestFunction _directCallStub;
    private static GuestFunction _directTailCallStub;
    private static GuestFunction _indirectCallStub;
    private static GuestFunction _indirectTailCallStub;

    private static object _lock;
    private static bool _initialized;

    static DirectCallStubs()
    {
        _lock = new object();
    }

    /// <summary>
    /// Generates the four stubs on first use. Later calls return immediately.
    /// </summary>
    public static void InitializeStubs()
    {
        if (!_initialized)
        {
            lock (_lock)
            {
                // Another thread may have finished the work while we waited for the lock.
                if (!_initialized)
                {
                    _directCallStub = GenerateDirectCallStub(false);
                    _directTailCallStub = GenerateDirectCallStub(true);
                    _indirectCallStub = GenerateIndirectCallStub(false);
                    _indirectTailCallStub = GenerateIndirectCallStub(true);

                    _initialized = true;
                }
            }
        }
    }

    /// <summary>
    /// Gets the host pointer of the direct call stub.
    /// </summary>
    /// <param name="tailCall">True to get the tail call variant.</param>
    public static IntPtr DirectCallStub(bool tailCall)
    {
        GuestFunction stub = tailCall ? _directTailCallStub : _directCallStub;

        return Marshal.GetFunctionPointerForDelegate(stub);
    }

    /// <summary>
    /// Gets the host pointer of the indirect call stub.
    /// </summary>
    /// <param name="tailCall">True to get the tail call variant.</param>
    public static IntPtr IndirectCallStub(bool tailCall)
    {
        GuestFunction stub = tailCall ? _indirectTailCallStub : _indirectCallStub;

        return Marshal.GetFunctionPointerForDelegate(stub);
    }

    /// <summary>
    /// Emits the transfer to <paramref name="address"/>, forwarding the stub's own first argument.
    /// A regular call also returns the callee's result; a tail call does not come back.
    /// </summary>
    private static void EmitCall(EmitterContext context, Operand address, bool tailCall)
    {
        Operand nativeContext = context.LoadArgument(OperandType.I64, 0);

        if (tailCall)
        {
            context.Tailcall(address, nativeContext);

            return;
        }

        Operand result = context.Call(address, OperandType.I64, nativeContext);

        context.Return(result);
    }

    /// <summary>
    /// Compiles the code emitted on <paramref name="context"/> as a highCq function
    /// taking a single 64-bit argument and returning a 64-bit value.
    /// </summary>
    private static GuestFunction CompileStub(EmitterContext context)
    {
        ControlFlowGraph cfg = context.GetControlFlowGraph();

        OperandType[] argTypes = { OperandType.I64 };

        return Compiler.Compile<GuestFunction>(cfg, argTypes, OperandType.I64, CompilerOptions.HighCq);
    }

    /// <summary>
    /// Generates a stub that is used to find function addresses. Used for direct calls when their jump table does not have the host address yet.
    /// Takes a NativeContext like a translated guest function, and extracts the target address from the NativeContext.
    /// When the target function is compiled in highCq, all table entries are updated to point to that function instead of this stub by the translator.
    /// </summary>
    private static GuestFunction GenerateDirectCallStub(bool tailCall)
    {
        EmitterContext context = new EmitterContext();

        Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0);
        Operand callAddressPtr = context.Add(nativeContextPtr, Const((long)NativeContext.GetCallAddressOffset()));

        Operand guestAddress = context.Load(OperandType.I64, callAddressPtr);

        guestAddress = context.BitwiseOr(guestAddress, Const(guestAddress.Type, 1)); // Set call flag.

        Operand hostFunction = context.Call(new _U64_U64(NativeInterface.GetFunctionAddress), guestAddress);

        EmitCall(context, hostFunction, tailCall);

        return CompileStub(context);
    }

    /// <summary>
    /// Generates a stub that is used to find function addresses and add them to an indirect table.
    /// Used for indirect calls entries (already claimed) when their jump table does not have the host address yet.
    /// Takes a NativeContext like a translated guest function, and extracts the target indirect table entry from the NativeContext.
    /// If the function we find is highCq, the entry in the table is updated to point to that function rather than this stub.
    /// </summary>
    private static GuestFunction GenerateIndirectCallStub(bool tailCall)
    {
        EmitterContext context = new EmitterContext();

        Operand nativeContextPtr = context.LoadArgument(OperandType.I64, 0);
        Operand callAddressPtr = context.Add(nativeContextPtr, Const((long)NativeContext.GetCallAddressOffset()));

        // The native context holds the address of the table entry; the entry starts with the guest address.
        Operand tableEntry = context.Load(OperandType.I64, callAddressPtr);
        Operand guestAddress = context.Load(OperandType.I64, tableEntry);

        // We need to find the missing function. If the function is HighCq, then it replaces this stub in the indirect table.
        // Either way, we call it afterwards.
        Operand hostFunction = context.Call(new _U64_U64_U64(NativeInterface.GetIndirectFunctionAddress), guestAddress, tableEntry);

        // Call and save the function.
        EmitCall(context, hostFunction, tailCall);

        return CompileStub(context);
    }
}
}

View File

@ -143,9 +143,22 @@ namespace ARMeilleure.Translation
}
}
public Operand CompareAndSwap128(Operand address, Operand expected, Operand desired)
// Emits a Tailcall instruction whose sources are the target address followed by callArgs,
// and requests a new basic block for whatever is emitted next.
// NOTE(review): the "return Add(Instruction.CompareAndSwap128, ...)" statement at the top of the
// body returns a value from a void method and looks like a superseded line from the former
// CompareAndSwap128 helper — confirm it should not be present.
public void Tailcall(Operand address, params Operand[] callArgs)
{
return Add(Instruction.CompareAndSwap128, Local(OperandType.V128), address, expected, desired);
// Source 0 is the call target, the remaining sources are the call arguments.
Operand[] args = new Operand[callArgs.Length + 1];
args[0] = address;
Array.Copy(callArgs, 0, args, 1, callArgs.Length);
// The instruction has no destination operand.
Add(Instruction.Tailcall, null, args);
_needsNewBlock = true;
}
/// <summary>
/// Emits a CompareAndSwap instruction on the memory at <paramref name="address"/>.
/// </summary>
/// <param name="address">Address of the value to exchange.</param>
/// <param name="expected">Value expected to be found at the address.</param>
/// <param name="desired">Value to store; its type is also the type of the result.</param>
/// <returns>The operand that receives the instruction's result.</returns>
public Operand CompareAndSwap(Operand address, Operand expected, Operand desired)
{
    Operand result = Local(desired.Type);

    return Add(Instruction.CompareAndSwap, result, address, expected, desired);
}
public Operand ConditionalSelect(Operand op1, Operand op2, Operand op3)

View File

@ -13,9 +13,11 @@ namespace ARMeilleure.Translation
private const int CodeAlignment = 4; // Bytes
private const int CacheSize = 512 * 1024 * 1024;
private const int CacheSize = 2047 * 1024 * 1024;
private static IntPtr _basePointer;
private static ReservedRegion _jitRegion;
private static IntPtr _basePointer => _jitRegion.Pointer;
private static int _offset;
@ -25,10 +27,11 @@ namespace ARMeilleure.Translation
static JitCache()
{
_basePointer = MemoryManagement.Allocate(CacheSize);
_jitRegion = new ReservedRegion(CacheSize);
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
_jitRegion.ExpandIfNeeded(PageSize);
JitUnwindWindows.InstallFunctionTableHandler(_basePointer, CacheSize);
// The first page is used for the table based SEH structs.
@ -97,6 +100,8 @@ namespace ARMeilleure.Translation
_offset += codeSize;
_jitRegion.ExpandIfNeeded((ulong)_offset);
if ((ulong)(uint)_offset > CacheSize)
{
throw new OutOfMemoryException();

View File

@ -0,0 +1,149 @@
using ARMeilleure.Memory;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Threading;
namespace ARMeilleure.Translation
{
/// <summary>
/// Tables of (guestAddress, hostAddress) pairs used by translated code to call and jump
/// to other translated functions without going back through the translator.
/// </summary>
class JumpTable
{
    public static JumpTable Instance { get; }

    static JumpTable()
    {
        Instance = new JumpTable();
    }

    // The jump table is a block of (guestAddress, hostAddress) function mappings.
    // Each entry corresponds to one branch in a JIT compiled function. The entries are
    // reserved specifically for each call.
    // The _dependants dictionary can be used to update the hostAddress for any functions that change.

    public const int JumpTableStride = 16; // 8 byte guest address, 8 byte host address

    private const int JumpTableSize = 1048576;

    private const int JumpTableByteSize = JumpTableSize * JumpTableStride;

    // The dynamic table is also a block of (guestAddress, hostAddress) function mappings.
    // The main difference is that indirect calls and jumps reserve _multiple_ entries on the table.
    // The guest addresses start out as all 0, and every host address is initialized to the
    // indirect call stub when the entries are reserved (see ReserveDynamicEntry).
    // When an indirect call is made, it tries to find the guest address on the table.
    // If we get to an empty address, the guestAddress is set to the call that we want.
    // If we get to a guestAddress that matches our own (or we just claimed it), the hostAddress is read
    // and we branch to or call it. While it still points to the stub, the stub asks NativeInterface for
    // the function, and moves the highCq address into the entry once one is found.
    // If the table size is exhausted and we didn't find our desired address, we fall back to requesting
    // the function from the JIT.

    private const int DynamicTableSize = 1048576;

    public const int DynamicTableElems = 1;

    public const int DynamicTableStride = DynamicTableElems * JumpTableStride;

    private const int DynamicTableByteSize = DynamicTableSize * JumpTableStride * DynamicTableElems;

    private int _tableEnd = 0;
    private int _dynTableEnd = 0;

    private readonly ConcurrentDictionary<ulong, TranslatedFunction> _targets;
    private readonly ConcurrentDictionary<ulong, LinkedList<int>> _dependants; // TODO: Attach to TranslatedFunction or a wrapper class.

    private readonly ReservedRegion _jumpRegion;
    private readonly ReservedRegion _dynamicRegion;

    public IntPtr JumpPointer => _jumpRegion.Pointer;
    public IntPtr DynamicPointer => _dynamicRegion.Pointer;

    public JumpTable()
    {
        _jumpRegion = new ReservedRegion(JumpTableByteSize);
        _dynamicRegion = new ReservedRegion(DynamicTableByteSize);

        _targets = new ConcurrentDictionary<ulong, TranslatedFunction>();
        _dependants = new ConcurrentDictionary<ulong, LinkedList<int>>();
    }

    /// <summary>
    /// Records <paramref name="func"/> as the function for a guest address and points every
    /// direct jump table entry that targets this address to it.
    /// </summary>
    /// <param name="address">Guest address of the function; the low two bits are ignored.</param>
    /// <param name="func">The translated function.</param>
    public void RegisterFunction(ulong address, TranslatedFunction func)
    {
        address &= ~3UL;

        _targets.AddOrUpdate(address, func, (key, oldFunc) => func);

        long funcPtr = func.GetPointer().ToInt64();

        // Update all jump table entries that target this address.
        LinkedList<int> myDependants;

        if (_dependants.TryGetValue(address, out myDependants))
        {
            lock (myDependants)
            {
                foreach (var entry in myDependants)
                {
                    IntPtr addr = _jumpRegion.Pointer + entry * JumpTableStride;

                    Marshal.WriteInt64(addr, 8, funcPtr);
                }
            }
        }
    }

    /// <summary>
    /// Reserves a group of <see cref="DynamicTableElems"/> entries on the dynamic table for one
    /// indirect call site, with every host address pointing to the indirect call stub.
    /// </summary>
    /// <param name="isJump">True to use the tail call variant of the stub.</param>
    /// <returns>Index of the reserved group.</returns>
    /// <exception cref="OutOfMemoryException">The dynamic table is full.</exception>
    public int ReserveDynamicEntry(bool isJump)
    {
        int entry = Interlocked.Increment(ref _dynTableEnd);

        if (entry >= DynamicTableSize)
        {
            throw new OutOfMemoryException("JIT Dynamic Jump Table exhausted.");
        }

        _dynamicRegion.ExpandIfNeeded((ulong)((entry + 1) * DynamicTableStride));

        // Initialize all host function pointers to the indirect call stub.
        IntPtr addr = _dynamicRegion.Pointer + entry * DynamicTableStride;
        long stubPtr = (long)DirectCallStubs.IndirectCallStub(isJump);

        for (int i = 0; i < DynamicTableElems; i++)
        {
            Marshal.WriteInt64(addr, i * JumpTableStride + 8, stubPtr);
        }

        return entry;
    }

    /// <summary>
    /// Reserves an entry on the direct jump table for one call site that targets <paramref name="address"/>.
    /// </summary>
    /// <param name="ownerAddress">Guest address of the function that owns the call site (currently unused).</param>
    /// <param name="address">Guest address of the target function.</param>
    /// <param name="isJump">True to use the tail call variant of the stub.</param>
    /// <returns>Index of the reserved entry.</returns>
    /// <exception cref="OutOfMemoryException">The jump table is full.</exception>
    public int ReserveTableEntry(long ownerAddress, long address, bool isJump)
    {
        int entry = Interlocked.Increment(ref _tableEnd);

        if (entry >= JumpTableSize)
        {
            throw new OutOfMemoryException("JIT Direct Jump Table exhausted.");
        }

        _jumpRegion.ExpandIfNeeded((ulong)((entry + 1) * JumpTableStride));

        IntPtr entryPtr = _jumpRegion.Pointer + entry * JumpTableStride;

        // Make sure changes to the function at the target address update this jump table entry.
        LinkedList<int> targetDependants = _dependants.GetOrAdd((ulong)address, (key) => new LinkedList<int>());

        // The lookup, the table write and the registration as a dependant all happen under the
        // same lock RegisterFunction takes to patch entries. Otherwise a function registered
        // between the lookup and the write would be missed (or overwritten with the stub),
        // leaving this entry on the slow stub path.
        lock (targetDependants)
        {
            // Is the address we have already registered? If so, put the function address in the jump table.
            // If not, it will point to the direct call stub.
            long value = (long)DirectCallStubs.DirectCallStub(isJump);

            TranslatedFunction func;

            if (_targets.TryGetValue((ulong)address, out func))
            {
                value = func.GetPointer().ToInt64();
            }

            Marshal.WriteInt64(entryPtr, 0, address);
            Marshal.WriteInt64(entryPtr, 8, value);

            targetDependants.AddLast(entry);
        }

        return entry;
    }
}
}

View File

@ -1,3 +1,5 @@
using System;
using System.Runtime.InteropServices;
using System.Threading;
namespace ARMeilleure.Translation
@ -11,6 +13,8 @@ namespace ARMeilleure.Translation
private bool _rejit;
private int _callCount;
public bool HighCq => !_rejit;
public TranslatedFunction(GuestFunction func, bool rejit)
{
_func = func;
@ -26,5 +30,10 @@ namespace ARMeilleure.Translation
{
return _rejit && Interlocked.Increment(ref _callCount) == MinCallsForRejit;
}
/// <summary>
/// Gets a native function pointer for the delegate wrapped by this function.
/// </summary>
public IntPtr GetPointer() => Marshal.GetFunctionPointerForDelegate(_func);
}
}

View File

@ -16,10 +16,14 @@ namespace ARMeilleure.Translation
{
private const ulong CallFlag = InstEmitFlowHelper.CallFlag;
private const bool AlwaysTranslateFunctions = true; // If false, only translates a single block for lowCq.
private MemoryManager _memory;
private ConcurrentDictionary<ulong, TranslatedFunction> _funcs;
private JumpTable _jumpTable;
private PriorityQueue<RejitRequest> _backgroundQueue;
private AutoResetEvent _backgroundTranslatorEvent;
@ -32,9 +36,13 @@ namespace ARMeilleure.Translation
_funcs = new ConcurrentDictionary<ulong, TranslatedFunction>();
_jumpTable = JumpTable.Instance;
_backgroundQueue = new PriorityQueue<RejitRequest>(2);
_backgroundTranslatorEvent = new AutoResetEvent(false);
DirectCallStubs.InitializeStubs();
}
private void TranslateQueuedSubs()
@ -46,30 +54,42 @@ namespace ARMeilleure.Translation
TranslatedFunction func = Translate(request.Address, request.Mode, highCq: true);
_funcs.AddOrUpdate(request.Address, func, (key, oldFunc) => func);
_jumpTable.RegisterFunction(request.Address, func);
}
else
{
_backgroundTranslatorEvent.WaitOne();
}
}
_backgroundTranslatorEvent.Set(); // Wake up any other background translator threads, to encourage them to exit.
}
public void Execute(State.ExecutionContext context, ulong address)
{
if (Interlocked.Increment(ref _threadCount) == 1)
{
// Simple heuristic, should be user configurable in future. (1 for 4 core/ht or less, 2 for 6 core+ht etc).
// All threads are normal priority except for the last one, which runs at a low priority and just fills as much of the last core as the OS lets it.
// If we only have one rejit thread, it should be normal priority as highCq code is performance critical.
// TODO: Use physical cores rather than logical. This only really makes sense for processors with hyperthreading. Requires OS specific code.
int unboundedThreadCount = Math.Max(1, (Environment.ProcessorCount - 6) / 3);
int threadCount = Math.Min(3, unboundedThreadCount);
for (int i = 0; i < threadCount; i++)
{
bool last = i != 0 && i == unboundedThreadCount - 1;
Thread backgroundTranslatorThread = new Thread(TranslateQueuedSubs)
{
Name = "CPU.BackgroundTranslatorThread",
Priority = ThreadPriority.Lowest
Name = "CPU.BackgroundTranslatorThread." + i,
Priority = last ? ThreadPriority.Lowest : ThreadPriority.Normal
};
backgroundTranslatorThread.Start();
}
}
Statistics.InitializeTimer();
NativeInterface.RegisterThread(context, _memory);
NativeInterface.RegisterThread(context, _memory, this);
do
{
@ -98,7 +118,7 @@ namespace ARMeilleure.Translation
return nextAddr;
}
private TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
internal TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
{
// TODO: Investigate how we should handle code at unaligned addresses.
// Currently, those low bits are used to store special flags.
@ -124,12 +144,12 @@ namespace ARMeilleure.Translation
private TranslatedFunction Translate(ulong address, ExecutionMode mode, bool highCq)
{
ArmEmitterContext context = new ArmEmitterContext(_memory, Aarch32Mode.User);
ArmEmitterContext context = new ArmEmitterContext(_memory, _jumpTable, (long)address, highCq, Aarch32Mode.User);
Logger.StartPass(PassName.Decoding);
Block[] blocks = highCq
? Decoder.DecodeFunction (_memory, address, mode)
Block[] blocks = AlwaysTranslateFunctions
? Decoder.DecodeFunction (_memory, address, mode, highCq)
: Decoder.DecodeBasicBlock(_memory, address, mode);
Logger.EndPass(PassName.Decoding);
@ -216,7 +236,7 @@ namespace ARMeilleure.Translation
// with some kind of branch).
if (isLastOp && block.Next == null)
{
context.Return(Const(opCode.Address + (ulong)opCode.OpCodeSizeInBytes));
InstEmitFlowHelper.EmitTailContinue(context, Const(opCode.Address + (ulong)opCode.OpCodeSizeInBytes));
}
}
}
@ -238,7 +258,11 @@ namespace ARMeilleure.Translation
context.BranchIfTrue(lblNonZero, count);
context.Call(new _Void(NativeInterface.CheckSynchronization));
Operand running = context.Call(new _Bool(NativeInterface.CheckSynchronization));
context.BranchIfTrue(lblExit, running);
context.Return(Const(0L));
context.Branch(lblExit);

View File

@ -137,7 +137,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
// Stops the guest execution context of the given thread and exits its host thread
// through the core manager.
// NOTE(review): the direct "Running = false" assignment looks like the superseded form of the
// StopRunning() call that follows it — confirm only one of them should remain.
public void ExitThread(KThread thread)
{
thread.Context.Running = false;
thread.Context.StopRunning();
CoreManager.Exit(thread.HostThread);
}

View File

@ -1141,9 +1141,9 @@ namespace Ryujinx.HLE.HOS.Kernel.Threading
{
Owner.Translator.Execute(Context, entrypoint);
Context.Dispose();
ThreadExit();
Context.Dispose();
}
private void ThreadExit()