Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jit preparation for arm64 apple: Use bytes in fgArgTabEntry. #42503

Merged
merged 10 commits into from
Oct 1, 2020
124 changes: 106 additions & 18 deletions src/coreclr/src/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -1459,11 +1459,21 @@ struct fgArgTabEntry
unsigned structFloatRegs;
#endif // UNIX_AMD64_ABI

#if defined(DEBUG_NOT_OSX_ARM64_ABI)
// A slot is a pointer sized region in the OutArg area.
unsigned slotNum; // When an argument is passed in the OutArg area this is the slot number in the OutArg area
unsigned numSlots; // Count of number of slots that this argument uses
#endif // DEBUG_NOT_OSX_ARM64_ABI

unsigned alignment; // 1 or 2 (slots/registers)
// Return number of stack slots that this argument is taking.
// TODO-Cleanup: this funtion does not align with arm64 apple model,
// delete it.
unsigned GetStackSlotsNumber() const
{
return roundUp(GetStackByteSize(), TARGET_POINTER_SIZE) / TARGET_POINTER_SIZE;
}

unsigned byteAlignment; // usually 8 or 16 bytes (slots/registers).
private:
unsigned _lateArgInx; // index into gtCallLateArgs list; UINT_MAX if this is not a late arg.
public:
Expand Down Expand Up @@ -1614,9 +1624,24 @@ struct fgArgTabEntry
}

// Get the number of bytes that this argument is occyping on the stack.
unsigned GetStackSize() const
unsigned GetStackByteSize() const
{
return (TARGET_POINTER_SIZE * this->numSlots);
if (IsHfaRegArg())
{
return 0;
}
assert(!IsHfaArg() || !IsSplit());

if (!IsSplit() && numRegs > 0)
{
return 0;
}
assert(GetByteSize() > TARGET_POINTER_SIZE * numRegs);
unsigned stackByteSize = GetByteSize() - TARGET_POINTER_SIZE * numRegs;
#if !defined(OSX_ARM64_ABI)

#endif
return GetByteSize() - TARGET_POINTER_SIZE * numRegs;
}

var_types GetHfaType() const
Expand All @@ -1634,7 +1659,7 @@ struct fgArgTabEntry
if (type != TYP_UNDEF)
{
// We must already have set the passing mode.
assert(numRegs != 0 || numSlots != 0);
assert(numRegs != 0 || GetStackSlotsNumber() != 0);
// We originally set numRegs according to the size of the struct, but if the size of the
// hfaType is not the same as the pointer size, we need to correct it.
// Note that hfaSlots is the number of registers we will use. For ARM, that is twice
Expand Down Expand Up @@ -1709,11 +1734,13 @@ struct fgArgTabEntry
#endif
}

bool isSingleRegOrSlot() const
// Can we replace the struct type of this node with a primitive type for argument passing?
bool TryPassAsPrimitive() const
{
return !IsSplit() && ((numRegs == 1) || (numSlots == 1));
return !IsSplit() && ((numRegs == 1) || (m_byteSize <= TARGET_POINTER_SIZE));
}

#if defined(DEBUG_NOT_OSX_ARM64_ABI)
// Returns the number of "slots" used, where for this purpose a
// register counts as a slot.
unsigned getSlotCount() const
Expand All @@ -1734,7 +1761,9 @@ struct fgArgTabEntry
}
return numSlots + numRegs;
}
#endif

#if defined(DEBUG_NOT_OSX_ARM64_ABI)
// Returns the size as a multiple of pointer-size.
// For targets without HFAs, this is the same as getSlotCount().
unsigned getSize() const
Expand Down Expand Up @@ -1770,6 +1799,43 @@ struct fgArgTabEntry
return size;
}

#endif // DEBUG && !OSX_ARM64_ABI

private:
unsigned m_byteOffset;
unsigned m_byteSize;

public:
void SetByteOffset(unsigned byteOffset)
{
DEBUG_NOT_OSX_ARM64_ASSERT(byteOffset / TARGET_POINTER_SIZE == slotNum);
m_byteOffset = byteOffset;
}

unsigned GetByteOffset() const
{
DEBUG_NOT_OSX_ARM64_ASSERT(m_byteOffset / TARGET_POINTER_SIZE == slotNum);
return m_byteOffset;
}

void SetByteSize(unsigned byteSize)
{
#if defined(DEBUG_NOT_OSX_ARM64_ABI)
assert(byteAlignment != 0);
if (!isStruct)
{
const unsigned alignedByteSize = roundUp(byteSize, byteAlignment);
assert(alignedByteSize == getSlotCount() * TARGET_POINTER_SIZE);
}
#endif
m_byteSize = byteSize;
}

unsigned GetByteSize() const
{
return m_byteSize;
}

// Set the register numbers for a multireg argument.
// There's nothing to do on x64/Ux because the structDesc has already been used to set the
// register numbers.
Expand Down Expand Up @@ -1799,6 +1865,7 @@ struct fgArgTabEntry
#endif // FEATURE_MULTIREG_ARGS && !defined(UNIX_AMD64_ABI)
}

#ifdef DEBUG
// Check that the value of 'isStruct' is consistent.
// A struct arg must be one of the following:
// - A node of struct type,
Expand All @@ -1816,7 +1883,8 @@ struct fgArgTabEntry
// This is the case where we are passing a struct as a primitive type.
// On most targets, this is always a single register or slot.
// However, on ARM this could be two slots if it is TYP_DOUBLE.
bool isPassedAsPrimitiveType = ((numRegs == 1) || ((numRegs == 0) && (numSlots == 1)));
bool isPassedAsPrimitiveType =
((numRegs == 1) || ((numRegs == 0) && (GetByteSize() <= TARGET_POINTER_SIZE)));
#ifdef TARGET_ARM
if (!isPassedAsPrimitiveType)
{
Expand All @@ -1835,7 +1903,6 @@ struct fgArgTabEntry
}
}

#ifdef DEBUG
void Dump() const;
#endif
};
Expand All @@ -1848,11 +1915,14 @@ struct fgArgTabEntry

class fgArgInfo
{
Compiler* compiler; // Back pointer to the compiler instance so that we can allocate memory
GenTreeCall* callTree; // Back pointer to the GT_CALL node for this fgArgInfo
unsigned argCount; // Updatable arg count value
unsigned nextSlotNum; // Updatable slot count value
unsigned stkLevel; // Stack depth when we make this call (for x86)
Compiler* compiler; // Back pointer to the compiler instance so that we can allocate memory
GenTreeCall* callTree; // Back pointer to the GT_CALL node for this fgArgInfo
unsigned argCount; // Updatable arg count value
#if defined(DEBUG_NOT_OSX_ARM64_ABI)
unsigned nextSlotNum; // Updatable slot count value
#endif
unsigned nextStackByteOffset;
unsigned stkLevel; // Stack depth when we make this call (for x86)

#if defined(UNIX_X86_ABI)
bool alignmentDone; // Updateable flag, set to 'true' after we've done any required alignment.
Expand Down Expand Up @@ -1887,7 +1957,8 @@ class fgArgInfo
GenTreeCall::Use* use,
regNumber regNum,
unsigned numRegs,
unsigned alignment,
unsigned byteSize,
unsigned byteAlignment,
bool isStruct,
bool isVararg = false);

Expand All @@ -1897,7 +1968,8 @@ class fgArgInfo
GenTreeCall::Use* use,
regNumber regNum,
unsigned numRegs,
unsigned alignment,
unsigned byteSize,
unsigned byteAlignment,
const bool isStruct,
const bool isVararg,
const regNumber otherRegNum,
Expand All @@ -1910,7 +1982,8 @@ class fgArgInfo
GenTree* node,
GenTreeCall::Use* use,
unsigned numSlots,
unsigned alignment,
unsigned byteSize,
unsigned byteAlignment,
bool isStruct,
bool isVararg = false);

Expand All @@ -1936,10 +2009,19 @@ class fgArgInfo
{
return argTable;
}

#if defined(DEBUG_NOT_OSX_ARM64_ABI)
unsigned GetNextSlotNum() const
{
return nextSlotNum;
}
#endif

unsigned GetNextSlotByteOffset() const
{
return nextStackByteOffset;
}

bool HasRegArgs() const
{
return hasRegArgs;
Expand Down Expand Up @@ -3212,6 +3294,12 @@ class Compiler
unsigned lvaOutgoingArgSpaceVar; // dummy TYP_LCLBLK var for fixed outgoing argument space
PhasedVar<unsigned> lvaOutgoingArgSpaceSize; // size of fixed outgoing argument space
#endif // FEATURE_FIXED_OUT_ARGS

static unsigned GetOutgoingArgByteSize(unsigned sizeWithoutPadding)
{
return roundUp(sizeWithoutPadding, TARGET_POINTER_SIZE);
}

// Variable representing the return address. The helper-based tailcall
// mechanism passes the address of the return address to a runtime helper
// where it is used to detect tail-call chains.
Expand Down Expand Up @@ -5727,8 +5815,8 @@ class Compiler
SpecialCodeKind acdKind; // what kind of a special block is this?
#if !FEATURE_FIXED_OUT_ARGS
bool acdStkLvlInit; // has acdStkLvl value been already set?
unsigned acdStkLvl;
#endif // !FEATURE_FIXED_OUT_ARGS
unsigned acdStkLvl; // stack level in stack slots.
#endif // !FEATURE_FIXED_OUT_ARGS
};

private:
Expand Down
14 changes: 14 additions & 0 deletions src/coreclr/src/jit/jit.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,20 @@
#define UNIX_AMD64_ABI_ONLY(x)
#endif // defined(UNIX_AMD64_ABI)

#if defined(DEBUG) && !defined(OSX_ARM64_ABI)
#define DEBUG_NOT_OSX_ARM64_ABI
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO these would be better left as, e.g. defined(DEBUG) && !defined(OSX_ARM64_ABI) at each use. I don't find the implied conjunction clear enough, though I see that it might be difficult to deal with the #defines below. Perhaps you could define something like TRACK_ARG_SLOTS or DEBUG_ARG_SLOTS and have a comment here about why it is not used for the OSX_ARM64_ABIT case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say it is a temporary solution, all code under DEBUG_NOT_OSX_ARM64_ABI will go away once the work is done and we check that the new byte mechanism is correct and matches the old results. However, I have found it useful to use a new define because now I can easily change it:

  1. define it in release for non apple arm64;
  2. replace assert-s with noway_assert-s and make debugging easier;
    etc.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree that it's not a huge deal since it's temporary, but I appreciate you making the changes~

#endif

#if defined(DEBUG_NOT_OSX_ARM64_ABI)
#define DEBUG_NOT_OSX_ARM64_ARG(x) , x
#define DEBUG_NOT_OSX_ARM64(x) x
#define DEBUG_NOT_OSX_ARM64_ASSERT(x) assert(x)
#else
#define DEBUG_NOT_OSX_ARM64_ARG(x)
#define DEBUG_NOT_OSX_ARM64(x)
#define DEBUG_NOT_OSX_ARM64_ASSERT(x)
#endif

#if defined(UNIX_AMD64_ABI) || !defined(TARGET_64BIT) || defined(TARGET_ARM64)
#define FEATURE_PUT_STRUCT_ARG_STK 1
#define PUT_STRUCT_ARG_STK_ONLY_ARG(x) , x
Expand Down
32 changes: 26 additions & 6 deletions src/coreclr/src/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1041,8 +1041,15 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf
#endif // TARGET_ARM
}

unsigned slotNumber = info->GetByteOffset() / TARGET_POINTER_SIZE;
#if defined(FEATURE_PUT_STRUCT_ARG_STK)
unsigned numberOfStackSlots = info->GetStackSlotsNumber();
DEBUG_NOT_OSX_ARM64_ASSERT(numberOfStackSlots == info->numSlots);
#endif
DEBUG_NOT_OSX_ARM64_ASSERT(slotNumber == info->slotNum);

putArg = new (comp, GT_PUTARG_SPLIT)
GenTreePutArgSplit(arg, info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots), info->numRegs,
GenTreePutArgSplit(arg, slotNumber PUT_STRUCT_ARG_STK_ONLY_ARG(numberOfStackSlots), info->numRegs,
call->IsFastTailCall(), call);

// If struct argument is morphed to GT_FIELD_LIST node(s),
Expand Down Expand Up @@ -1126,12 +1133,16 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf
// Mark this one as tail call arg if it is a fast tail call.
// This provides the info to put this argument in in-coming arg area slot
// instead of in out-going arg area slot.
CLANG_FORMAT_COMMENT_ANCHOR;

#ifdef DEBUG
// Make sure state is correct. The PUTARG_STK has TYP_VOID, as it doesn't produce
// a result. So the type of its operand must be the correct type to push on the stack.
// For a FIELD_LIST, this will be the type of the field (not the type of the arg),
// but otherwise it is generally the type of the operand.
info->checkIsStruct();
#endif

if ((arg->OperGet() != GT_FIELD_LIST))
{
#if defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK)
Expand All @@ -1145,10 +1156,16 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf
assert(genActualType(arg->TypeGet()) == type);
}
}
unsigned slotNumber = info->GetByteOffset() / TARGET_POINTER_SIZE;
#if defined(FEATURE_PUT_STRUCT_ARG_STK)
unsigned numberOfStackSlots = info->GetStackSlotsNumber();
DEBUG_NOT_OSX_ARM64_ASSERT(numberOfStackSlots == info->numSlots);
#endif
DEBUG_NOT_OSX_ARM64_ASSERT(slotNumber == info->slotNum);

putArg =
new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg,
info->slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(info->numSlots),
slotNumber PUT_STRUCT_ARG_STK_ONLY_ARG(numberOfStackSlots),
call->IsFastTailCall(), call);

#ifdef FEATURE_PUT_STRUCT_ARG_STK
Expand Down Expand Up @@ -2115,7 +2132,10 @@ GenTree* Lowering::LowerTailCallViaJitHelper(GenTreeCall* call, GenTree* callTar
// We need to figure out the size of the outgoing stack arguments, not including the special args.
// The number of 4-byte words is passed to the helper for the incoming and outgoing argument sizes.
// This number is exactly the next slot number in the call's argument info struct.
unsigned nNewStkArgsWords = call->fgArgInfo->GetNextSlotNum();
unsigned nNewStkArgsBytes = call->fgArgInfo->GetNextSlotByteOffset();
const int wordSize = 4;
unsigned nNewStkArgsWords = nNewStkArgsBytes / wordSize;
DEBUG_NOT_OSX_ARM64_ASSERT(call->fgArgInfo->GetNextSlotNum() == nNewStkArgsWords);
assert(nNewStkArgsWords >= 4); // There must be at least the four special stack args.
nNewStkArgsWords -= 4;

Expand Down Expand Up @@ -4167,7 +4187,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call)
// On x86 targets, PInvoke calls need the size of the stack args in InlinedCallFrame.m_Datum.
// This is because the callee pops stack arguments, and we need to keep track of this during stack
// walking
const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotNum() * TARGET_POINTER_SIZE;
const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotByteOffset();
GenTree* stackBytes = comp->gtNewIconNode(numStkArgBytes, TYP_INT);
GenTreeCall::Use* args = comp->gtNewCallArgs(frameAddr, stackBytes);
#else
Expand Down Expand Up @@ -4201,9 +4221,9 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call)
{
#if !defined(TARGET_64BIT)
// On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum.
const unsigned numStkArgBytes = call->fgArgInfo->GetNextSlotNum() * TARGET_POINTER_SIZE;
const unsigned stackByteOffset = call->fgArgInfo->GetNextSlotByteOffset();

src = comp->gtNewIconNode(numStkArgBytes, TYP_INT);
src = comp->gtNewIconNode(stackByteOffset, TYP_INT);
#else
// On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum.
// If the stub parameter value is not needed, m_Datum will be initialized by the VM.
Expand Down
Loading