Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Arm64] Keep unrolling InitBlock and CopyBlock up to 128 bytes #63422

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 41 additions & 8 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
GenTree* src = blkNode->Data();
unsigned size = blkNode->Size();

const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();

if (blkNode->OperIsInitBlkOp())
{
if (src->OperIs(GT_INIT_VAL))
Expand All @@ -306,7 +308,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
blkNode->SetOper(GT_STORE_BLK);
}

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isDstAddrLocal)
{
// Since dstAddr points to the stack, CodeGen can use more efficient
// quad-word SIMD store instructions for InitBlock.
initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
}
#endif

if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down Expand Up @@ -353,27 +366,47 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
assert(src->OperIs(GT_IND, GT_LCL_VAR, GT_LCL_FLD));
src->SetContained();

bool isSrcAddrLocal = false;

if (src->OperIs(GT_IND))
{
GenTree* srcAddr = src->AsIndir()->Addr();
// TODO-Cleanup: Make sure that GT_IND lowering didn't mark the source address as contained.
// Sometimes the GT_IND type is a non-struct type and then GT_IND lowering may contain the
// address, not knowing that GT_IND is part of a block op that has containment restrictions.
src->AsIndir()->Addr()->ClearContained();
srcAddr->ClearContained();
isSrcAddrLocal = srcAddr->OperIsLocalAddr();
}
else
{
isSrcAddrLocal = true;

if (src->OperIs(GT_LCL_VAR))
{
// TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register.
const unsigned srcLclNum = src->AsLclVar()->GetLclNum();
comp->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::BlockOp));
}
}
else if (src->OperIs(GT_LCL_VAR))

unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;

#ifdef TARGET_ARM64
if (isSrcAddrLocal && isDstAddrLocal)
{
// TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register.
const unsigned srcLclNum = src->AsLclVar()->GetLclNum();
comp->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::BlockOp));
// Since both srcAddr and dstAddr point to the stack, CodeGen can use more efficient
// quad-word SIMD load and store instructions for CopyBlock.
copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
}
#endif

if (blkNode->OperIs(GT_STORE_OBJ))
{
if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
{
blkNode->SetOper(GT_STORE_BLK);
}
else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
else if (isDstAddrLocal && (size <= copyBlockUnrollLimit))
{
// If the size is small enough to unroll then we need to mark the block as non-interruptible
// to actually allow unrolling. The generated code does not report GC references loaded in the
Expand All @@ -389,7 +422,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)

blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
}
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
else if (blkNode->OperIs(GT_STORE_BLK) && (size <= copyBlockUnrollLimit))
{
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

Expand Down
6 changes: 4 additions & 2 deletions src/coreclr/jit/targetarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
#define ROUND_FLOAT 0 // Do not round intermed float expression results
#define CPU_HAS_BYTE_REGS 0

// Block-operation unroll limits. Lowering compares a block store's size against the applicable
// limit and selects the unrolled form (BlkOpKindUnroll) only when the size does not exceed it.
// Sizes are presumably in bytes (the change description speaks of unrolling "up to 128 bytes").
#define CPBLK_UNROLL_LIMIT 64 // Largest CpBlk size that the code generator may unroll.
#define INITBLK_UNROLL_LIMIT 64 // Largest InitBlk size that the code generator may unroll.
#define CPBLK_UNROLL_LIMIT 64 // Largest CpBlk size that the code generator may unroll.
#define CPBLK_LCL_UNROLL_LIMIT 128 // Largest CpBlk size that may be unrolled when both srcAddr and dstAddr point to the stack.
#define INITBLK_UNROLL_LIMIT 64 // Largest InitBlk size that the code generator may unroll.
#define INITBLK_LCL_UNROLL_LIMIT 128 // Largest InitBlk size that may be unrolled when dstAddr points to the stack.

#ifdef FEATURE_SIMD
#define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
Expand Down