From bbf42c1f756b1ebf466b79ec7a5064b8c5063d2a Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 4 Jan 2022 16:06:46 -0800 Subject: [PATCH 1/3] Add INITBLK_LCL_UNROLL_LIMIT and CPBLK_LCL_UNROLL_LIMIT of 128 bytes in src/coreclr/jit/targetarm64.h --- src/coreclr/jit/targetarm64.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index cdab21582ffced..4cc6b63f73009f 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -11,8 +11,10 @@ #define ROUND_FLOAT 0 // Do not round intermed float expression results #define CPU_HAS_BYTE_REGS 0 - #define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk. - #define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk. + #define CPBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll CpBlk + #define CPBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack) + #define INITBLK_UNROLL_LIMIT 64 // Upper bound to let the code generator to loop unroll InitBlk + #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack) #ifdef FEATURE_SIMD #define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned From 22f540ed7f882192bc3a28823c40d46471220df9 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Wed, 5 Jan 2022 14:51:10 -0800 Subject: [PATCH 2/3] Keep unrolling InitBlock up to INITBLK_LCL_UNROLL_LIMIT bytes when dstAddr points to the stack in src/coreclr/jit/lowerarmarch.cpp --- src/coreclr/jit/lowerarmarch.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 58301b37b887b7..1f52c32ba4da76 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ 
b/src/coreclr/jit/lowerarmarch.cpp @@ -293,6 +293,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) GenTree* src = blkNode->Data(); unsigned size = blkNode->Size(); + const bool isDstAddrLocal = dstAddr->OperIsLocalAddr(); + if (blkNode->OperIsInitBlkOp()) { if (src->OperIs(GT_INIT_VAL)) @@ -306,7 +308,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) blkNode->SetOper(GT_STORE_BLK); } - if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT)) + unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT; + +#ifdef TARGET_ARM64 + if (isDstAddrLocal) + { + // Since dstAddr points to the stack, CodeGen can use more efficient + // quad-word SIMD store instructions for InitBlock. + initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT; + } +#endif + + if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT)) { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; From 22116ba9e1b5df12bea4098b6aafa7906da826c2 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Wed, 5 Jan 2022 15:16:27 -0800 Subject: [PATCH 3/3] Keep unrolling CopyBlock up to CPBLK_LCL_UNROLL_LIMIT bytes when both srcAddr and dstAddr point to the stack in src/coreclr/jit/lowerarmarch.cpp --- src/coreclr/jit/lowerarmarch.cpp | 34 +++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 1f52c32ba4da76..3769c762fac239 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -366,27 +366,47 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) assert(src->OperIs(GT_IND, GT_LCL_VAR, GT_LCL_FLD)); src->SetContained(); + bool isSrcAddrLocal = false; + if (src->OperIs(GT_IND)) { + GenTree* srcAddr = src->AsIndir()->Addr(); // TODO-Cleanup: Make sure that GT_IND lowering didn't mark the source address as contained.
// Sometimes the GT_IND type is a non-struct type and then GT_IND lowering may contain the // address, not knowing that GT_IND is part of a block op that has containment restrictions. - src->AsIndir()->Addr()->ClearContained(); + srcAddr->ClearContained(); + isSrcAddrLocal = srcAddr->OperIsLocalAddr(); } - else if (src->OperIs(GT_LCL_VAR)) + else { - // TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register. - const unsigned srcLclNum = src->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::BlockOp)); + isSrcAddrLocal = true; + + if (src->OperIs(GT_LCL_VAR)) + { + // TODO-1stClassStructs: for now we can't work with STORE_BLOCK source in register. + const unsigned srcLclNum = src->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(srcLclNum DEBUGARG(DoNotEnregisterReason::BlockOp)); + } } + unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT; + +#ifdef TARGET_ARM64 + if (isSrcAddrLocal && isDstAddrLocal) + { + // Since both srcAddr and dstAddr point to the stack, CodeGen can use more efficient + // quad-word SIMD load and store instructions for CopyBlock. + copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT; + } +#endif + if (blkNode->OperIs(GT_STORE_OBJ)) { if (!blkNode->AsObj()->GetLayout()->HasGCPtr()) { blkNode->SetOper(GT_STORE_BLK); } - else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT)) + else if (isDstAddrLocal && (size <= copyBlockUnrollLimit)) { // If the size is small enough to unroll then we need to mark the block as non-interruptible // to actually allow unrolling. The generated code does not report GC references loaded in the @@ -402,7 +422,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; } - else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT)) + else if (blkNode->OperIs(GT_STORE_BLK) && (size <= copyBlockUnrollLimit)) { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;