diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index ba88ac065e7e63..24e46849decf75 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3124,7 +3124,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 
         if (compiler->info.compInitMem)
         {
-            if (amount <= LCLHEAP_UNROLL_LIMIT)
+            if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
             {
                 // The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
                 // stp xzr, xzr, [sp, #-16]!
diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp
index 658cb32f02ba23..0498378155d517 100644
--- a/src/coreclr/jit/codegenloongarch64.cpp
+++ b/src/coreclr/jit/codegenloongarch64.cpp
@@ -2765,7 +2765,7 @@ void CodeGen::genCodeForDivMod(GenTreeOp* tree)
 // Generate code for InitBlk by performing a loop unroll
 // Preconditions:
 //   a) Both the size and fill byte value are integer constants.
-//   b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
+//   b) The size of the struct to initialize is smaller than getUnrollThreshold() bytes.
 void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 {
     assert(node->OperIs(GT_STORE_BLK));
@@ -6457,7 +6457,7 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
 //    None
 //
 // Assumption:
-//    The size argument of the CpBlk node is a constant and <= CPBLK_UNROLL_LIMIT bytes.
+//    The size argument of the CpBlk node is a constant and <= getUnrollThreshold() bytes.
 //
 void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
 {
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 4d18a72d2adb1a..1ede2de60c1aaa 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3613,7 +3613,8 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     }
 
     unsigned loadSize = putArgNode->GetArgLoadSize();
-    assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
+    assert(!src->GetLayout(compiler)->HasGCPtr() &&
+           (loadSize <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy)));
 
     unsigned  offset    = 0;
     regNumber xmmTmpReg = REG_NA;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index b772d4071a4122..3c23d4e9da18be 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8944,6 +8944,75 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #endif // FEATURE_SIMD
 
 public:
+    enum UnrollKind
+    {
+        Memset, // Initializing memory with some value
+        Memcpy  // Copying memory from src to dst
+    };
+
+    //------------------------------------------------------------------------
+    // getUnrollThreshold: Calculates the unrolling threshold for the given operation
+    //
+    // Arguments:
+    //    type       - kind of the operation (memset/memcpy)
+    //    canUseSimd - whether it is allowed to use SIMD or not
+    //
+    // Return Value:
+    //    The unrolling threshold for the given operation in bytes
+    //
+    unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
+    {
+        unsigned threshold = TARGET_POINTER_SIZE;
+
+#if defined(FEATURE_SIMD)
+        if (canUseSimd)
+        {
+            threshold = maxSIMDStructBytes();
+#if defined(TARGET_ARM64)
+            // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
+            //
+            //   ldp q0, q1, [x1]
+            //   stp q0, q1, [x0]
+            //
+            threshold *= 2;
+#elif defined(TARGET_XARCH)
+            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+            threshold = min(threshold, (unsigned)YMM_REGSIZE_BYTES);
+#endif
+        }
+#if defined(TARGET_XARCH)
+        else
+        {
+            // Compatibility with previous logic: we used to allow memset:128/memcpy:64
+            // on AMD64 (and 64/32 on x86) for cases where we don't use SIMD,
+            // see https://github.com/dotnet/runtime/issues/83297
+            threshold *= 2;
+        }
+#endif
+#endif
+
+        if (type == UnrollKind::Memset)
+        {
+            // Typically, memset-like operations require fewer instructions than memcpy
+            threshold *= 2;
+        }
+
+        // Use 4 as a multiplier by default; thus, the final thresholds are:
+        //
+        //   | arch        | memset | memcpy |
+        //   |-------------|--------|--------|
+        //   | x86 avx512  |  512   |  256   | (TODO-XARCH-AVX512: ignored for now)
+        //   | x86 avx     |  256   |  128   |
+        //   | x86 sse     |  128   |   64   |
+        //   | arm64       |  256   |  128   | ldp/stp (2x128bit)
+        //   | arm         |   32   |   16   | no SIMD support
+        //   | loongarch64 |   64   |   32   | no SIMD support
+        //
+        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
+        //
+        return threshold * 4;
+    }
+
     //------------------------------------------------------------------------
     // largestEnregisterableStruct: The size in bytes of the largest struct that can be enregistered.
     //
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 2952e1c96cf4f8..7846ec2396544a 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -527,18 +527,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;
-
-#ifdef TARGET_ARM64
-    if (isDstAddrLocal)
-    {
-        // Since dstAddr points to the stack CodeGen can use more optimal
-        // quad-word store SIMD instructions for InitBlock.
-        initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
-    }
-#endif
-
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
+        src->OperIs(GT_CNS_INT))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -608,17 +598,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             }
         }
 
-        unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;
-
-#ifdef TARGET_ARM64
-        if (isSrcAddrLocal && isDstAddrLocal)
-        {
-            // Since both srcAddr and dstAddr point to the stack CodeGen can use more optimal
-            // quad-word load and store SIMD instructions for CopyBlock.
-            copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
-        }
-#endif
-
+        unsigned copyBlockUnrollLimit = comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy);
         if (blkNode->OperIs(GT_STORE_OBJ))
         {
             if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp
index 0c417ff543155b..f2a4320bbf9143 100644
--- a/src/coreclr/jit/lowerloongarch64.cpp
+++ b/src/coreclr/jit/lowerloongarch64.cpp
@@ -295,7 +295,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
+        src->OperIs(GT_CNS_INT))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -353,7 +354,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
        {
             blkNode->SetOper(GT_STORE_BLK);
         }
-        else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
+        else if (dstAddr->OperIsLocalAddr() && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
         {
             // If the size is small enough to unroll then we need to mark the block as non-interruptible
             // to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -371,7 +372,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
         ////////////////////////////////////////////////////////////////////////////////////////////////////////
-        else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
+        else if (blkNode->OperIs(GT_STORE_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
         {
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index afcbe5d82c1860..490c70076a527d 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -321,7 +321,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)))
     {
         if (!src->OperIs(GT_CNS_INT))
         {
@@ -332,7 +332,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         }
         else
         {
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
             // The fill value of an initblk is interpreted to hold a
             // value of (unsigned int8) however a constant of any size
@@ -357,6 +356,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
                 {
                     src->SetContained();
                 }
+                else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, /*canUseSimd*/ false))
+                {
+                    // It turns out we can't use SIMD, so the default threshold is too big
+                    goto TOO_BIG_TO_UNROLL;
+                }
             }
         }
 #ifdef TARGET_AMD64
@@ -371,6 +375,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
                 fill *= 0x01010101;
             }
 
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
             src->AsIntCon()->SetIconValue(fill);
 
             ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr);
@@ -378,6 +383,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         }
         else
         {
+        TOO_BIG_TO_UNROLL:
#ifdef TARGET_AMD64
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
 #else
@@ -412,7 +418,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             blkNode->SetOper(GT_STORE_BLK);
         }
 #ifndef JIT32_GCENCODER
-        else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
+        else if (dstAddr->OperIsLocalAddr() &&
+                 (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, false)))
         {
             // If the size is small enough to unroll then we need to mark the block as non-interruptible
             // to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -472,7 +479,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
     }
-    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
+    else if (blkNode->OperIs(GT_STORE_BLK) &&
+             (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, !blkNode->GetLayout()->HasGCPtr())))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -655,7 +663,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
             }
             else
 #endif // TARGET_X86
-                if (loadSize <= CPBLK_UNROLL_LIMIT)
+                if (loadSize <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy))
             {
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
             }
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 0f7c3b4d9f1a7c..394f402d0a3383 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -591,7 +591,7 @@ int LinearScan::BuildNode(GenTree* tree)
                 // localloc.
                 sizeVal = AlignUp(sizeVal, STACK_ALIGN);
 
-                if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
+                if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
                 {
                     // Need no internal registers
                 }
diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h
index ac3f0ca7e8c027..5d646bfff2ae32 100644
--- a/src/coreclr/jit/targetamd64.h
+++ b/src/coreclr/jit/targetamd64.h
@@ -13,8 +13,6 @@
   #define ROUND_FLOAT              0       // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       64      // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     128     // Upper bound to let the code generator to loop unroll InitBlk.
   #define CPOBJ_NONGC_SLOTS_LIMIT  4       // For CpObj code generation, this is the threshold of the number
                                            // of contiguous non-gc slots that trigger generating rep movsq instead of
                                            // sequences of movsq instructions
diff --git a/src/coreclr/jit/targetarm.h b/src/coreclr/jit/targetarm.h
index 60b01d6063dcbd..3fa00cacbf4700 100644
--- a/src/coreclr/jit/targetarm.h
+++ b/src/coreclr/jit/targetarm.h
@@ -14,9 +14,6 @@
   #define ROUND_FLOAT              0       // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       32      // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     16      // Upper bound to let the code generator to loop unroll InitBlk.
-
   #define FEATURE_FIXED_OUT_ARGS   1       // Preallocate the outgoing arg area in the prolog
   #define FEATURE_STRUCTPROMOTE    1       // JIT Optimization to promote fields of structs into registers
   #define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h
index b30c7af7e40d98..a454b47608bb37 100644
--- a/src/coreclr/jit/targetarm64.h
+++ b/src/coreclr/jit/targetarm64.h
@@ -11,12 +11,6 @@
   #define ROUND_FLOAT              0       // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       64      // Upper bound to let the code generator to loop unroll CpBlk
-  #define CPBLK_LCL_UNROLL_LIMIT   128     // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
-  #define INITBLK_UNROLL_LIMIT     64      // Upper bound to let the code generator to loop unroll InitBlk
-  #define INITBLK_LCL_UNROLL_LIMIT 128     // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
-  #define LCLHEAP_UNROLL_LIMIT     128     // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)
-
 #ifdef FEATURE_SIMD
   #define ALIGN_SIMD_TYPES         1       // whether SIMD type locals are to be aligned
   #define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls
diff --git a/src/coreclr/jit/targetloongarch64.h b/src/coreclr/jit/targetloongarch64.h
index 5d511e73ec3ae6..2a6e68622c9523 100644
--- a/src/coreclr/jit/targetloongarch64.h
+++ b/src/coreclr/jit/targetloongarch64.h
@@ -16,9 +16,6 @@
   #define ROUND_FLOAT              0       // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       64      // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     64      // Upper bound to let the code generator to loop unroll InitBlk.
-
 #ifdef FEATURE_SIMD
 #pragma error("SIMD Unimplemented yet LOONGARCH")
   #define ALIGN_SIMD_TYPES         1       // whether SIMD type locals are to be aligned
diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index dffd6adf2efb08..4c18c408ec0c85 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -11,10 +11,6 @@
   #define ROUND_FLOAT              1       // round intermed float expression results
   #define CPU_HAS_BYTE_REGS        1
 
-  // TODO-CQ: Fine tune the following xxBlk threshold values:
-
-  #define CPBLK_UNROLL_LIMIT       64      // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     128     // Upper bound to let the code generator to loop unroll InitBlk.
   #define CPOBJ_NONGC_SLOTS_LIMIT  4       // For CpObj code generation, this is the threshold of the number
                                            // of contiguous non-gc slots that trigger generating rep movsq instead of
                                            // sequences of movsq instructions
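
For reference, the table in the new getUnrollThreshold comment can be checked outside the JIT. Below is a minimal standalone C++ sketch, not code from this change: ArchTraits and its values are hypothetical stand-ins for what TARGET_POINTER_SIZE, maxSIMDStructBytes(), and YMM_REGSIZE_BYTES provide inside the compiler.

// Standalone sketch (hypothetical helper, not part of this diff) mirroring the
// arithmetic of Compiler::getUnrollThreshold so the comment table can be
// verified by hand.
#include <algorithm>
#include <cstdio>

enum class UnrollKind { Memset, Memcpy };

struct ArchTraits
{
    const char* name;
    unsigned    pointerSize; // stand-in for TARGET_POINTER_SIZE
    unsigned    simdSize;    // stand-in for maxSIMDStructBytes(); 0 = FEATURE_SIMD off
    bool        isArm64;     // doubles the threshold for ldp/stp vector pairs
    bool        isXarch;     // caps at YMM size; bumps the no-SIMD fallback
};

static unsigned getUnrollThreshold(const ArchTraits& arch, UnrollKind kind, bool canUseSimd = true)
{
    unsigned threshold = arch.pointerSize;

    if (arch.simdSize != 0) // FEATURE_SIMD
    {
        if (canUseSimd)
        {
            threshold = arch.simdSize;
            if (arch.isArm64)
            {
                threshold *= 2; // ldp/stp move two 16-byte vectors per instruction
            }
            else if (arch.isXarch)
            {
                threshold = std::min(threshold, 32u); // cap at YMM_REGSIZE_BYTES for now
            }
        }
        else if (arch.isXarch)
        {
            threshold *= 2; // compatibility bump for the non-SIMD path
        }
    }

    if (kind == UnrollKind::Memset)
    {
        threshold *= 2; // memset needs roughly half the instructions of memcpy
    }

    return threshold * 4; // default multiplier
}

int main()
{
    const ArchTraits archs[] = {
        {"x86 avx",     8, 32, false, true},
        {"x86 sse",     8, 16, false, true},
        {"arm64",       8, 16, true,  false},
        {"arm",         4, 0,  false, false},
        {"loongarch64", 8, 0,  false, false},
    };

    for (const ArchTraits& arch : archs)
    {
        std::printf("%-12s memset=%3u memcpy=%3u\n", arch.name,
                    getUnrollThreshold(arch, UnrollKind::Memset),
                    getUnrollThreshold(arch, UnrollKind::Memcpy));
    }
    return 0;
}

Compiled and run, this prints 256/128 for x86 avx, 128/64 for x86 sse, 256/128 for arm64, 32/16 for arm, and 64/32 for loongarch64, matching the comment table; the avx512 row stays at the YMM-capped values until the TODO-XARCH-AVX512 item is addressed.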