From 6426a70abb5056521c2676d811b9ae8eb8a6eaef Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Fri, 10 Mar 2023 20:14:52 +0100
Subject: [PATCH 1/7] Unify unroll limits in a single entry point

---
 src/coreclr/jit/codegenarm64.cpp       |  2 +-
 src/coreclr/jit/codegenloongarch64.cpp |  4 +--
 src/coreclr/jit/codegenxarch.cpp       |  2 +-
 src/coreclr/jit/compiler.h             | 43 ++++++++++++++++++++++++++
 src/coreclr/jit/lowerarmarch.cpp       | 16 ++--------
 src/coreclr/jit/lowerloongarch64.cpp   |  7 +++--
 src/coreclr/jit/lowerxarch.cpp         |  8 ++---
 src/coreclr/jit/lsraarm64.cpp          |  2 +-
 src/coreclr/jit/targetamd64.h          |  2 --
 src/coreclr/jit/targetarm.h            |  3 --
 src/coreclr/jit/targetarm64.h          |  6 ----
 src/coreclr/jit/targetloongarch64.h    |  3 --
 src/coreclr/jit/targetx86.h            |  2 --
 13 files changed, 59 insertions(+), 41 deletions(-)

diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 9dcd37c1799a71..43ba28a397fb19 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3124,7 +3124,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 
         if (compiler->info.compInitMem)
         {
-            if (amount <= LCLHEAP_UNROLL_LIMIT)
+            if (amount <= getUnrollThreshold(UnrollKind::Memset))
             {
                 // The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
                 // stp xzr, xzr, [sp, #-16]!
diff --git a/src/coreclr/jit/codegenloongarch64.cpp b/src/coreclr/jit/codegenloongarch64.cpp
index 658cb32f02ba23..0498378155d517 100644
--- a/src/coreclr/jit/codegenloongarch64.cpp
+++ b/src/coreclr/jit/codegenloongarch64.cpp
@@ -2765,7 +2765,7 @@ void CodeGen::genCodeForDivMod(GenTreeOp* tree)
 // Generate code for InitBlk by performing a loop unroll
 // Preconditions:
 //   a) Both the size and fill byte value are integer constants.
-//   b) The size of the struct to initialize is smaller than INITBLK_UNROLL_LIMIT bytes.
+//   b) The size of the struct to initialize is smaller than getUnrollThreshold() bytes.
 void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 {
     assert(node->OperIs(GT_STORE_BLK));
@@ -6457,7 +6457,7 @@ void CodeGen::genCodeForCpBlkHelper(GenTreeBlk* cpBlkNode)
 //    None
 //
 // Assumption:
-//    The size argument of the CpBlk node is a constant and <= CPBLK_UNROLL_LIMIT bytes.
+//    The size argument of the CpBlk node is a constant and <= getUnrollThreshold() bytes.
 //
 void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode)
 {
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 4d18a72d2adb1a..ae9707da66bcde 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3613,7 +3613,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     }
 
     unsigned loadSize = putArgNode->GetArgLoadSize();
-    assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
+    assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= getUnrollThreshold(UnrollKind::Memcpy)));
 
     unsigned  offset    = 0;
     regNumber xmmTmpReg = REG_NA;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 912f52d7f27385..df775b7dc5b988 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8917,6 +8917,49 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         return emitTypeSize(TYP_SIMD8);
     }
 
+    enum UnrollKind
+    {
+        Memset, // Initializing memory with some value
+        Memcpy  // Copying memory from src to dst
+    };
+
+    unsigned int getUnrollThreshold(UnrollKind type)
+    {
+        unsigned threshold = maxSIMDStructBytes();
+#if defined(TARGET_ARM64)
+        // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
+        //
+        //   ldp q0, q1, [x1]
+        //   stp q0, q1, [x0]
+        //
+        threshold *= 2;
+#elif defined(TARGET_XARCH)
+        // Ignore AVX-512 for now
+        threshold = min(threshold, YMM_REGSIZE_BYTES);
+#endif
+
+        if (type == UnrollKind::Memset)
+        {
+            // Typically, memset-like operations require fewer instructions than memcpy
+            threshold *= 2;
+        }
+
+        // Use 4 as a multiplier by default; thus, the final threshold will be:
+        //
+        // | arch       | memset | memcpy |
+        // |------------|--------|--------|
+        // | x86 avx512 | 512    | 256    | (ignored for now)
+        // | x86 avx    | 256    | 128    |
+        // | x86 sse    | 128    | 64     |
+        // | arm64      | 256    | 128    |
+        // | arm        | 128    | 64     |
+        //
+        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
+        //
+        // As of today, we don't use SVE/SVE2, DC ZVA and hardware memset/memcpy instructions on ARM64
+        return threshold * 4;
+    }
+
 public:
     // Returns the codegen type for a given SIMD size.
     static var_types getSIMDTypeForSize(unsigned size)
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 2952e1c96cf4f8..0cc476e78810fa 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -527,18 +527,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    unsigned initBlockUnrollLimit = INITBLK_UNROLL_LIMIT;
-
-#ifdef TARGET_ARM64
-    if (isDstAddrLocal)
-    {
-        // Since dstAddr points to the stack CodeGen can use more optimal
-        // quad-word store SIMD instructions for InitBlock.
-        initBlockUnrollLimit = INITBLK_LCL_UNROLL_LIMIT;
-    }
-#endif
-
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= initBlockUnrollLimit) && src->OperIs(GT_CNS_INT))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
+        src->OperIs(GT_CNS_INT))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -608,7 +598,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         }
     }
 
-    unsigned copyBlockUnrollLimit = CPBLK_UNROLL_LIMIT;
+    unsigned copyBlockUnrollLimit = getUnrollThreshold(UnrollKind::Memcpy);
 
 #ifdef TARGET_ARM64
     if (isSrcAddrLocal && isDstAddrLocal)
diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp
index 0c417ff543155b..f2a4320bbf9143 100644
--- a/src/coreclr/jit/lowerloongarch64.cpp
+++ b/src/coreclr/jit/lowerloongarch64.cpp
@@ -295,7 +295,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT) && src->OperIs(GT_CNS_INT))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
+        src->OperIs(GT_CNS_INT))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -353,7 +354,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
     {
         blkNode->SetOper(GT_STORE_BLK);
    }
-    else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
+    else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
     {
         // If the size is small enough to unroll then we need to mark the block as non-interruptible
         // to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -371,7 +372,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
     }
     ////////////////////////////////////////////////////////////////////////////////////////////////////////
-    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
+    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index afcbe5d82c1860..ae0d2694187a00 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -321,7 +321,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= INITBLK_UNROLL_LIMIT))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)))
     {
         if (!src->OperIs(GT_CNS_INT))
         {
@@ -412,7 +412,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 #ifndef JIT32_GCENCODER
-    else if (dstAddr->OperIsLocalAddr() && (size <= CPBLK_UNROLL_LIMIT))
+    else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
     {
         // If the size is small enough to unroll then we need to mark the block as non-interruptible
         // to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -472,7 +472,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
     }
-    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= CPBLK_UNROLL_LIMIT))
+    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -655,7 +655,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
     }
     else
 #endif // TARGET_X86
-        if (loadSize <= CPBLK_UNROLL_LIMIT)
+        if (loadSize <= getUnrollThreshold(UnrollKind::Memcpy))
     {
         putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
     }
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 0f7c3b4d9f1a7c..a32d2d6ea11471 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -591,7 +591,7 @@ int LinearScan::BuildNode(GenTree* tree)
             // localloc.
             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
 
-            if (sizeVal <= LCLHEAP_UNROLL_LIMIT)
+            if (sizeVal <= getUnrollThreshold(UnrollKind::Memset))
             {
                 // Need no internal registers
             }
diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h
index ac3f0ca7e8c027..5d646bfff2ae32 100644
--- a/src/coreclr/jit/targetamd64.h
+++ b/src/coreclr/jit/targetamd64.h
@@ -13,8 +13,6 @@
   #define ROUND_FLOAT              0  // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       64  // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     128 // Upper bound to let the code generator to loop unroll InitBlk.
   #define CPOBJ_NONGC_SLOTS_LIMIT  4   // For CpObj code generation, this is the threshold of the number
                                        // of contiguous non-gc slots that trigger generating rep movsq instead of
                                        // sequences of movsq instructions
diff --git a/src/coreclr/jit/targetarm.h b/src/coreclr/jit/targetarm.h
index 60b01d6063dcbd..3fa00cacbf4700 100644
--- a/src/coreclr/jit/targetarm.h
+++ b/src/coreclr/jit/targetarm.h
@@ -14,9 +14,6 @@
   #define ROUND_FLOAT              0  // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       32 // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     16 // Upper bound to let the code generator to loop unroll InitBlk.
-
   #define FEATURE_FIXED_OUT_ARGS   1  // Preallocate the outgoing arg area in the prolog
   #define FEATURE_STRUCTPROMOTE    1  // JIT Optimization to promote fields of structs into registers
   #define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h
index b30c7af7e40d98..a454b47608bb37 100644
--- a/src/coreclr/jit/targetarm64.h
+++ b/src/coreclr/jit/targetarm64.h
@@ -11,12 +11,6 @@
   #define ROUND_FLOAT              0  // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       64  // Upper bound to let the code generator to loop unroll CpBlk
-  #define CPBLK_LCL_UNROLL_LIMIT   128 // Upper bound to let the code generator to loop unroll CpBlk (when both srcAddr and dstAddr point to the stack)
-  #define INITBLK_UNROLL_LIMIT     64  // Upper bound to let the code generator to loop unroll InitBlk
-  #define INITBLK_LCL_UNROLL_LIMIT 128 // Upper bound to let the code generator to loop unroll InitBlk (when dstAddr points to the stack)
-  #define LCLHEAP_UNROLL_LIMIT     128 // Upper bound to let the code generator to loop unroll LclHeap (when zeroing is required)
-
 #ifdef FEATURE_SIMD
   #define ALIGN_SIMD_TYPES         1  // whether SIMD type locals are to be aligned
   #define FEATURE_PARTIAL_SIMD_CALLEE_SAVE 1 // Whether SIMD registers are partially saved at calls
diff --git a/src/coreclr/jit/targetloongarch64.h b/src/coreclr/jit/targetloongarch64.h
index 5d511e73ec3ae6..2a6e68622c9523 100644
--- a/src/coreclr/jit/targetloongarch64.h
+++ b/src/coreclr/jit/targetloongarch64.h
@@ -16,9 +16,6 @@
   #define ROUND_FLOAT              0  // Do not round intermed float expression results
   #define CPU_HAS_BYTE_REGS        0
 
-  #define CPBLK_UNROLL_LIMIT       64 // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     64 // Upper bound to let the code generator to loop unroll InitBlk.
-
 #ifdef FEATURE_SIMD
 #pragma error("SIMD Unimplemented yet LOONGARCH")
   #define ALIGN_SIMD_TYPES         1  // whether SIMD type locals are to be aligned
diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index dffd6adf2efb08..167eacb3bcdae9 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -13,8 +13,6 @@
 
   // TODO-CQ: Fine tune the following xxBlk threshold values:
 
-  #define CPBLK_UNROLL_LIMIT       64  // Upper bound to let the code generator to loop unroll CpBlk.
-  #define INITBLK_UNROLL_LIMIT     128 // Upper bound to let the code generator to loop unroll InitBlk.
   #define CPOBJ_NONGC_SLOTS_LIMIT  4   // For CpObj code generation, this is the threshold of the number
                                        // of contiguous non-gc slots that trigger generating rep movsq instead of
                                        // sequences of movsq instructions

From 6bcfedf1be5404291c6e412190f76a818cf499e9 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Fri, 10 Mar 2023 20:51:47 +0100
Subject: [PATCH 2/7] Fix compilation

---
 src/coreclr/jit/codegenarm64.cpp |  2 +-
 src/coreclr/jit/codegenxarch.cpp |  3 +-
 src/coreclr/jit/compiler.h       | 90 +++++++++++++++++---------------
 src/coreclr/jit/lowerarmarch.cpp | 14 +----
 src/coreclr/jit/lowerxarch.cpp   |  8 +--
 src/coreclr/jit/lsraarm64.cpp    |  2 +-
 6 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp
index 43ba28a397fb19..77ed96519ec88c 100644
--- a/src/coreclr/jit/codegenarm64.cpp
+++ b/src/coreclr/jit/codegenarm64.cpp
@@ -3124,7 +3124,7 @@ void CodeGen::genLclHeap(GenTree* tree)
 
         if (compiler->info.compInitMem)
         {
-            if (amount <= getUnrollThreshold(UnrollKind::Memset))
+            if (amount <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
             {
                 // The following zeroes the last 16 bytes and probes the page containing [sp, #16] address.
                 // stp xzr, xzr, [sp, #-16]!
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index ae9707da66bcde..1ede2de60c1aaa 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3613,7 +3613,8 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     }
 
     unsigned loadSize = putArgNode->GetArgLoadSize();
-    assert(!src->GetLayout(compiler)->HasGCPtr() && (loadSize <= getUnrollThreshold(UnrollKind::Memcpy)));
+    assert(!src->GetLayout(compiler)->HasGCPtr() &&
+           (loadSize <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memcpy)));
 
     unsigned  offset    = 0;
     regNumber xmmTmpReg = REG_NA;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index df775b7dc5b988..af0a7ee43fb637 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8917,49 +8917,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         return emitTypeSize(TYP_SIMD8);
     }
 
-    enum UnrollKind
-    {
-        Memset, // Initializing memory with some value
-        Memcpy  // Copying memory from src to dst
-    };
-
-    unsigned int getUnrollThreshold(UnrollKind type)
-    {
-        unsigned threshold = maxSIMDStructBytes();
-#if defined(TARGET_ARM64)
-        // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
-        //
-        //   ldp q0, q1, [x1]
-        //   stp q0, q1, [x0]
-        //
-        threshold *= 2;
-#elif defined(TARGET_XARCH)
-        // Ignore AVX-512 for now
-        threshold = min(threshold, YMM_REGSIZE_BYTES);
-#endif
-
-        if (type == UnrollKind::Memset)
-        {
-            // Typically, memset-like operations require fewer instructions than memcpy
-            threshold *= 2;
-        }
-
-        // Use 4 as a multiplier by default; thus, the final threshold will be:
-        //
-        // | arch       | memset | memcpy |
-        // |------------|--------|--------|
-        // | x86 avx512 | 512    | 256    | (ignored for now)
-        // | x86 avx    | 256    | 128    |
-        // | x86 sse    | 128    | 64     |
-        // | arm64      | 256    | 128    |
-        // | arm        | 128    | 64     |
-        //
-        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
-        //
-        // As of today, we don't use SVE/SVE2, DC ZVA and hardware memset/memcpy instructions on ARM64
-        return threshold * 4;
-    }
-
 public:
     // Returns the codegen type for a given SIMD size.
     static var_types getSIMDTypeForSize(unsigned size)
@@ -9005,6 +8962,53 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #endif // FEATURE_SIMD
 
 public:
+    enum UnrollKind
+    {
+        Memset, // Initializing memory with some value
+        Memcpy  // Copying memory from src to dst
+    };
+
+    unsigned int getUnrollThreshold(UnrollKind type)
+    {
+        unsigned threshold = TARGET_POINTER_SIZE;
+
+#if defined(FEATURE_SIMD)
+        threshold = maxSIMDStructBytes();
+#if defined(TARGET_ARM64)
+        // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
+        //
+        //   ldp q0, q1, [x1]
+        //   stp q0, q1, [x0]
+        //
+        threshold *= 2;
+#elif defined(TARGET_XARCH)
+        // Ignore AVX-512 for now
+        threshold = min(threshold, YMM_REGSIZE_BYTES);
+#endif
+#endif
+
+        if (type == UnrollKind::Memset)
+        {
+            // Typically, memset-like operations require fewer instructions than memcpy
+            threshold *= 2;
+        }
+
+        // Use 4 as a multiplier by default; thus, the final threshold will be:
+        //
+        // | arch       | memset | memcpy |
+        // |------------|--------|--------|
+        // | x86 avx512 | 512    | 256    | (ignored for now)
+        // | x86 avx    | 256    | 128    |
+        // | x86 sse    | 128    | 64     |
+        // | arm64      | 256    | 128    |
+        // | arm        | 128    | 64     |
+        //
+        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
+        //
+        // As of today, we don't use SVE/SVE2, DC ZVA and hardware memset/memcpy instructions on ARM64
+        return threshold * 4;
+    }
+
     //------------------------------------------------------------------------
     // largestEnregisterableStruct: The size in bytes of the largest struct that can be enregistered.
     //
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index 0cc476e78810fa..7846ec2396544a 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -527,7 +527,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)) &&
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)) &&
         src->OperIs(GT_CNS_INT))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
@@ -598,17 +598,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         }
     }
 
-    unsigned copyBlockUnrollLimit = getUnrollThreshold(UnrollKind::Memcpy);
-
-#ifdef TARGET_ARM64
-    if (isSrcAddrLocal && isDstAddrLocal)
-    {
-        // Since both srcAddr and dstAddr point to the stack CodeGen can use more optimal
-        // quad-word load and store SIMD instructions for CopyBlock.
-        copyBlockUnrollLimit = CPBLK_LCL_UNROLL_LIMIT;
-    }
-#endif
-
+    unsigned copyBlockUnrollLimit = comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy);
     if (blkNode->OperIs(GT_STORE_OBJ))
     {
         if (!blkNode->AsObj()->GetLayout()->HasGCPtr())
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index ae0d2694187a00..5e7981f332eb65 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -321,7 +321,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 
-    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= getUnrollThreshold(UnrollKind::Memset)))
+    if (!blkNode->OperIs(GT_STORE_DYN_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memset)))
     {
         if (!src->OperIs(GT_CNS_INT))
         {
@@ -412,7 +412,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 #ifndef JIT32_GCENCODER
-    else if (dstAddr->OperIsLocalAddr() && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
+    else if (dstAddr->OperIsLocalAddr() && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
     {
         // If the size is small enough to unroll then we need to mark the block as non-interruptible
         // to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -472,7 +472,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
     }
-    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= getUnrollThreshold(UnrollKind::Memcpy)))
+    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
@@ -655,7 +655,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
     }
     else
 #endif // TARGET_X86
-        if (loadSize <= getUnrollThreshold(UnrollKind::Memcpy))
+        if (loadSize <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy))
     {
         putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
     }
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index a32d2d6ea11471..394f402d0a3383 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -591,7 +591,7 @@ int LinearScan::BuildNode(GenTree* tree)
             // localloc.
             sizeVal = AlignUp(sizeVal, STACK_ALIGN);
 
-            if (sizeVal <= getUnrollThreshold(UnrollKind::Memset))
+            if (sizeVal <= compiler->getUnrollThreshold(Compiler::UnrollKind::Memset))
             {
                 // Need no internal registers
             }

From a9bb9ae139e6661582a219aa51903dc19d733149 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Sat, 11 Mar 2023 00:23:23 +0100
Subject: [PATCH 3/7] Fix comment

---
 src/coreclr/jit/compiler.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index af0a7ee43fb637..4eeb62bf22e815 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8995,17 +8995,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 
         // Use 4 as a multiplier by default; thus, the final threshold will be:
         //
-        // | arch       | memset | memcpy |
-        // |------------|--------|--------|
-        // | x86 avx512 | 512    | 256    | (ignored for now)
-        // | x86 avx    | 256    | 128    |
-        // | x86 sse    | 128    | 64     |
-        // | arm64      | 256    | 128    |
-        // | arm        | 128    | 64     |
+        // | arch        | memset | memcpy |
+        // |-------------|--------|--------|
+        // | x86 avx512  | 512    | 256    | (ignored for now)
+        // | x86 avx     | 256    | 128    |
+        // | x86 sse     | 128    | 64    |
+        // | arm64       | 256    | 128    | ldp/stp (2x128bit)
+        // | arm         | 32     | 16     | no SIMD support
+        // | loongarch64 | 64     | 32     | no SIMD support
         //
         // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
         //
-        // As of today, we don't use SVE/SVE2, DC ZVA and hardware memset/memcpy instructions on ARM64
         return threshold * 4;
     }

From c1e865f7aefa9b526234aa9470f9448d45c0ecee Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Sat, 11 Mar 2023 01:01:42 +0100
Subject: [PATCH 4/7] Update compiler.h

---
 src/coreclr/jit/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 4eeb62bf22e815..1dc5e59bc8b7c5 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8999,7 +8999,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         // |-------------|--------|--------|
         // | x86 avx512  | 512    | 256    | (ignored for now)
         // | x86 avx     | 256    | 128    |
-        // | x86 sse     | 128    | 64    |
+        // | x86 sse     | 128    | 64     |
         // | arm64       | 256    | 128    | ldp/stp (2x128bit)
         // | arm         | 32     | 16     | no SIMD support
         // | loongarch64 | 64     | 32     | no SIMD support

From e0b40ecb2f2cdcec5047d18762e0db4aa216c69c Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Sat, 11 Mar 2023 18:37:10 +0100
Subject: [PATCH 5/7] Mitigate regressions

---
 src/coreclr/jit/compiler.h     | 32 ++++++++++++++++++++++----------
 src/coreclr/jit/lowerxarch.cpp | 13 ++++++++++---
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index b4b714c8732c98..a4342211f9d72e 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8950,22 +8950,34 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         Memcpy  // Copying memory from src to dst
     };
 
-    unsigned int getUnrollThreshold(UnrollKind type)
+    unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
     {
         unsigned threshold = TARGET_POINTER_SIZE;
 
 #if defined(FEATURE_SIMD)
-        threshold = maxSIMDStructBytes();
+        if (canUseSimd)
+        {
+            threshold = maxSIMDStructBytes();
 #if defined(TARGET_ARM64)
-        // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
-        //
-        //   ldp q0, q1, [x1]
-        //   stp q0, q1, [x0]
-        //
-        threshold *= 2;
+            // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
+            //
+            //   ldp q0, q1, [x1]
+            //   stp q0, q1, [x0]
+            //
+            threshold *= 2;
 #elif defined(TARGET_XARCH)
-        // Ignore AVX-512 for now
-        threshold = min(threshold, YMM_REGSIZE_BYTES);
+            // Ignore AVX-512 for now
+            threshold = min(threshold, YMM_REGSIZE_BYTES);
 #endif
+        }
+#if defined(TARGET_XARCH)
+        else
+        {
+            // Compatibility with previous logic: we used to allow memset:128/memcpy:64
+            // on AMD64 (and 64/32 on x86) for cases where we don't use SIMD
+            // see https://github.com/dotnet/runtime/issues/83297
+            threshold *= 2;
+        }
+#endif
 #endif
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 5e7981f332eb65..1b8034b7e27f91 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -332,7 +332,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         }
         else
         {
-            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
 
             // The fill value of an initblk is interpreted to hold a
             // value of (unsigned int8) however a constant of any size
@@ -357,6 +356,10 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
                 {
                     src->SetContained();
                 }
+                else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, /*canUseSimd*/ false))
+                {
+                    goto TOO_BIG_TO_UNROLL;
+                }
             }
         }
 #ifdef TARGET_AMD64
@@ -371,6 +374,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
                 fill *= 0x01010101;
             }
 
+            blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
             src->AsIntCon()->SetIconValue(fill);
 
             ContainBlockStoreAddress(blkNode, size, dstAddr, nullptr);
         }
         else
         {
+        TOO_BIG_TO_UNROLL:
 #ifdef TARGET_AMD64
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindHelper;
 #else
@@ -412,7 +417,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
         blkNode->SetOper(GT_STORE_BLK);
     }
 #ifndef JIT32_GCENCODER
-    else if (dstAddr->OperIsLocalAddr() && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
+    else if (dstAddr->OperIsLocalAddr() &&
+             (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, false)))
     {
         // If the size is small enough to unroll then we need to mark the block as non-interruptible
         // to actually allow unrolling. The generated code does not report GC references loaded in the
@@ -472,7 +478,8 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
         }
     }
-    else if (blkNode->OperIs(GT_STORE_BLK) && (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy)))
+    else if (blkNode->OperIs(GT_STORE_BLK) &&
+             (size <= comp->getUnrollThreshold(Compiler::UnrollKind::Memcpy, !blkNode->GetLayout()->HasGCPtr())))
     {
         blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;

From 0481c3ed5ffdc007bdab58f14d0571ca2837990e Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Sat, 11 Mar 2023 21:44:12 +0100
Subject: [PATCH 6/7] Clean up

---
 src/coreclr/jit/compiler.h     | 10 ++++++++++
 src/coreclr/jit/lowerxarch.cpp |  1 +
 src/coreclr/jit/targetx86.h    |  2 --
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index a4342211f9d72e..f05c9d5d2e1348 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8950,6 +8950,16 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         Memcpy  // Copying memory from src to dst
     };
 
+    //------------------------------------------------------------------------
+    // getUnrollThreshold: Calculates the unrolling threshold for the given operation
+    //
+    // Arguments:
+    //    type       - kind of the operation (memset/memcpy)
+    //    canUseSimd - whether it is allowed to use SIMD or not
+    //
+    // Return Value:
+    //    The unrolling threshold for the given operation in bytes
+    //
     unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
     {
         unsigned threshold = TARGET_POINTER_SIZE;
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 1b8034b7e27f91..490c70076a527d 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -358,6 +358,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
                 }
                 else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, /*canUseSimd*/ false))
                 {
+                    // It turns out we can't use SIMD, so the default threshold is too big
                     goto TOO_BIG_TO_UNROLL;
                 }
diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index 167eacb3bcdae9..4c18c408ec0c85 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -11,8 +11,6 @@
   #define ROUND_FLOAT              1  // round intermed float expression results
   #define CPU_HAS_BYTE_REGS        1
 
-  // TODO-CQ: Fine tune the following xxBlk threshold values:
-
   #define CPOBJ_NONGC_SLOTS_LIMIT  4   // For CpObj code generation, this is the threshold of the number
                                        // of contiguous non-gc slots that trigger generating rep movsq instead of
                                        // sequences of movsq instructions

From 0f99a42835523fd2e96ab4eddb18c872f9149541 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Sun, 12 Mar 2023 17:53:09 +0100
Subject: [PATCH 7/7] Update compiler.h

---
 src/coreclr/jit/compiler.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index f05c9d5d2e1348..3c23d4e9da18be 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8976,7 +8976,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
             //
             threshold *= 2;
 #elif defined(TARGET_XARCH)
-            // Ignore AVX-512 for now
+            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
             threshold = min(threshold, YMM_REGSIZE_BYTES);
 #endif
         }
@@ -9001,7 +9001,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         //
         // | arch        | memset | memcpy |
         // |-------------|--------|--------|
-        // | x86 avx512  | 512    | 256    | (ignored for now)
+        // | x86 avx512  | 512    | 256    | (TODO-XARCH-AVX512: ignored for now)
         // | x86 avx     | 256    | 128    |
         // | x86 sse     | 128    | 64     |
         // | arm64       | 256    | 128    | ldp/stp (2x128bit)
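
A note for readers reproducing the final behavior outside the JIT: the short C++ sketch below models the threshold logic these seven patches converge on. It is an approximation under stated assumptions, not JIT code — TargetModel, pointerSize, and simdSize are hypothetical stand-ins for the build-time TARGET_* defines, TARGET_POINTER_SIZE, and maxSIMDStructBytes(), and the YMM cap is hard-coded to 32 bytes.

#include <algorithm>
#include <cstdio>

// Hypothetical stand-ins for the JIT's build-time configuration: pointerSize
// models TARGET_POINTER_SIZE, simdSize models maxSIMDStructBytes() (0 means
// no SIMD path), and the flags replace the TARGET_* preprocessor checks.
struct TargetModel
{
    const char* name;
    unsigned    pointerSize;
    unsigned    simdSize;
    bool        isArm64;
    bool        isXarch;
};

enum class UnrollKind
{
    Memset,
    Memcpy
};

unsigned getUnrollThreshold(const TargetModel& t, UnrollKind kind, bool canUseSimd = true)
{
    unsigned threshold = t.pointerSize;
    if (t.simdSize != 0) // models the FEATURE_SIMD block
    {
        if (canUseSimd)
        {
            threshold = t.simdSize;
            if (t.isArm64)
            {
                threshold *= 2; // ldp/stp move two 16-byte vectors per instruction
            }
            else if (t.isXarch)
            {
                threshold = std::min(threshold, 32u); // cap at YMM, i.e. ignore AVX-512
            }
        }
        else if (t.isXarch)
        {
            threshold *= 2; // compatibility bump for the non-SIMD xarch paths (patch 5)
        }
    }
    if (kind == UnrollKind::Memset)
    {
        threshold *= 2; // memset needs fewer instructions than memcpy
    }
    return threshold * 4; // default multiplier
}

int main()
{
    const TargetModel targets[] = {
        {"x86 avx", 8, 32, false, true},     // 32-byte YMM vectors
        {"x86 sse", 8, 16, false, true},     // 16-byte XMM vectors
        {"arm64", 8, 16, true, false},       // 16-byte Q registers
        {"arm", 4, 0, false, false},         // no SIMD in this path
        {"loongarch64", 8, 0, false, false}, // no SIMD in this path
    };
    for (const TargetModel& t : targets)
    {
        std::printf("%-11s memset=%3u memcpy=%3u\n", t.name,
                    getUnrollThreshold(t, UnrollKind::Memset),
                    getUnrollThreshold(t, UnrollKind::Memcpy));
    }
    return 0;
}

Compiled and run, it prints thresholds matching the table in patch 3: arm64 gets memset=256/memcpy=128, SSE-only x64 gets 128/64, and the no-SIMD targets (arm, loongarch64) fall back to multiples of the pointer size.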