-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Enable and optimize AVX helper-intrinsics #17030
Changes from all commits
d5716e4
37c716b
744940a
e938ad3
0c90eb4
c488262
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1318,6 +1318,67 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) | |
break; | ||
} | ||
|
||
case NI_AVX_SetAllVector256: | ||
{ | ||
assert(op1 != nullptr); | ||
assert(op2 == nullptr); | ||
op1Reg = op1->gtRegNum; | ||
if (varTypeIsIntegral(baseType)) | ||
{ | ||
// If the argument is a integer, it needs to be moved into a XMM register | ||
regNumber tmpXMM = node->ExtractTempReg(); | ||
emit->emitIns_R_R(INS_mov_i2xmm, emitActualTypeSize(baseType), tmpXMM, op1Reg); | ||
op1Reg = tmpXMM; | ||
} | ||
|
||
if (compiler->compSupports(InstructionSet_AVX2)) | ||
{ | ||
// generate broadcast instructions if AVX2 is available | ||
emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg); | ||
} | ||
else | ||
{ | ||
// duplicate the scalar argument to XMM register | ||
switch (baseType) | ||
{ | ||
case TYP_FLOAT: | ||
emit->emitIns_SIMD_R_R_I(INS_vpermilps, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0); | ||
break; | ||
case TYP_DOUBLE: | ||
emit->emitIns_R_R(INS_movddup, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg); | ||
break; | ||
case TYP_BYTE: | ||
case TYP_UBYTE: | ||
{ | ||
regNumber tmpZeroReg = node->GetSingleTempReg(); | ||
emit->emitIns_R_R(INS_pxor, emitTypeSize(TYP_SIMD16), tmpZeroReg, tmpZeroReg); | ||
emit->emitIns_SIMD_R_R_R(INS_pshufb, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, tmpZeroReg); | ||
break; | ||
} | ||
case TYP_SHORT: | ||
case TYP_USHORT: | ||
emit->emitIns_SIMD_R_R_I(INS_pshuflw, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just followed Clang codegen. They have the same performance on most architectures. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, but the latter produces smaller code, iirc There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can investigate it later. |
||
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 80); | ||
break; | ||
case TYP_INT: | ||
case TYP_UINT: | ||
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0); | ||
break; | ||
case TYP_LONG: | ||
case TYP_ULONG: | ||
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 68); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why |
||
break; | ||
|
||
default: | ||
unreached(); | ||
break; | ||
} | ||
// duplicate the XMM register to YMM register | ||
emit->emitIns_SIMD_R_R_R_I(INS_vinsertf128, emitTypeSize(TYP_SIMD32), targetReg, op1Reg, op1Reg, 1); | ||
} | ||
break; | ||
} | ||
|
||
case NI_AVX_ExtendToVector256: | ||
{ | ||
// ExtendToVector256 has zero-extend semantics in order to ensure it is deterministic | ||
|
@@ -1412,7 +1473,7 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) | |
{ | ||
if (intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128) | ||
{ | ||
emit->emitIns_R_AR_I(ins, attr, op2Reg, op1Reg, 0, (int)i); | ||
emit->emitIns_AR_R_I(ins, attr, op1Reg, 0, op2Reg, (int)i); | ||
} | ||
else if (op2->TypeGet() == TYP_I_IMPL) | ||
{ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the register encoding needs to happen in the case for `MWR`. `emitOutputAM` is one of the only ones that handles the register encoding itself.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I do not quite understand.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you look at the other examples, the `MWR` cases handle `encodeReg3456` before the emitOutput call; `emitOutputAM` is the only one that does that in the call itself.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I looked at the section above for `IF_RWR_MRD_CNS`, which does not handle `encodeReg3456` in the `if (Is4ByteSSE4OrAVXInstruction(ins))` path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, I see.
A comment (or explicit assert) that `Is4ByteSSE4OrAVXInstruction(ins)` is always true would be beneficial.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, will do.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done. Added a comment below.