Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Enable and optimize AVX helper-intrinsics #17030

Merged
merged 6 commits into from
Mar 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/jit/emitfmtsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ IF_DEF(MRD_CNS, IS_GM_RD, DSP_CNS) // read [mem], const
IF_DEF(MWR_CNS, IS_GM_WR, DSP_CNS) // write [mem], const
IF_DEF(MRW_CNS, IS_GM_RW, DSP_CNS) // r/w [mem], const

IF_DEF(MWR_RRD_CNS, IS_GM_WR|IS_R1_RD, DSP_CNS) // write [mem], read reg, const

IF_DEF(MRW_SHF, IS_GM_RW, DSP_CNS) // shift [mem], const

//----------------------------------------------------------------------------
Expand Down Expand Up @@ -194,6 +196,8 @@ IF_DEF(ARD_CNS, IS_AM_RD, AMD_CNS) // read [adr], const
IF_DEF(AWR_CNS, IS_AM_WR, AMD_CNS) // write [adr], const
IF_DEF(ARW_CNS, IS_AM_RW, AMD_CNS) // r/w [adr], const

IF_DEF(AWR_RRD_CNS, IS_AM_WR|IS_R1_RD, AMD_CNS) // write [adr], read reg, const

IF_DEF(ARW_SHF, IS_AM_RW, AMD_CNS) // shift [adr], const


Expand Down
101 changes: 101 additions & 0 deletions src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2730,6 +2730,9 @@ emitter::insFormat emitter::emitMapFmtAtoM(insFormat fmt)
case IF_ARW_CNS:
return IF_MRW_CNS;

case IF_AWR_RRD_CNS:
return IF_MWR_RRD_CNS;

case IF_ARW_SHF:
return IF_MRW_SHF;

Expand Down Expand Up @@ -5067,6 +5070,32 @@ void emitter::emitIns_AR_R(instruction ins, emitAttr attr, regNumber ireg, regNu
emitAdjustStackDepthPushPop(ins);
}

#ifndef LEGACY_BACKEND
//------------------------------------------------------------------------
// emitIns_AR_R_I: Record an instruction that writes to the address
//                 [base + disp], reading a source register and an immediate.
//
// Arguments:
//    ins  - the instruction; must be INS_vextracti128 or INS_vextractf128
//    attr - emit attribute passed through to the new instruction descriptor
//    base - base register of the destination address; must not be REG_NA
//    disp - displacement applied to the base register
//    ireg - source register; must not be REG_NA
//    ival - immediate operand stored in the descriptor
//
void emitter::emitIns_AR_R_I(instruction ins, emitAttr attr, regNumber base, int disp, regNumber ireg, int ival)
{
    assert((ins == INS_vextracti128) || (ins == INS_vextractf128));
    assert(base != REG_NA);
    assert(ireg != REG_NA);

    instrDesc* const desc = emitNewInstrAmdCns(attr, disp, ival);

    desc->idIns(ins);
    desc->idInsFmt(IF_AWR_RRD_CNS);
    desc->idReg1(ireg);

    // The destination is addressed as [base + disp]; no index register is used.
    desc->idAddr()->iiaAddrMode.amBaseReg = base;
    desc->idAddr()->iiaAddrMode.amIndxReg = REG_NA;

    // Verify that the displacement can be read back from the descriptor unchanged.
    assert(emitGetInsAmdAny(desc) == disp);

    // "vextracti/f128 [mem], ymm, imm8" is recorded with a fixed size of 6 bytes.
    // NOTE(review): this figure presumably assumes no displacement bytes and no SIB
    // byte in the encoding - confirm it holds for every base/disp callers may pass.
    const UNATIVE_OFFSET codeSize = 6;
    desc->idCodeSize(codeSize);

    dispIns(desc);
    emitCurIGsize += codeSize;
}
#endif

void emitter::emitIns_AI_R(instruction ins, emitAttr attr, regNumber ireg, ssize_t disp)
{
UNATIVE_OFFSET sz;
Expand Down Expand Up @@ -7790,6 +7819,32 @@ void emitter::emitDispIns(
break;
}

// Display form for "ins [addr], reg, imm": an address-mode destination, a register
// source and an immediate. Only vextracti128/vextractf128 are expected here.
case IF_AWR_RRD_CNS:
{
assert(ins == INS_vextracti128 || ins == INS_vextractf128);
// vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr"
// (the size string is requested for a 16-byte operand rather than derived from attr)
sstr = codeGen->genSizeStr(EA_ATTR(16));
printf(sstr);
emitDispAddrMode(id);
// the source register name is formatted with the instruction's own attr
printf(", %s", emitRegName(id->idReg1(), attr));

// retrieve the immediate stored in the instruction descriptor
emitGetInsAmdCns(id, &cnsVal);

val = cnsVal.cnsVal;
printf(", ");

if (cnsVal.cnsReloc)
{
// relocatable constant: printed through emitDispReloc
emitDispReloc(val);
}
else
{
// non-relocatable constant: printing is delegated to the PRINT_CONSTANT label,
// which is defined outside this excerpt
goto PRINT_CONSTANT;
}

break;
}

case IF_RWR_RRD_ARD:
printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr);
emitDispAddrMode(id);
Expand Down Expand Up @@ -8166,6 +8221,32 @@ void emitter::emitDispIns(
break;
}

// Display form for "ins [mem], reg, imm": a class-variable (field handle) destination,
// a register source and an immediate. Only vextracti128/vextractf128 are expected here.
case IF_MWR_RRD_CNS:
{
assert(ins == INS_vextracti128 || ins == INS_vextractf128);
// vextracti/f128 extracts 128-bit data, so we fix sstr as "xmm ptr"
// (the size string is requested for a 16-byte operand rather than derived from attr)
sstr = codeGen->genSizeStr(EA_ATTR(16));
printf(sstr);
// the destination is printed from the descriptor's field handle plus its displacement
offs = emitGetInsDsp(id);
emitDispClsVar(id->idAddr()->iiaFieldHnd, offs, ID_INFO_DSP_RELOC);
// the source register name is formatted with the instruction's own attr
printf(", %s", emitRegName(id->idReg1(), attr));
// retrieve the immediate stored in the instruction descriptor
emitGetInsDcmCns(id, &cnsVal);

val = cnsVal.cnsVal;
printf(", ");

if (cnsVal.cnsReloc)
{
// relocatable constant: printed through emitDispReloc
emitDispReloc(val);
}
else
{
// non-relocatable constant: printing is delegated to the PRINT_CONSTANT label,
// which is defined outside this excerpt
goto PRINT_CONSTANT;
}

break;
}

case IF_RWR_RRD_MRD:
printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr);
offs = emitGetInsDsp(id);
Expand Down Expand Up @@ -12218,6 +12299,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
sz = emitSizeOfInsDsc(id);
break;

// Encode "ins [addr], reg, imm" (address-mode destination, register source, immediate).
// Restricted to vextracti128/vextractf128, and VEX encoding must be in use.
case IF_AWR_RRD_CNS:
assert(ins == INS_vextracti128 || ins == INS_vextractf128);
assert(UseVEXEncoding());
// fetch the immediate from the descriptor so it can be passed to the output routine
emitGetInsAmdCns(id, &cnsVal);
// the memory-destination ("MR") opcode form is used for the store
code = insCodeMR(ins);
// NOTE(review): per the review discussion on this change, emitOutputAM is expected to
// handle the register encoding itself - confirm against emitOutputAM
dst = emitOutputAM(dst, id, code, &cnsVal);
// sz receives the size of the instruction descriptor
sz = emitSizeOfInsDsc(id);
break;

case IF_RRD_ARD:
case IF_RWR_ARD:
case IF_RRW_ARD:
Expand Down Expand Up @@ -12530,6 +12620,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
sz = emitSizeOfInsDsc(id);
break;

case IF_MWR_RRD_CNS:
Copy link
Member

@tannergooding tannergooding Mar 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the register encoding needs to happen in the case for MWR. emitOutputAM is one of the only ones that handles the register encoding in itself.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I do not quite understand.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you look at the other examples, the MWR cases handle encodeReg3456 before the emitOutput call; emitOutputAM is the only one that does it inside the call itself.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked at the IF_RWR_MRD_CNS case above, which does not handle encodeReg3456 in its if (Is4ByteSSE4OrAVXInstruction(ins)) path.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see.

A comment (or explicit assert) that Is4ByteSSE4OrAVXInstruction(ins) is always true would be beneficial.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, will do.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Added a comment below.

// Encode "ins [mem], reg, imm" (class-variable destination, register source, immediate).
// Restricted to vextracti128/vextractf128, and VEX encoding must be in use.
assert(ins == INS_vextracti128 || ins == INS_vextractf128);
assert(UseVEXEncoding());
// fetch the immediate from the descriptor so it can be passed to the output routine
emitGetInsDcmCns(id, &cnsVal);
// the memory-destination ("MR") opcode form is used for the store
code = insCodeMR(ins);
// Only AVX2 vextracti128 and AVX vextractf128 can reach this path.
// They do not need VEX.vvvv to encode the register operand, so no extra
// register encoding is applied to "code" before calling emitOutputCV.
dst = emitOutputCV(dst, id, code, &cnsVal);
// sz receives the size of the instruction descriptor
sz = emitSizeOfInsDsc(id);
break;

case IF_RRD_MRD:
case IF_RWR_MRD:
case IF_RRW_MRD:
Expand Down
1 change: 1 addition & 0 deletions src/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ void emitIns_R_R_A_I(
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, int ival, insFormat fmt);
void emitIns_R_R_AR_I(
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs, int ival);
void emitIns_AR_R_I(instruction ins, emitAttr attr, regNumber base, int disp, regNumber ireg, int ival);
#endif // !LEGACY_BACKEND

void emitIns_R_R_C_I(
Expand Down
63 changes: 62 additions & 1 deletion src/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1318,6 +1318,67 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_AVX_SetAllVector256:
{
assert(op1 != nullptr);
assert(op2 == nullptr);
op1Reg = op1->gtRegNum;
if (varTypeIsIntegral(baseType))
{
// If the argument is a integer, it needs to be moved into a XMM register
regNumber tmpXMM = node->ExtractTempReg();
emit->emitIns_R_R(INS_mov_i2xmm, emitActualTypeSize(baseType), tmpXMM, op1Reg);
op1Reg = tmpXMM;
}

if (compiler->compSupports(InstructionSet_AVX2))
{
// generate broadcast instructions if AVX2 is available
emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
}
else
{
// duplicate the scalar argument to XMM register
switch (baseType)
{
case TYP_FLOAT:
emit->emitIns_SIMD_R_R_I(INS_vpermilps, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
break;
case TYP_DOUBLE:
emit->emitIns_R_R(INS_movddup, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg);
break;
case TYP_BYTE:
case TYP_UBYTE:
{
regNumber tmpZeroReg = node->GetSingleTempReg();
emit->emitIns_R_R(INS_pxor, emitTypeSize(TYP_SIMD16), tmpZeroReg, tmpZeroReg);
emit->emitIns_SIMD_R_R_R(INS_pshufb, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, tmpZeroReg);
break;
}
case TYP_SHORT:
case TYP_USHORT:
emit->emitIns_SIMD_R_R_I(INS_pshuflw, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why pshuflw instead of unpcklwd?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just followed Clang codegen. They have the same performance on most architectures.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but the latter produces smaller code, iirc

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can investigate it later.

emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 80);
break;
case TYP_INT:
case TYP_UINT:
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
break;
case TYP_LONG:
case TYP_ULONG:
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 68);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why pshufd instead of unpcklqdq?

break;

default:
unreached();
break;
}
// duplicate the XMM register to YMM register
emit->emitIns_SIMD_R_R_R_I(INS_vinsertf128, emitTypeSize(TYP_SIMD32), targetReg, op1Reg, op1Reg, 1);
}
break;
}

case NI_AVX_ExtendToVector256:
{
// ExtendToVector256 has zero-extend semantics in order to ensure it is deterministic
Expand Down Expand Up @@ -1412,7 +1473,7 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
{
if (intrinsicID == NI_AVX_ExtractVector128 || intrinsicID == NI_AVX2_ExtractVector128)
{
emit->emitIns_R_AR_I(ins, attr, op2Reg, op1Reg, 0, (int)i);
emit->emitIns_AR_R_I(ins, attr, op1Reg, 0, op2Reg, (int)i);
}
else if (op2->TypeGet() == TYP_I_IMPL)
{
Expand Down
4 changes: 4 additions & 0 deletions src/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,12 +348,14 @@ HARDWARE_INTRINSIC(AVX_Divide, "Divide",
HARDWARE_INTRINSIC(AVX_DotProduct, "DotProduct", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed, "DuplicateEvenIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_DuplicateOddIndexed, "DuplicateOddIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_Extract, "Extract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_BaseTypeFromFirstArg|HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AVX_ExtendToVector256, "ExtendToVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_ExtractVector128, "ExtractVector128", AVX, -1, 32, -1, {INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128, INS_vextractf128},HW_Category_IMM, HW_Flag_OneTypeGeneric|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX_Floor, "Floor", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_GetLowerHalf, "GetLowerHalf", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX_Insert, "Insert", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen|HW_Flag_SecondArgMaybe64Bit)
HARDWARE_INTRINSIC(AVX_InsertVector128, "InsertVector128", AVX, -1, 32, 3, {INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128, INS_vinsertf128},HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_OneTypeGeneric|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
Expand All @@ -374,6 +376,8 @@ HARDWARE_INTRINSIC(AVX_RoundToNearestInteger, "RoundToNea
HARDWARE_INTRINSIC(AVX_RoundToNegativeInfinity, "RoundToNegativeInfinity", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_RoundToPositiveInfinity, "RoundToPositiveInfinity", AVX, 10, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_RoundToZero, "RoundToZero", AVX, 11, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_SetVector256, "SetVector256", AVX, -1, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SecondArgMaybe64Bit)
HARDWARE_INTRINSIC(AVX_SetAllVector256, "SetAllVector256", AVX, -1, 32, 1, {INS_vpbroadcastb,INS_vpbroadcastb,INS_vpbroadcastw,INS_vpbroadcastw,INS_vpbroadcastd,INS_vpbroadcastd,INS_vpbroadcastq,INS_vpbroadcastq,INS_vbroadcastss,INS_vbroadcastsd},HW_Category_Helper, HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_OneTypeGeneric)
HARDWARE_INTRINSIC(AVX_SetZeroVector256, "SetZeroVector256", AVX, -1, 32, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_xorps, INS_xorpd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(AVX_Shuffle, "Shuffle", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
Expand Down
Loading