[Arm] Fix generating code with UB in NeonEmitter #121802
Conversation
@llvm/pr-subscribers-testing-tools @llvm/pr-subscribers-clang-codegen

Author: Momchil Velikov (momchil-velikov)

Changes

NeonEmitter emits the casts in the generated arm_neon.h either as ordinary C-style conversions or as reinterpretations that type-pun a value through a pointer to an unrelated type. The latter is especially troubling as it introduces undefined behaviour: the generated code violates the strict aliasing rules (C23 6.5 Expressions #7, C++23 7.2.1 Value category [basic.lval] #11).

In all this usage the intended semantics are of a bitcast, thus this patch introduces a bitcast operation (see the OP_REINT change in arm_neon.td below) and replaces the offending type punning with calls to __builtin_bit_cast.

The implementation of some compare intrinsics used to depend on the preceding codegen context:

> // FIXME: this is utterly horrific. We should not be looking at previous
> // codegen context to find out what needs doing. Unfortunately TableGen
> // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
> // (etc).

This is resolved by using the last argument of the intrinsic call to determine whether the comparison operates on floating-point values and selecting the compare predicate accordingly.

Patch is 6.33 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121802.diff

53 Files Affected:
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 914be3691ee812..47870bd1678c17 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -214,6 +214,10 @@ namespace clang {
EltType ET = getEltType();
return ET == Poly8 || ET == Poly16 || ET == Poly64;
}
+ bool isFloatingPoint() const {
+ EltType ET = getEltType();
+ return ET == Float16 || ET == Float32 || ET == Float64 || ET == BFloat16;
+ }
bool isUnsigned() const { return (Flags & UnsignedFlag) != 0; }
bool isQuad() const { return (Flags & QuadFlag) != 0; }
unsigned getEltSizeInBits() const {
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfeb..19cf6f1dbfb692 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -129,7 +129,7 @@ def OP_VCVT_NA_HI_F32 : Op<(call "vcombine", $p0, (call "vcvt_f32_f64", $p1))>;
def OP_VCVT_EX_HI_F32 : Op<(call "vcvt_f32_f16", (call "vget_high", $p0))>;
def OP_VCVT_EX_HI_F64 : Op<(call "vcvt_f64_f32", (call "vget_high", $p0))>;
def OP_VCVTX_HI : Op<(call "vcombine", $p0, (call "vcvtx_f32", $p1))>;
-def OP_REINT : Op<(cast "R", $p0)>;
+def OP_REINT : Op<(bitcast "R", $p0)>;
def OP_ADDHNHi : Op<(call "vcombine", $p0, (call "vaddhn", $p1, $p2))>;
def OP_RADDHNHi : Op<(call "vcombine", $p0, (call "vraddhn", $p1, $p2))>;
def OP_SUBHNHi : Op<(call "vcombine", $p0, (call "vsubhn", $p1, $p2))>;
@@ -929,12 +929,12 @@ def CFMLE : SOpInst<"vcle", "U..", "lUldQdQlQUl", OP_LE>;
def CFMGT : SOpInst<"vcgt", "U..", "lUldQdQlQUl", OP_GT>;
def CFMLT : SOpInst<"vclt", "U..", "lUldQdQlQUl", OP_LT>;
-def CMEQ : SInst<"vceqz", "U.",
+def CMEQ : SInst<"vceqz", "U(.!)",
"csilfUcUsUiUlPcPlQcQsQiQlQfQUcQUsQUiQUlQPcdQdQPl">;
-def CMGE : SInst<"vcgez", "U.", "csilfdQcQsQiQlQfQd">;
-def CMLE : SInst<"vclez", "U.", "csilfdQcQsQiQlQfQd">;
-def CMGT : SInst<"vcgtz", "U.", "csilfdQcQsQiQlQfQd">;
-def CMLT : SInst<"vcltz", "U.", "csilfdQcQsQiQlQfQd">;
+def CMGE : SInst<"vcgez", "U(.!)", "csilfdQcQsQiQlQfQd">;
+def CMLE : SInst<"vclez", "U(.!)", "csilfdQcQsQiQlQfQd">;
+def CMGT : SInst<"vcgtz", "U(.!)", "csilfdQcQsQiQlQfQd">;
+def CMLT : SInst<"vcltz", "U(.!)", "csilfdQcQsQiQlQfQd">;
////////////////////////////////////////////////////////////////////////////////
// Max/Min Integer
@@ -1672,11 +1672,11 @@ let TargetGuard = "fullfp16,neon" in {
// ARMv8.2-A FP16 one-operand vector intrinsics.
// Comparison
- def CMEQH : SInst<"vceqz", "U.", "hQh">;
- def CMGEH : SInst<"vcgez", "U.", "hQh">;
- def CMGTH : SInst<"vcgtz", "U.", "hQh">;
- def CMLEH : SInst<"vclez", "U.", "hQh">;
- def CMLTH : SInst<"vcltz", "U.", "hQh">;
+ def CMEQH : SInst<"vceqz", "U(.!)", "hQh">;
+ def CMGEH : SInst<"vcgez", "U(.!)", "hQh">;
+ def CMGTH : SInst<"vcgtz", "U(.!)", "hQh">;
+ def CMLEH : SInst<"vclez", "U(.!)", "hQh">;
+ def CMLTH : SInst<"vcltz", "U(.!)", "hQh">;
// Vector conversion
def VCVT_F16 : SInst<"vcvt_f16", "F(.!)", "sUsQsQUs">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c419fb0cc055e0..b3c76986511444 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8158,8 +8158,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
// Determine the type of this overloaded NEON intrinsic.
NeonTypeFlags Type(NeonTypeConst->getZExtValue());
- bool Usgn = Type.isUnsigned();
- bool Quad = Type.isQuad();
+ const bool Usgn = Type.isUnsigned();
+ const bool Quad = Type.isQuad();
+ const bool Floating = Type.isFloatingPoint();
const bool HasLegalHalfType = getTarget().hasLegalHalfType();
const bool AllowBFloatArgsAndRet =
getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
@@ -8260,24 +8261,28 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
}
case NEON::BI__builtin_neon_vceqz_v:
case NEON::BI__builtin_neon_vceqzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
- ICmpInst::ICMP_EQ, "vceqz");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz");
case NEON::BI__builtin_neon_vcgez_v:
case NEON::BI__builtin_neon_vcgezq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
- ICmpInst::ICMP_SGE, "vcgez");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE,
+ "vcgez");
case NEON::BI__builtin_neon_vclez_v:
case NEON::BI__builtin_neon_vclezq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
- ICmpInst::ICMP_SLE, "vclez");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE,
+ "vclez");
case NEON::BI__builtin_neon_vcgtz_v:
case NEON::BI__builtin_neon_vcgtzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
- ICmpInst::ICMP_SGT, "vcgtz");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT,
+ "vcgtz");
case NEON::BI__builtin_neon_vcltz_v:
case NEON::BI__builtin_neon_vcltzq_v:
- return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
- ICmpInst::ICMP_SLT, "vcltz");
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT,
+ "vcltz");
case NEON::BI__builtin_neon_vclz_v:
case NEON::BI__builtin_neon_vclzq_v:
// We generate target-independent intrinsic, which needs a second argument
@@ -8840,28 +8845,32 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
return Builder.CreateBitCast(Result, ResultType, NameHint);
}
-Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
- Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
- const CmpInst::Predicate Ip, const Twine &Name) {
- llvm::Type *OTy = Op->getType();
-
- // FIXME: this is utterly horrific. We should not be looking at previous
- // codegen context to find out what needs doing. Unfortunately TableGen
- // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
- // (etc).
- if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
- OTy = BI->getOperand(0)->getType();
-
- Op = Builder.CreateBitCast(Op, OTy);
- if (OTy->getScalarType()->isFloatingPointTy()) {
- if (Fp == CmpInst::FCMP_OEQ)
- Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
+Value *
+CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
+ const CmpInst::Predicate Pred,
+ const Twine &Name) {
+
+ if (isa<FixedVectorType>(Ty)) {
+ // Vector types are cast to i8 vectors. Recover original type.
+ Op = Builder.CreateBitCast(Op, Ty);
+ }
+
+ if (CmpInst::isFPPredicate(Pred)) {
+ if (Pred == CmpInst::FCMP_OEQ)
+ Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType()));
else
- Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
+ Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType()));
} else {
- Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
+ Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType()));
}
- return Builder.CreateSExt(Op, Ty, Name);
+
+ llvm::Type *ResTy = Ty;
+ if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
+ ResTy = FixedVectorType::get(
+ IntegerType::get(getLLVMContext(), VTy->getScalarSizeInBits()),
+ VTy->getNumElements());
+
+ return Builder.CreateSExt(Op, ResTy, Name);
}
static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
@@ -12350,45 +12359,66 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateFAdd(Op0, Op1, "vpaddd");
}
case NEON::BI__builtin_neon_vceqzd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_EQ, "vceqz");
case NEON::BI__builtin_neon_vceqzd_f64:
case NEON::BI__builtin_neon_vceqzs_f32:
case NEON::BI__builtin_neon_vceqzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
+ ICmpInst::FCMP_OEQ, "vceqz");
case NEON::BI__builtin_neon_vcgezd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SGE, "vcgez");
case NEON::BI__builtin_neon_vcgezd_f64:
case NEON::BI__builtin_neon_vcgezs_f32:
case NEON::BI__builtin_neon_vcgezh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
+ ICmpInst::FCMP_OGE, "vcgez");
case NEON::BI__builtin_neon_vclezd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SLE, "vclez");
case NEON::BI__builtin_neon_vclezd_f64:
case NEON::BI__builtin_neon_vclezs_f32:
case NEON::BI__builtin_neon_vclezh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
+ ICmpInst::FCMP_OLE, "vclez");
case NEON::BI__builtin_neon_vcgtzd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SGT, "vcgtz");
case NEON::BI__builtin_neon_vcgtzd_f64:
case NEON::BI__builtin_neon_vcgtzs_f32:
case NEON::BI__builtin_neon_vcgtzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
+ ICmpInst::FCMP_OGT, "vcgtz");
case NEON::BI__builtin_neon_vcltzd_s64:
+ Ops.push_back(EmitScalarExpr(E->getArg(0)));
+ return EmitAArch64CompareBuiltinExpr(
+ Ops[0], ConvertType(E->getCallReturnType(getContext())),
+ ICmpInst::ICMP_SLT, "vcltz");
+
case NEON::BI__builtin_neon_vcltzd_f64:
case NEON::BI__builtin_neon_vcltzs_f32:
case NEON::BI__builtin_neon_vcltzh_f16:
Ops.push_back(EmitScalarExpr(E->getArg(0)));
return EmitAArch64CompareBuiltinExpr(
Ops[0], ConvertType(E->getCallReturnType(getContext())),
- ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
+ ICmpInst::FCMP_OLT, "vcltz");
case NEON::BI__builtin_neon_vceqzd_u64: {
Ops.push_back(EmitScalarExpr(E->getArg(0)));
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 1a5c42f8f974d0..d1bec166a435e2 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4628,10 +4628,10 @@ class CodeGenFunction : public CodeGenTypeCache {
llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
ReturnValueSlot ReturnValue);
- llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
- const llvm::CmpInst::Predicate Fp,
- const llvm::CmpInst::Predicate Ip,
- const llvm::Twine &Name = "");
+ llvm::Value *
+ EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
+ const llvm::CmpInst::Predicate Pred,
+ const llvm::Twine &Name = "");
llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
ReturnValueSlot ReturnValue,
llvm::Triple::ArchType Arch);
diff --git a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
index 877d83c0fa3954..6da2762782acb9 100644
--- a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c
@@ -1,6 +1,6 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,10 +8,7 @@
// CHECK-LABEL: @test_vbfdot_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -20,10 +17,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-LABEL: @test_vbfdotq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -32,19 +26,10 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-LABEL: @test_vbfdot_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -53,19 +38,10 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-LABEL: @test_vbfdotq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 16
-// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -74,19 +50,10 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-LABEL: @test_vbfdot_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 16
-// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat...
[truncated]
✅ With the latest revision this PR passed the Python code formatter.
Hi Momchil,
The patch looks good to me. I believe we can also use a bit cast for the NEON tuples, as I noted in the inline comments.
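As a rough illustration of that suggestion (a hypothetical sketch, not code from this patch): a NEON tuple type and its reinterpreted counterpart are trivially copyable and the same size, so they could be converted the same way, e.g.:

```c
#include <arm_neon.h>

// Hypothetical helper, not part of the patch: reinterpret a NEON tuple
// without type punning through a pointer.
static inline int8x8x2_t reinterpret_s8_u8_x2(uint8x8x2_t v) {
  return __builtin_bit_cast(int8x8x2_t, v);
}
```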
LGTM
Force-pushed from 27c8a1e to 6a565f4.
I reduced the scope of this PR to just fixing the immediate UB.
When generating arm_neon.h, NeonEmitter outputs code that violates the strict aliasing rules (C23 6.5 Expressions #7, C++23 7.2.1 Value category [basic.lval] #11), for example by storing a vector and reloading it through a pointer to an unrelated vector type. This patch fixed the offending code by replacing it with a call to __builtin_bit_cast.
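The original code example is not reproduced in this excerpt. The following is a rough, hypothetical before/after sketch of the pattern in question (helper names are made up, and it assumes an AArch64 target with NEON enabled; the __reint naming only mirrors the style of the generated header):

```c
#include <arm_neon.h>

// Before (illustrative): reinterpret a vector by reloading it through a
// pointer to an unrelated vector type -- an aliasing violation (C23 6.5p7).
static inline uint32x2_t reint_via_punning(float32x2_t v) {
  float32x2_t __reint = v;
  return *(uint32x2_t *) &__reint;
}

// After (illustrative): the same reinterpretation expressed as a bit cast,
// which is well-defined for trivially copyable types of the same size.
static inline uint32x2_t reint_via_bit_cast(float32x2_t v) {
  return __builtin_bit_cast(uint32x2_t, v);
}
```

After optimisation both forms typically lower to the same code; the difference is that only the bit-cast form is well-defined C.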