[AMDGPU] Handle lowering addrspace casts from LDS to FLAT address in amdgpu-sw-lower-lds. #121214
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Chaitanya (skc7)

Changes:

The "infer-address-spaces" pass replaces all refinable generic pointers with equivalent specific pointers. At -O0, infer-address-spaces does not run in the pipeline. The "amdgpu-sw-lower-lds" pass instruments memory operations on addrspace(3) pointers. Since extra addrspacecasts from LDS to the flat address space are present at -O0 and the actual load/store instructions are then on the flat address space, these addrspacecasts need to be handled in the amdgpu-sw-lower-lds pass itself. This patch first lowers the LDS pointer to the corresponding pointer in the global memory allocated by asan_malloc, then replaces the original cast with an addrspacecast from that global pointer to a flat pointer.

Example:

Before the infer-address-spaces pass:

  %asc = addrspacecast ptr addrspace(3) @lds to ptr
  %gep = getelementptr inbounds [5 x i32], ptr %asc, i64 0, i64 0
  store i32 1, ptr %gep, align 4

After the infer-address-spaces pass:

  %gep = getelementptr inbounds [5 x i32], ptr addrspace(3) @lds, i64 0, i64 0
  store i32 1, ptr addrspace(3) %gep, align 4

Without the infer-address-spaces pass and with this patch in amdgpu-sw-lower-lds:

  ; get the corresponding global memory ptr from the asan_malloc allocation.
  %load = load ptr addrspace(1), ptr addrspace(3) @<lds.kernel.global>, align 8
  %gep1 = getelementptr inbounds i8, ptr addrspace(3) @<lds.kernel.global>, i32
  %ptoi = ptrtoint ptr addrspace(3) %gep1 to i32
  %gep2 = getelementptr inbounds i8, ptr addrspace(1) %load, i32 %ptoi
  %asc1 = addrspacecast ptr addrspace(1) %gep2 to ptr
  %gep = getelementptr inbounds [5 x i32], ptr %asc1, i64 0, i64 0
  store i32 1, ptr %gep, align 4

Patch is 76.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121214.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 17207773b4858c..74b91950624681 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -655,6 +655,10 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
LDSInstructions.insert(&Inst);
+ } else if (AddrSpaceCastInst *AscI = dyn_cast<AddrSpaceCastInst>(&Inst)) {
+ if ((AscI->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) &&
+ (AscI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS))
+ LDSInstructions.insert(&Inst);
} else
continue;
}
@@ -722,6 +726,16 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
AsanInfo.Instructions.insert(NewXCHG);
XCHG->replaceAllUsesWith(NewXCHG);
XCHG->eraseFromParent();
+ } else if (AddrSpaceCastInst *AscI = dyn_cast<AddrSpaceCastInst>(Inst)) {
+ Value *AIOperand = AscI->getPointerOperand();
+ Value *Gep =
+ getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, AIOperand);
+ Value *NewAI = IRB.CreateAddrSpaceCast(Gep, AscI->getType());
+ // Note: No need to add the instruction to AsanInfo instructions to be
+ // instrumented list. FLAT_ADDRESS ptr would have been already
+ // instrumented by asan pass prior to this pass.
+ AscI->replaceAllUsesWith(NewAI);
+ AscI->eraseFromParent();
} else
report_fatal_error("Unimplemented LDS lowering instruction");
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll
index ae2bcbbb81b5f1..a6e6b84bba3046 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll
@@ -20,8 +20,12 @@ define void @non_kernel_function() sanitize_address {
; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]]
-; CHECK-NEXT: [[Y:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr
-; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(1) [[TMP13]] to ptr
; CHECK-NEXT: store i8 5, ptr [[TMP9]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
index 3a05f93df35a30..b9b4c90daea87d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel.
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address {
; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]]
-; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
-; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP34:%.*]] = addrspacecast ptr addrspace(1) [[TMP33]] to ptr
+; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP35]]
+; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP36]] to ptr
; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]]
@@ -45,16 +49,16 @@ define void @use_variables() sanitize_address {
; CHECK-NEXT: [[TMP25:%.*]] = and i1 [[TMP21]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP25]])
; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 0
-; CHECK-NEXT: br i1 [[TMP27]], label [[ASAN_REPORT:%.*]], label [[TMP30:%.*]], !prof [[PROF2:![0-9]+]]
-; CHECK: asan.report:
-; CHECK-NEXT: br i1 [[TMP25]], label [[TMP28:%.*]], label [[TMP29:%.*]]
-; CHECK: 28:
+; CHECK-NEXT: br i1 [[TMP27]], label %[[ASAN_REPORT:.*]], label %[[BB35:.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: [[ASAN_REPORT]]:
+; CHECK-NEXT: br i1 [[TMP25]], label %[[BB33:.*]], label %[[BB34:.*]]
+; CHECK: [[BB33]]:
; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7:[0-9]+]]
; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
-; CHECK-NEXT: br label [[TMP29]]
-; CHECK: 29:
-; CHECK-NEXT: br label [[TMP30]]
-; CHECK: 30:
+; CHECK-NEXT: br label %[[BB34]]
+; CHECK: [[BB34]]:
+; CHECK-NEXT: br label %[[BB35]]
+; CHECK: [[BB35]]:
; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8
; CHECK-NEXT: ret void
;
@@ -67,15 +71,15 @@ define void @use_variables() sanitize_address {
define amdgpu_kernel void @k0() sanitize_address {
; CHECK-LABEL: define amdgpu_kernel void @k0(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
-; CHECK-NEXT: WId:
+; CHECK-NEXT: [[WID:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
-; CHECK: Malloc:
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]]
+; CHECK: [[MALLOC]]:
; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4
; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]]
@@ -100,9 +104,9 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132
; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64
; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28)
-; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 24:
-; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: br label %[[BB24]]
+; CHECK: [[BB24]]:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ]
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4
@@ -124,16 +128,16 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]]
; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]])
; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0
-; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT:%.*]], label [[TMP46:%.*]], !prof [[PROF2]]
-; CHECK: asan.report:
-; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[CONDFREE:%.*]]
-; CHECK: 44:
+; CHECK-NEXT: br i1 [[TMP43]], label %[[ASAN_REPORT:.*]], label %[[BB46:.*]], !prof [[PROF2]]
+; CHECK: [[ASAN_REPORT]]:
+; CHECK-NEXT: br i1 [[TMP41]], label %[[BB44:.*]], label %[[BB45:.*]]
+; CHECK: [[BB44]]:
; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]]
; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
-; CHECK-NEXT: br label [[CONDFREE]]
-; CHECK: 45:
-; CHECK-NEXT: br label [[TMP46]]
-; CHECK: 46:
+; CHECK-NEXT: br label %[[BB45]]
+; CHECK: [[BB45]]:
+; CHECK-NEXT: br label %[[BB46]]
+; CHECK: [[BB46]]:
; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1
; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32
; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]]
@@ -152,16 +156,16 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP54]], [[TMP58]]
; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]])
; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0
-; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT1:%.*]], label [[TMP64:%.*]], !prof [[PROF2]]
-; CHECK: asan.report1:
-; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]]
-; CHECK: 64:
+; CHECK-NEXT: br i1 [[TMP61]], label %[[ASAN_REPORT1:.*]], label %[[BB66:.*]], !prof [[PROF2]]
+; CHECK: [[ASAN_REPORT1]]:
+; CHECK-NEXT: br i1 [[TMP59]], label %[[BB64:.*]], label %[[BB65:.*]]
+; CHECK: [[BB64]]:
; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP83]]) #[[ATTR7]]
; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
-; CHECK-NEXT: br label [[TMP63]]
-; CHECK: 65:
-; CHECK-NEXT: br label [[TMP64]]
-; CHECK: 66:
+; CHECK-NEXT: br label %[[BB65]]
+; CHECK: [[BB65]]:
+; CHECK-NEXT: br label %[[BB66]]
+; CHECK: [[BB66]]:
; CHECK-NEXT: [[TMP84:%.*]] = ptrtoint ptr addrspace(1) [[TMP82]] to i64
; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP84]], 3
; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP85]], 2147450880
@@ -174,28 +178,28 @@ define amdgpu_kernel void @k0() sanitize_address {
; CHECK-NEXT: [[TMP76:%.*]] = and i1 [[TMP72]], [[TMP75]]
; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP76]])
; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP77]], 0
-; CHECK-NEXT: br i1 [[TMP78]], label [[ASAN_REPORT2:%.*]], label [[TMP81:%.*]], !prof [[PROF2]]
-; CHECK: asan.report2:
-; CHECK-NEXT: br i1 [[TMP76]], label [[TMP79:%.*]], label [[TMP80:%.*]]
-; CHECK: 79:
+; CHECK-NEXT: br i1 [[TMP78]], label %[[ASAN_REPORT2:.*]], label %[[BB81:.*]], !prof [[PROF2]]
+; CHECK: [[ASAN_REPORT2]]:
+; CHECK-NEXT: br i1 [[TMP76]], label %[[BB79:.*]], label %[[BB80:.*]]
+; CHECK: [[BB79]]:
; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP84]]) #[[ATTR7]]
; CHECK-NEXT: call void @llvm.amdgcn.unreachable()
-; CHECK-NEXT: br label [[TMP80]]
-; CHECK: 80:
-; CHECK-NEXT: br label [[TMP81]]
-; CHECK: 81:
+; CHECK-NEXT: br label %[[BB80]]
+; CHECK: [[BB80]]:
+; CHECK-NEXT: br label %[[BB81]]
+; CHECK: [[BB81]]:
; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2
-; CHECK-NEXT: br label [[CONDFREE1:%.*]]
-; CHECK: CondFree:
+; CHECK-NEXT: br label %[[CONDFREE:.*]]
+; CHECK: [[CONDFREE]]:
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
-; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
-; CHECK: Free:
+; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]]
+; CHECK: [[FREE]]:
; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0)
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64
; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]])
-; CHECK-NEXT: br label [[END]]
-; CHECK: End:
+; CHECK-NEXT: br label %[[END]]
+; CHECK: [[END]]:
; CHECK-NEXT: ret void
;
call void @use_variables()
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
index 1dd391ec6321a7..255dda562c1ea4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel.
@@ -6,50 +6,64 @@
@A = external addrspace(3) global [8 x ptr]
@B = external addrspace(3) global [0 x i32]
+;.
+; @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
+; @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]]
+; @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]]
+; @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
+; @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
define amdgpu_kernel void @kernel_0() sanitize_address {
; CHECK-LABEL: define amdgpu_kernel void @kernel_0(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] {
-; CHECK-NEXT: WId:
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] {
+; CHECK-NEXT: [[WID:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
-; CHECK: Malloc:
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]])
-; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1)
-; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8
-; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64
-; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24)
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96
-; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64
-; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32)
-; CHECK-NEXT: br label [[TMP7]]
-; CHECK: 18:
-; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]]
+; CHECK: [[MALLOC]]:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT: [[TMP10:%.*]] = cal...
[truncated]
This doesn't feel like a sound approach. What happens if the pointer value is captured in any way?
@@ -655,6 +655,10 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
    } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
      if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
        LDSInstructions.insert(&Inst);
    } else if (AddrSpaceCastInst *AscI = dyn_cast<AddrSpaceCastInst>(&Inst)) {
This approach doesn't feel very sound and is looking for very specific patterns instead of what is structurally possible.
  store i32 1, ptr %gep, align 4
  ret void
}
What happens if you have a generic pointer downcast to a local pointer? What happens with vectors of pointers?
"generic pointer downcast to a local pointer"
This is a normal case that the pass already handles: the loads/stores end up on LDS pointers, so the pass lowers them to global memory and then instruments them. A minimal sketch is shown below.
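For illustration, a minimal IR sketch of that already-handled shape, assuming a hypothetical LDS global @lds_var:

@lds_var = internal addrspace(3) global i32 poison

define amdgpu_kernel void @downcast_example() {
entry:
  %flat = addrspacecast ptr addrspace(3) @lds_var to ptr
  ; Generic pointer downcast back to a local pointer.
  %local = addrspacecast ptr %flat to ptr addrspace(3)
  ; The store is on an addrspace(3) pointer, so the existing lowering
  ; translates it to the malloc'ed global buffer and instruments it.
  store i32 1, ptr addrspace(3) %local, align 4
  ret void
}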
"vectors of pointers?"
Without the infer-address-spaces pass running first, these vectors of pointers would be in the FLAT address space instead of the LDS address space, as in the example below, and extra addrspacecasts from LDS to flat would be present in the IR. Those addrspacecasts are now handled by the change introduced in this PR.
Example:
@lds_var1 = internal addrspace(3) global i32 poison
@lds_var2 = internal addrspace(3) global i32 poison
define amdgpu_kernel void @example() #0 {
entry:
%flat_ptr1 = addrspacecast ptr addrspace(3) @lds_var1 to ptr
%flat_ptr2 = addrspacecast ptr addrspace(3) @lds_var2 to ptr
%vec_flat_ptrs = insertelement <2 x ptr> undef, ptr %flat_ptr1, i32 0
%vec_flat_ptrs1 = insertelement <2 x ptr> %vec_flat_ptrs, ptr %flat_ptr2, i32 1
%elem0 = extractelement <2 x ptr> %vec_flat_ptrs1, i32 0
store i32 42, ptr %elem0, align 4
%elem1 = extractelement <2 x ptr> %vec_flat_ptrs1, i32 1
store i32 43, ptr %elem1, align 4
ret void
}
@arsenm please let me know if I missed anything here.
This example does not include a vector typed addrspacecast
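For reference, a vector-typed addrspacecast would cast the whole vector of LDS pointers in one instruction, roughly like this sketch (hypothetical @lds_var1/@lds_var2):

  %vec_lds = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) @lds_var1, i32 0
  %vec_lds1 = insertelement <2 x ptr addrspace(3)> %vec_lds, ptr addrspace(3) @lds_var2, i32 1
  ; One addrspacecast on the vector type itself, LDS -> flat.
  %vec_flat = addrspacecast <2 x ptr addrspace(3)> %vec_lds1 to <2 x ptr>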
Since the infer-address-spaces pass is not run prior to this pass at -O0, loads/stores that would otherwise be on LDS pointers are on flat pointers, with extra addrspacecasts from LDS to flat being added. This PR specifically handles these casts from LDS to FLAT: it first lowers the LDS pointer to the corresponding pointer in global memory, then replaces the original cast with an addrspacecast from the global pointer to a flat pointer.
Loads and stores aren't "supposed to be" in any particular address space. The pass needs to function independently of context. The description and justification should not be based around -O0 or whatever infer-address-spaces happens to do. It's not clear to me from the description what the symptoms of not handling this cast were before. Was it a fatal error?
    if ((AscI->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) &&
        (AscI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS))
Suggested change:

    if (AscI->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        AscI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
Updated in latest commit.
    } else if (AddrSpaceCastInst *AscI = dyn_cast<AddrSpaceCastInst>(Inst)) {
      Value *AIOperand = AscI->getPointerOperand();
      Value *Gep =
          getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, AIOperand);
This function looks like it will break on vectors of pointers
Thanks @arsenm for feedback. I have tried to fix the logic to work for vector of pointers in latest commit. Please review.
      // instrumented list. FLAT_ADDRESS ptr would have been already
      // instrumented by asan pass prior to this pass.
      AscI->replaceAllUsesWith(NewAI);
      AscI->eraseFromParent();
    } else
      report_fatal_error("Unimplemented LDS lowering instruction");
This is missing quite a lot of cases. With addrspacecast unhandled, was this previously a fatal error? I don't see a fatal error in your testcase now.
The latest commit now handles addrspace cast with vector of ptrs. Please let me know if I missed any other cases here.
!llvm.module.flags = !{!0}
!0 = !{i32 4, !"nosanitize_address", i32 1}
;.
Should add a test with a vector of pointer addrspacecast
Added testcase with addrspacecast on vector of ptrs.
✅ With the latest revision this PR passed the C/C++ code formatter.
✅ With the latest revision this PR passed the undef deprecator.
Without this patch, addressing errors on LDS at -O0 would be missed. With the changes in this PR, addrspacecasts from LDS to FLAT are detected and properly lowered (the LDS pointer is translated to its global-memory counterpart, and the cast is replaced with an addrspacecast from global to flat), so LDS addressing errors are caught again.
@arsenm Updated the PR as per your initial feedback. Could you please review?
  if (LDSPtrType->isVectorTy()) {
    // Handle vector of pointers
    VectorType *VecPtrTy = cast<VectorType>(LDSPtrType);
    ElementCount NumElements = VecPtrTy->getElementCount();
    Type *Int32VecTy = VectorType::get(IRB.getInt32Ty(), NumElements);
    Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, Int32VecTy);
    Type *GlobalPtrVecTy =
        VectorType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumElements);
    Value *GlobalPtrVec = PoisonValue::get(GlobalPtrVecTy);
    for (uint64_t Index = 0; Index < NumElements.getKnownMinValue(); ++Index) {
      Value *ExtElem = IRB.CreateExtractElement(PtrToInt, Index);
      Value *Gep =
          IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {ExtElem});
      GlobalPtrVec = IRB.CreateInsertElement(GlobalPtrVec, Gep, Index);
    }
    return GlobalPtrVec;
  }
You don't need to manually scalarize this, create the vector operations
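Concretely, a single GEP with a scalar base pointer and a vector of offsets already yields a vector of pointers, so the resulting IR can be as simple as this sketch (hypothetical %vec_lds and %malloc_ptr):

  ; <2 x i32> of byte offsets, one per LDS pointer in the vector.
  %offsets = ptrtoint <2 x ptr addrspace(3)> %vec_lds to <2 x i32>
  ; Scalar base + vector offsets -> <2 x ptr addrspace(1)> in one
  ; instruction, with no extractelement/insertelement loop.
  %vec_global = getelementptr inbounds i8, ptr addrspace(1) %malloc_ptr, <2 x i32> %offsets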
@arsenm have updated PR as per your feedback. Could you please review?
  if (LDSPtrType->isVectorTy()) {
    // Handle vector of pointers
    VectorType *VecPtrTy = cast<VectorType>(LDSPtrType);
use dyn_cast instead of isVectorTy + cast
Fixed in latest commit. Thanks
    // Handle vector of pointers
    VectorType *VecPtrTy = cast<VectorType>(LDSPtrType);
    ElementCount NumElements = VecPtrTy->getElementCount();
    Type *Int32VecTy = VectorType::get(IRB.getInt32Ty(), NumElements);
You can use DL.getIntPtrType for this instead of hardcoding the address space size
Fixed it.
        VectorType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumElements);
    Value *GlobalPtrVec =
        IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, PtrToInt);
    GlobalPtrVec = IRB.CreateBitCast(GlobalPtrVec, GlobalPtrVecTy);
This bitcast is unnecessary since the change to opaque pointers
Updated.
  auto &Ctx = M.getContext();
  const auto &DL = M.getDataLayout();
  auto *IntPtrTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);

Suggested change:

  LLVMContext &Ctx = M.getContext();
  const DataLayout &DL = M.getDataLayout();
  Type *IntPtrTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
[AMDGPU] Handle lowering addrspace casts from LDS to FLAT address in amdgpu-sw-lower-lds. (llvm#121214)
"infer-address-spaces" pass replaces all refinable generic pointers with equivalent specific pointers.
At -O0 optimisation level, infer-address-spaces pass doesn't run in the pipeline.
"amdgpu-sw-lower-lds" pass instruments memory operations on addrspace(3) ptrs. Since, extra addrspacecasts are present from lds to flat addrspaces at -O0 and the actual store/load memory instructions are now on flat addrspace, these addrspacecast need to be handled in the amdgpu-sw-lower-lds pass itself. This patch lowers the lds ptr first to the corresponding ptr in the global memory from the asan_malloc. Then replaces the original cast with addrspacecast from global ptr to flat ptr.
Example:

Before the infer-address-spaces pass:

  %asc = addrspacecast ptr addrspace(3) @lds to ptr
  %gep = getelementptr inbounds [5 x i32], ptr %asc, i64 0, i64 0
  store i32 1, ptr %gep, align 4

After the infer-address-spaces pass:

  %gep = getelementptr inbounds [5 x i32], ptr addrspace(3) @lds, i64 0, i64 0
  store i32 1, ptr addrspace(3) %gep, align 4

Without the infer-address-spaces pass and with this patch in amdgpu-sw-lower-lds:

  ; get the corresponding global memory ptr from the asan_malloc allocation.
  %load = load ptr addrspace(1), ptr addrspace(3) @<lds.kernel.global>, align 8
  %gep1 = getelementptr inbounds i8, ptr addrspace(3) @<lds.kernel.global>, i32
  %ptoi = ptrtoint ptr addrspace(3) %gep1 to i32
  %gep2 = getelementptr inbounds i8, ptr addrspace(1) %load, i32 %ptoi
  %asc1 = addrspacecast ptr addrspace(1) %gep2 to ptr
  %gep = getelementptr inbounds [5 x i32], ptr %asc1, i64 0, i64 0
  store i32 1, ptr %gep, align 4