Skip to content

[SelectionDAG] Preserve volatile undef stores. #99918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21413,7 +21413,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}

// Turn 'store undef, Ptr' -> nothing.
if (Value.isUndef() && ST->isUnindexed())
if (Value.isUndef() && ST->isUnindexed() && !ST->isVolatile())
return Chain;

// Try to infer better alignment information than the store already has.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,6 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
%queue.load = load volatile i8, ptr addrspace(4) %queue.ptr
%implicitarg.load = load volatile i8, ptr addrspace(4) %implicitarg.ptr
%dispatch.load = load volatile i8, ptr addrspace(4) %dispatch.ptr
store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
ret void
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(ptr addrspace(
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 999) #1
%bc = bitcast i64 %undef to <2 x i32>
store volatile <2 x i32> %bc, ptr addrspace(1) %out
store <2 x i32> %bc, ptr addrspace(1) %out
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't know why these tests are losing volatile

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, the removal of the volatile preserves the original intent of the test, that there is no store_dword. I'm not really sure what we should here since this test is explicitly checking for the absence of a store.

ret void
}

Expand All @@ -83,7 +83,7 @@ define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractel
%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 9999) #1
%bc = bitcast i64 %undef to <2 x i32>
%elt1 = extractelement <2 x i32> %bc, i32 1
store volatile i32 %elt1, ptr addrspace(1) %out
store i32 %elt1, ptr addrspace(1) %out
ret void
}

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ define void @func_use_lds_global() {
ret void
}

; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (): local memory global used by non-kernel function
define void @func_use_lds_global_constexpr_cast() {
; ERR: warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (ptr addrspace(1)): local memory global used by non-kernel function
define void @func_use_lds_global_constexpr_cast(ptr addrspace(1) %out) {
; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
; GFX8-SDAG: ; %bb.0:
; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Expand Down Expand Up @@ -153,7 +153,7 @@ define void @func_use_lds_global_constexpr_cast() {
; GISEL-NEXT: s_setpc_b64 s[30:31]
; GISEL-NEXT: .LBB1_2:
; GISEL-NEXT: s_endpgm
store volatile i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) poison, align 4
store i32 ptrtoint (ptr addrspace(3) @lds to i32), ptr addrspace(1) %out, align 4
ret void
}

Expand Down
23 changes: 17 additions & 6 deletions llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-SDAG %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-SDAG %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH-GISEL %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
Expand Down Expand Up @@ -126,10 +126,21 @@ define amdgpu_cs void @caller() {
declare amdgpu_gfx void @callee(i32)

define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) {
; GFX9-LABEL: workgroup_ids_gfx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-SDAG-LABEL: workgroup_ids_gfx:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v0, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v[2:3], v0, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v[4:5], v0, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: workgroup_ids_gfx:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9ARCH-SDAG-LABEL: workgroup_ids_gfx:
; GFX9ARCH-SDAG: ; %bb.0:
Expand Down
36 changes: 18 additions & 18 deletions llvm/test/CodeGen/AMDGPU/mem-builtins.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,64 +9,64 @@ declare hidden i32 @strnlen(ptr nocapture, i32) #1
declare hidden i32 @strcmp(ptr nocapture, ptr nocapture) #1


; ERROR: error: <unknown>:0:0: in function test_memcmp void (ptr addrspace(1), ptr addrspace(1), ptr): unsupported call to function memcmp
; ERROR: error: <unknown>:0:0: in function test_memcmp void (ptr addrspace(1), ptr addrspace(1), ptr, ptr addrspace(1)): unsupported call to function memcmp

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, memcmp@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memcmp@rel32@hi+12
define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p) #0 {
define amdgpu_kernel void @test_memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, ptr nocapture %p, ptr addrspace(1) %out) #0 {
entry:
%cmp = tail call i32 @memcmp(ptr addrspace(1) %x, ptr addrspace(1) %y, i64 2)
store volatile i32 %cmp, ptr addrspace(1) undef
store i32 %cmp, ptr addrspace(1) %out
ret void
}

; ERROR: error: <unknown>:0:0: in function test_memchr void (ptr addrspace(1), i32, i64): unsupported call to function memchr
; ERROR: error: <unknown>:0:0: in function test_memchr void (ptr addrspace(1), i32, i64, ptr addrspace(1)): unsupported call to function memchr

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, memchr@rel32@hi+12
define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len) #0 {
define amdgpu_kernel void @test_memchr(ptr addrspace(1) %src, i32 %char, i64 %len, ptr addrspace(1) %out) #0 {
%res = call ptr addrspace(1) @memchr(ptr addrspace(1) %src, i32 %char, i64 %len)
store volatile ptr addrspace(1) %res, ptr addrspace(1) undef
store ptr addrspace(1) %res, ptr addrspace(1) %out
ret void
}

; ERROR: error: <unknown>:0:0: in function test_strcpy void (ptr, ptr): unsupported call to function strcpy
; ERROR: error: <unknown>:0:0: in function test_strcpy void (ptr, ptr, ptr addrspace(1)): unsupported call to function strcpy

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcpy@rel32@hi+12
define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src) #0 {
define amdgpu_kernel void @test_strcpy(ptr %dst, ptr %src, ptr addrspace(1) %out) #0 {
%res = call ptr @strcpy(ptr %dst, ptr %src)
store volatile ptr %res, ptr addrspace(1) undef
store ptr %res, ptr addrspace(1) %out
ret void
}

; ERROR: error: <unknown>:0:0: in function test_strcmp void (ptr, ptr): unsupported call to function strcmp
; ERROR: error: <unknown>:0:0: in function test_strcmp void (ptr, ptr, ptr addrspace(1)): unsupported call to function strcmp

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strcmp@rel32@hi+12
define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1) #0 {
define amdgpu_kernel void @test_strcmp(ptr %src0, ptr %src1, ptr addrspace(1) %out) #0 {
%res = call i32 @strcmp(ptr %src0, ptr %src1)
store volatile i32 %res, ptr addrspace(1) undef
store i32 %res, ptr addrspace(1) %out
ret void
}

; ERROR: error: <unknown>:0:0: in function test_strlen void (ptr): unsupported call to function strlen
; ERROR: error: <unknown>:0:0: in function test_strlen void (ptr, ptr addrspace(1)): unsupported call to function strlen

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strlen@rel32@hi+12
define amdgpu_kernel void @test_strlen(ptr %src) #0 {
define amdgpu_kernel void @test_strlen(ptr %src, ptr addrspace(1) %out) #0 {
%res = call i32 @strlen(ptr %src)
store volatile i32 %res, ptr addrspace(1) undef
store i32 %res, ptr addrspace(1) %out
ret void
}

; ERROR: error: <unknown>:0:0: in function test_strnlen void (ptr, i32): unsupported call to function strnlen
; ERROR: error: <unknown>:0:0: in function test_strnlen void (ptr, i32, ptr addrspace(1)): unsupported call to function strnlen

; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen@rel32@lo+4
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, strnlen@rel32@hi+12
define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size) #0 {
define amdgpu_kernel void @test_strnlen(ptr %src, i32 %size, ptr addrspace(1) %out) #0 {
%res = call i32 @strnlen(ptr %src, i32 %size)
store volatile i32 %res, ptr addrspace(1) undef
store i32 %res, ptr addrspace(1) %out
ret void
}

Expand Down
58 changes: 58 additions & 0 deletions llvm/test/CodeGen/NVPTX/store-undef.ll
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,61 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) {
store %struct.T %S2, ptr %out
ret void
}

define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) {
; CHECK-LABEL: test_store_volatile_undef(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<23>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_undef_param_0];
; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
; CHECK-NEXT: st.volatile.u64 [%rd1], %rd2;
; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1];
; CHECK-NEXT: ld.param.v4.u32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16];
; CHECK-NEXT: st.volatile.v4.u32 [%rd3], {%r11, %r12, %r13, %r14};
; CHECK-NEXT: st.volatile.v4.u32 [%rd4], {%r7, %r8, %r9, %r10};
; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r15, %r16, %r17, %r18};
; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r19, %r20, %r21, %r22};
; CHECK-NEXT: ret;
store volatile %struct.T undef, ptr %out
store volatile <8 x i32> %vec, ptr undef
store volatile <8 x i32> undef, ptr %out
Copy link
Contributor

@gonzalobg gonzalobg Aug 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So just for my understanding, we are essentially lowering a volatile store of undef:

store volatile i32 undef, ptr %out;

to

{
   .reg .b32 %reg;
   st.volatile.u32 [%out], %reg;
}

?

And similar for a volatile store of poison ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gonzalobg, yes. In this case both will be lowered to st.volatile.u32

ret void
}

define void @test_store_volatile_of_poison(ptr %out) {
; CHECK-LABEL: test_store_volatile_of_poison(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_of_poison_param_0];
; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: st.volatile.v2.u32 [%rd1+8], {%r5, %r6};
; CHECK-NEXT: st.volatile.u64 [%rd1], %rd2;
; CHECK-NEXT: ret;
store volatile %struct.T poison, ptr %out
ret void
}

define void @test_store_volatile_to_poison(%struct.T %param) {
; CHECK-LABEL: test_store_volatile_to_poison(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_to_poison_param_0];
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_store_volatile_to_poison_param_0+8];
; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16];
; CHECK-NEXT: st.volatile.v4.u32 [%rd2], {%r3, %r4, %r5, %r6};
; CHECK-NEXT: st.volatile.v2.u32 [%rd3], {%r1, %r2};
; CHECK-NEXT: st.volatile.u64 [%rd4], %rd1;
; CHECK-NEXT: ret;
store volatile %struct.T %param, ptr poison
ret void
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test a vector, and store to poison

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Specifically a vector that requires legalization splitting

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(curious only): Are atomic undef stores preserved as well?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I see those get turned into unreachable in opt -O3

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And atomic volatile stores?

Loading