1
- ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2
1
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
3
2
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
4
3
4
+ ; CHECK-LABEL: @merge_v2i32_v2i32(
5
+ ; CHECK: load <4 x i32>
6
+ ; CHECK: store <4 x i32> zeroinitializer
5
7
define amdgpu_kernel void @merge_v2i32_v2i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
6
- ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
7
- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
8
- ; CHECK-NEXT: [[ENTRY:.*:]]
9
- ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4
10
- ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
11
- ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
12
- ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
13
- ; CHECK-NEXT: ret void
14
- ;
15
8
entry:
16
9
%a.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %a , i64 1
17
10
%b.1 = getelementptr inbounds <2 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -25,16 +18,10 @@ entry:
25
18
ret void
26
19
}
27
20
21
+ ; CHECK-LABEL: @merge_v1i32_v1i32(
22
+ ; CHECK: load <2 x i32>
23
+ ; CHECK: store <2 x i32> zeroinitializer
28
24
define amdgpu_kernel void @merge_v1i32_v1i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
29
- ; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32(
30
- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
31
- ; CHECK-NEXT: [[ENTRY:.*:]]
32
- ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4
33
- ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer
34
- ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> <i32 1>
35
- ; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
36
- ; CHECK-NEXT: ret void
37
- ;
38
25
entry:
39
26
%a.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %a , i64 1
40
27
%b.1 = getelementptr inbounds <1 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -48,18 +35,12 @@ entry:
48
35
ret void
49
36
}
50
37
38
+ ; CHECK-LABEL: @no_merge_v3i32_v3i32(
39
+ ; CHECK: load <3 x i32>
40
+ ; CHECK: load <3 x i32>
41
+ ; CHECK: store <3 x i32> zeroinitializer
42
+ ; CHECK: store <3 x i32> zeroinitializer
51
43
define amdgpu_kernel void @no_merge_v3i32_v3i32 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
52
- ; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32(
53
- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
54
- ; CHECK-NEXT: [[ENTRY:.*:]]
55
- ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1
56
- ; CHECK-NEXT: [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1
57
- ; CHECK-NEXT: [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4
58
- ; CHECK-NEXT: [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4
59
- ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
60
- ; CHECK-NEXT: store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4
61
- ; CHECK-NEXT: ret void
62
- ;
63
44
entry:
64
45
%a.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %a , i64 1
65
46
%b.1 = getelementptr inbounds <3 x i32 >, ptr addrspace (1 ) %b , i64 1
@@ -73,16 +54,10 @@ entry:
73
54
ret void
74
55
}
75
56
57
+ ; CHECK-LABEL: @merge_v2i16_v2i16(
58
+ ; CHECK: load <4 x i16>
59
+ ; CHECK: store <4 x i16> zeroinitializer
76
60
define amdgpu_kernel void @merge_v2i16_v2i16 (ptr addrspace (1 ) nocapture %a , ptr addrspace (1 ) nocapture readonly %b ) #0 {
77
- ; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16(
78
- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
79
- ; CHECK-NEXT: [[ENTRY:.*:]]
80
- ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4
81
- ; CHECK-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
82
- ; CHECK-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
83
- ; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4
84
- ; CHECK-NEXT: ret void
85
- ;
86
61
entry:
87
62
%a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %a , i64 1
88
63
%b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (1 ) %b , i64 1
@@ -96,27 +71,15 @@ entry:
96
71
ret void
97
72
}
98
73
74
+ ; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
75
+ ; CHECK-OOB-RELAXED: load <4 x i16>
76
+ ; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
77
+ ; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
78
+ ; CHECK-OOB-STRICT: load <2 x i16>
79
+ ; CHECK-OOB-STRICT: load <2 x i16>
80
+ ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
81
+ ; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
99
82
define amdgpu_kernel void @merge_fat_ptrs (ptr addrspace (7 ) nocapture %a , ptr addrspace (7 ) nocapture readonly %b ) #0 {
100
- ; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
101
- ; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
102
- ; CHECK-OOB-RELAXED-NEXT: [[ENTRY:.*:]]
103
- ; CHECK-OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4
104
- ; CHECK-OOB-RELAXED-NEXT: [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
105
- ; CHECK-OOB-RELAXED-NEXT: [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
106
- ; CHECK-OOB-RELAXED-NEXT: store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
107
- ; CHECK-OOB-RELAXED-NEXT: ret void
108
- ;
109
- ; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
110
- ; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
111
- ; CHECK-OOB-STRICT-NEXT: [[ENTRY:.*:]]
112
- ; CHECK-OOB-STRICT-NEXT: [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1
113
- ; CHECK-OOB-STRICT-NEXT: [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[B]], i32 1
114
- ; CHECK-OOB-STRICT-NEXT: [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4
115
- ; CHECK-OOB-STRICT-NEXT: [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4
116
- ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
117
- ; CHECK-OOB-STRICT-NEXT: store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4
118
- ; CHECK-OOB-STRICT-NEXT: ret void
119
- ;
120
83
entry:
121
84
%a.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %a , i32 1
122
85
%b.1 = getelementptr inbounds <2 x i16 >, ptr addrspace (7 ) %b , i32 1
@@ -131,15 +94,10 @@ entry:
131
94
}
132
95
133
96
; FIXME: Ideally the i32 load and the <2 x i16> load would be merged, but they are currently left as two separate loads.
97
+ ; CHECK-LABEL: @merge_load_i32_v2i16(
98
+ ; CHECK: load i32,
99
+ ; CHECK: load <2 x i16>
134
100
define amdgpu_kernel void @merge_load_i32_v2i16 (ptr addrspace (1 ) nocapture %a ) #0 {
135
- ; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
136
- ; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
137
- ; CHECK-NEXT: [[ENTRY:.*:]]
138
- ; CHECK-NEXT: [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
139
- ; CHECK-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
140
- ; CHECK-NEXT: [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
141
- ; CHECK-NEXT: ret void
142
- ;
143
101
entry:
144
102
%a.1 = getelementptr inbounds i32 , ptr addrspace (1 ) %a , i32 1
145
103
0 commit comments