From 452e13e849a2deecfaaedeb453813745beb25f1b Mon Sep 17 00:00:00 2001
From: Anshil Gandhi
Date: Thu, 1 May 2025 10:42:27 -0500
Subject: [PATCH 1/2] [NFC] Precommit autogenerated test

Replace the hand-written CHECK lines in merge-vectors.ll with assertions
generated by utils/update_test_checks.py (UTC_ARGS: --version 5). Each
function body is now pinned line by line with CHECK-NEXT, and
merge_fat_ptrs keeps separate expectations for the CHECK-OOB-RELAXED and
CHECK-OOB-STRICT prefixes. The test inputs and RUN lines are unchanged.
---
 .../AMDGPU/merge-vectors.ll                   | 92 ++++++++++++++-----
 1 file changed, 67 insertions(+), 25 deletions(-)

diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index 318e55c748f7f..d6b51039d5b44 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -1,10 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
 
-; CHECK-LABEL: @merge_v2i32_v2i32(
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32> zeroinitializer
 define amdgpu_kernel void @merge_v2i32_v2i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @merge_v2i32_v2i32(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 4
+; CHECK-NEXT:    [[LD_C1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[LD_C_IDX_12:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %a.1 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %a, i64 1
   %b.1 = getelementptr inbounds <2 x i32>, ptr addrspace(1) %b, i64 1
@@ -18,10 +25,16 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @merge_v1i32_v1i32(
-; CHECK: load <2 x i32>
-; CHECK: store <2 x i32> zeroinitializer
 define amdgpu_kernel void @merge_v1i32_v1i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @merge_v1i32_v1i32(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(1) [[B]], align 4
+; CHECK-NEXT:    [[LD_C1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[LD_C_IDX_12:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %a.1 = getelementptr inbounds <1 x i32>, ptr addrspace(1) %a, i64 1
   %b.1 = getelementptr inbounds <1 x i32>, ptr addrspace(1) %b, i64 1
@@ -35,12 +48,18 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @no_merge_v3i32_v3i32(
-; CHECK: load <3 x i32>
-; CHECK: load <3 x i32>
-; CHECK: store <3 x i32> zeroinitializer
-; CHECK: store <3 x i32> zeroinitializer
 define amdgpu_kernel void @no_merge_v3i32_v3i32(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @no_merge_v3i32_v3i32(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[A]], i64 1
+; CHECK-NEXT:    [[B_1:%.*]] = getelementptr inbounds <3 x i32>, ptr addrspace(1) [[B]], i64 1
+; CHECK-NEXT:    [[LD_C:%.*]] = load <3 x i32>, ptr addrspace(1) [[B]], align 4
+; CHECK-NEXT:    [[LD_C_IDX_1:%.*]] = load <3 x i32>, ptr addrspace(1) [[B_1]], align 4
+; CHECK-NEXT:    store <3 x i32> zeroinitializer, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    store <3 x i32> zeroinitializer, ptr addrspace(1) [[A_1]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %a.1 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a, i64 1
   %b.1 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b, i64 1
@@ -54,10 +73,16 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @merge_v2i16_v2i16(
-; CHECK: load <4 x i16>
-; CHECK: store <4 x i16> zeroinitializer
 define amdgpu_kernel void @merge_v2i16_v2i16(ptr addrspace(1) nocapture %a, ptr addrspace(1) nocapture readonly %b) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @merge_v2i16_v2i16(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]], ptr addrspace(1) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 4
+; CHECK-NEXT:    [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    store <4 x i16> zeroinitializer, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a, i64 1
   %b.1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b, i64 1
@@ -71,15 +96,27 @@ entry:
   ret void
 }
 
-; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
-; CHECK-OOB-RELAXED: load <4 x i16>
-; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
-; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
-; CHECK-OOB-STRICT: load <2 x i16>
-; CHECK-OOB-STRICT: load <2 x i16>
-; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
-; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
 define amdgpu_kernel void @merge_fat_ptrs(ptr addrspace(7) nocapture %a, ptr addrspace(7) nocapture readonly %b) #0 {
+; CHECK-OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
+; CHECK-OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
+; CHECK-OOB-RELAXED-NEXT:  [[ENTRY:.*:]]
+; CHECK-OOB-RELAXED-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(7) [[B]], align 4
+; CHECK-OOB-RELAXED-NEXT:    [[LD_C1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-OOB-RELAXED-NEXT:    [[LD_C_IDX_12:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-OOB-RELAXED-NEXT:    store <4 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
+; CHECK-OOB-RELAXED-NEXT:    ret void
+;
+; CHECK-OOB-STRICT-LABEL: define amdgpu_kernel void @merge_fat_ptrs(
+; CHECK-OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[A:%.*]], ptr addrspace(7) readonly captures(none) [[B:%.*]]) #[[ATTR0]] {
+; CHECK-OOB-STRICT-NEXT:  [[ENTRY:.*:]]
+; CHECK-OOB-STRICT-NEXT:    [[A_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[A]], i32 1
+; CHECK-OOB-STRICT-NEXT:    [[B_1:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(7) [[B]], i32 1
+; CHECK-OOB-STRICT-NEXT:    [[LD_C:%.*]] = load <2 x i16>, ptr addrspace(7) [[B]], align 4
+; CHECK-OOB-STRICT-NEXT:    [[LD_C_IDX_1:%.*]] = load <2 x i16>, ptr addrspace(7) [[B_1]], align 4
+; CHECK-OOB-STRICT-NEXT:    store <2 x i16> zeroinitializer, ptr addrspace(7) [[A]], align 4
+; CHECK-OOB-STRICT-NEXT:    store <2 x i16> zeroinitializer, ptr addrspace(7) [[A_1]], align 4
+; CHECK-OOB-STRICT-NEXT:    ret void
+;
 entry:
   %a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %a, i32 1
   %b.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %b, i32 1
@@ -94,10 +131,15 @@ entry:
 }
 
 ; Ideally this would be merged
-; CHECK-LABEL: @merge_load_i32_v2i16(
-; CHECK: load i32,
-; CHECK: load <2 x i16>
 define amdgpu_kernel void @merge_load_i32_v2i16(ptr addrspace(1) nocapture %a) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @merge_load_i32_v2i16(
+; CHECK-SAME: ptr addrspace(1) captures(none) [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i32 1
+; CHECK-NEXT:    [[LD_0:%.*]] = load i32, ptr addrspace(1) [[A]], align 4
+; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x i16>, ptr addrspace(1) [[A_1]], align 4
+; CHECK-NEXT:    ret void
+;
 entry:
   %a.1 = getelementptr inbounds i32, ptr addrspace(1) %a, i32 1
 

From 1e55058622f5eb5c9f40fe1e21906cfcd12673ec Mon Sep 17 00:00:00 2001
From: Anshil Gandhi
Date: Thu, 1 May 2025 10:57:42 -0500
Subject: [PATCH 2/2] Add a complex test with mismatched types

Add merge-vectors-complex.ll, which runs the load-store-vectorizer on
loads from addrspace(1) and stores to addrspace(2) whose value types
differ between neighbouring accesses: i32, <2 x i16>, float and <4 x i8>;
float and <2 x half>; bfloat and <2 x half>; and pointers into two
different address spaces.

The autogenerated CHECK lines record the current output, in which every
load and store appears unchanged and none of the accesses are combined.
---
NOTE(review): %store.gep1 in merge_fp_v2half_type,
merge_v2half_bfloat_type and no_merge_mixed_ptr_addrspaces uses i32 as
the GEP source element type although the stored value is a float, a
bfloat or a pointer. With index 0 the address is the same either way;
this looks like a copy-paste leftover - please confirm.

NOTE(review): in merge_v2half_bfloat_type the bfloat access is 2 bytes
wide at offset 0, while the <2 x half> access starts at offset 4, so the
two accesses are separated by a 2-byte gap - please confirm the gap is
intended.

 .../AMDGPU/merge-vectors-complex.ll           | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll

diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
new file mode 100644
index 0000000000000..a6a6deb8d9899
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors-complex.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @merge_i32_2i16_float_4i8(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_i32_2i16_float_4i8(
+; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x i16>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 2
+; CHECK-NEXT:    [[LOAD3:%.*]] = load float, ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(1) [[PTR1]], i64 3
+; CHECK-NEXT:    [[LOAD4:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP4]], align 4
+; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-NEXT:    store i32 [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x i16>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT:    store <2 x i16> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    [[STORE_GEP3:%.*]] = getelementptr inbounds float, ptr addrspace(2) [[PTR2]], i64 2
+; CHECK-NEXT:    store float [[LOAD3]], ptr addrspace(2) [[STORE_GEP3]], align 4
+; CHECK-NEXT:    [[STORE_GEP4:%.*]] = getelementptr inbounds <4 x i8>, ptr addrspace(2) [[PTR2]], i64 3
+; CHECK-NEXT:    store <4 x i8> [[LOAD4]], ptr addrspace(2) [[STORE_GEP4]], align 4
+; CHECK-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds i32, ptr addrspace(1) %ptr1, i64 0
+  %load1 = load i32, ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %ptr1, i64 1
+  %load2 = load <2 x i16>, ptr addrspace(1) %gep2, align 4
+  %gep3 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 2
+  %load3 = load float, ptr addrspace(1) %gep3, align 4
+  %gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(1) %ptr1, i64 3
+  %load4 = load <4 x i8>, ptr addrspace(1) %gep4, align 4
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store i32 %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds <2 x i16>, ptr addrspace(2) %ptr2, i64 1
+  store <2 x i16> %load2, ptr addrspace(2) %store.gep2, align 4
+  %store.gep3 = getelementptr inbounds float, ptr addrspace(2) %ptr2, i64 2
+  store float %load3, ptr addrspace(2) %store.gep3, align 4
+  %store.gep4 = getelementptr inbounds <4 x i8>, ptr addrspace(2) %ptr2, i64 3
+  store <4 x i8> %load4, ptr addrspace(2) %store.gep4, align 4
+  ret void
+}
+
+define void @merge_fp_v2half_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_fp_v2half_type(
+; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load float, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-NEXT:    store float [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds float, ptr addrspace(1) %ptr1, i64 0
+  %load1 = load float, ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %ptr1, i64 1
+  %load2 = load <2 x half>, ptr addrspace(1) %gep2, align 4
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store float %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(2) %ptr2, i64 1
+  store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4
+  ret void
+}
+
+define void @merge_v2half_bfloat_type(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @merge_v2half_bfloat_type(
+; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds bfloat, ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load bfloat, ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-NEXT:    store bfloat [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds <2 x half>, ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT:    store <2 x half> [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds bfloat, ptr addrspace(1) %ptr1, i64 0
+  %load1 = load bfloat, ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds <2 x half>, ptr addrspace(1) %ptr1, i64 1
+  %load2 = load <2 x half>, ptr addrspace(1) %gep2, align 4
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store bfloat %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(2) %ptr2, i64 1
+  store <2 x half> %load2, ptr addrspace(2) %store.gep2, align 4
+  ret void
+}
+
+define void @no_merge_mixed_ptr_addrspaces(ptr addrspace(1) %ptr1, ptr addrspace(2) %ptr2) {
+; CHECK-LABEL: define void @no_merge_mixed_ptr_addrspaces(
+; CHECK-SAME: ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(2) [[PTR2:%.*]]) {
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[PTR1]], i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) [[PTR1]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load ptr addrspace(2), ptr addrspace(1) [[GEP2]], align 4
+; CHECK-NEXT:    [[STORE_GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(2) [[PTR2]], i64 0
+; CHECK-NEXT:    store ptr addrspace(1) [[LOAD1]], ptr addrspace(2) [[STORE_GEP1]], align 4
+; CHECK-NEXT:    [[STORE_GEP2:%.*]] = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) [[PTR2]], i64 1
+; CHECK-NEXT:    store ptr addrspace(2) [[LOAD2]], ptr addrspace(2) [[STORE_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %gep1 = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) %ptr1, i64 0
+  %load1 = load ptr addrspace(1), ptr addrspace(1) %gep1, align 4
+  %gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(1) %ptr1, i64 1
+  %load2 = load ptr addrspace(2), ptr addrspace(1) %gep2, align 4
+  %store.gep1 = getelementptr inbounds i32, ptr addrspace(2) %ptr2, i64 0
+  store ptr addrspace(1) %load1, ptr addrspace(2) %store.gep1, align 4
+  %store.gep2 = getelementptr inbounds ptr addrspace(2), ptr addrspace(2) %ptr2, i64 1
+  store ptr addrspace(2) %load2, ptr addrspace(2) %store.gep2, align 4
+  ret void
+}