
Commit 4cd33e9

Do not use extractvalue if the inserted value is directly reachable
1 parent fff1773 commit 4cd33e9

8 files changed: +145 -260 lines changed

lib/Conversion/TritonGPUToLLVM/Utility.cpp (+16 -5)

@@ -562,11 +562,22 @@ SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
     return {llvmStruct};
   ArrayRef<Type> types =
       cast<LLVM::LLVMStructType>(llvmStruct.getType()).getBody();
-  SmallVector<Value> results(types.size());
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  for (unsigned i = 0; i < types.size(); ++i) {
-    Type type = types[i];
-    results[i] = b.extract_val(type, llvmStruct, i);
+  unsigned remaining = types.size();
+  SmallVector<Value> results(remaining);
+  // If llvmStruct is an InsertValueOp, iterate up over the chain of
+  // InsertValueOps and get the inserted values instead of extracting
+  // from the struct.
+  for (auto ins = llvmStruct.getDefiningOp<LLVM::InsertValueOp>();
+       ins && ins.getPosition()[0] == remaining - 1;
+       ins = ins.getContainer().getDefiningOp<LLVM::InsertValueOp>()) {
+    results[--remaining] = ins.getValue();
+  }
+  if (remaining) {
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+    for (unsigned i = 0; i < remaining; ++i) {
+      Type type = types[i];
+      results[i] = b.extract_val(type, llvmStruct, i);
+    }
   }
   return results;
 }
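The effect is easiest to see on the LLVM-dialect IR that unpackLLElements consumes. Below is a minimal, hypothetical sketch (the function name and the values %a, %b are illustrative, not taken from the diff): when the struct being unpacked was just assembled by a complete llvm.insertvalue chain, the new code walks the chain from the last inserted index backwards and reuses the inserted values directly; llvm.extractvalue is only emitted as a fallback for whatever leading elements the chain does not cover.

// Hypothetical pack-then-unpack pattern (illustrative names):
llvm.func @pack_then_unpack(%a: f32, %b: f32) -> f32 {
  %undef = llvm.mlir.undef : !llvm.struct<(f32, f32)>
  %s0 = llvm.insertvalue %a, %undef[0] : !llvm.struct<(f32, f32)>
  %s1 = llvm.insertvalue %b, %s0[1] : !llvm.struct<(f32, f32)>
  // Before this commit, unpacking %s1 re-extracted every element:
  //   %e0 = llvm.extractvalue %s1[0] : !llvm.struct<(f32, f32)>
  //   %e1 = llvm.extractvalue %s1[1] : !llvm.struct<(f32, f32)>
  // After this commit, unpackLLElements returns {%a, %b} directly,
  // so the add below uses the original values.
  %sum = llvm.fadd %a, %b : f32
  llvm.return %sum : f32
}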

test/Conversion/amd/buffer_load_store.mlir (+4 -4)

@@ -28,7 +28,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
     %4 = arith.addi %3, %2 : tensor<128xi32, #blocked0>
     %5 = tt.splat %N: i32 -> tensor<128xi32, #blocked0>
     %7 = arith.cmpi slt, %4, %5: tensor<128xi32, #blocked0>
-    // CHECK: %[[mask:.*]] = llvm.extractvalue %{{.*}} : !llvm.struct<(i1, i1, i1, i1)>
+    // CHECK: %[[mask:.*]] = llvm.icmp "slt"
     // CHECK: %[[offset:.*]] = llvm.select %[[mask]]
     // CHECK: rocdl.raw.ptr.buffer.load {{.*}}, %[[offset]]
     %ret = amdgpu.buffer_load %arg0[%offset], %7 stride = %c256_i32 : tensor<128xf32, #blocked0>
@@ -51,7 +51,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
     %5 = tt.splat %N: i32 -> tensor<128xi32, #blocked0>
     %7 = arith.cmpi slt, %4, %5: tensor<128xi32, #blocked0>
     %other = arith.constant dense<0.00e+00> : tensor<128xf32, #blocked0>
-    // CHECK: %[[mask:.*]] = llvm.extractvalue %{{.*}} : !llvm.struct<(i1, i1, i1, i1)>
+    // CHECK: %[[mask:.*]] = llvm.icmp "slt"
     // CHECK: %[[offset:.*]] = llvm.select %[[mask]]
     // CHECK: rocdl.raw.ptr.buffer.load {{.*}}, %[[offset]]
     // CHECK: llvm.select
@@ -90,7 +90,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
     %4 = arith.addi %3, %2 : tensor<128xi32, #blocked0>
     %5 = tt.splat %N: i32 -> tensor<128xi32, #blocked0>
     %7 = arith.cmpi slt, %4, %5: tensor<128xi32, #blocked0>
-    // CHECK: %[[mask0:.*]] = llvm.extractvalue %{{.*}} : !llvm.struct<(i1, i1, i1, i1)>
+    // CHECK: %[[mask0:.*]] = llvm.icmp "slt"
     // CHECK: %[[mask1:.*]] = llvm.mlir.constant(true) : i1
     // CHECK: %[[mask2:.*]] = llvm.and %[[mask1]], %[[mask0]]
     // CHECK: %[[offset:.*]] = llvm.select %[[mask2]]
@@ -216,7 +216,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
     %4 = arith.addi %3, %2 : tensor<128xi32, #blocked0>
     %5 = tt.splat %N: i32 -> tensor<128xi32, #blocked0>
     %mask = arith.cmpi slt, %4, %5: tensor<128xi32, #blocked0>
-    // CHECK: %[[mask0:.*]] = llvm.extractvalue %{{.*}} : !llvm.struct<(i1, i1, i1, i1)>
+    // CHECK: %[[mask0:.*]] = llvm.icmp "slt"
     // There should be a single release fence before any atomics
     // CHECK: llvm.fence syncscope("agent") release
     // CHECK: %[[mask1:.*]] = llvm.mlir.constant(true) : i1
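The updated CHECK lines reflect that the buffer-load mask no longer takes a detour through the packed !llvm.struct<(i1, i1, i1, i1)>: it is reachable straight from the comparison. A rough sketch of the shape the checks now match, with illustrative value names that are not part of the test:

llvm.func @masked_offset(%idx: i32, %bound: i32, %off: i32, %oob: i32) -> i32 {
  // The mask is the result of the comparison itself ...
  %mask = llvm.icmp "slt" %idx, %bound : i32
  // ... and feeds the select that picks the offset handed to the buffer load.
  %offset = llvm.select %mask, %off, %oob : i1, i32
  llvm.return %offset : i32
}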

test/Conversion/intel/dot_layout_offset.mlir (+1 -5)

@@ -6,7 +6,6 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
   // CHECK-LABEL: llvm.func spir_kernelcc @dot_layout_emit_offset()
   tt.func public @dot_layout_emit_offset() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dot_operand_a>
-    // CHECK-COUNT-64: {{.*}} = llvm.extractvalue {{.*}}

     // COM: Base index of the dot layout.
     // CHECK: %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj
@@ -327,11 +326,8 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.thr
   // CHECK-LABEL: llvm.func spir_kernelcc @dot_layout_emit_offset()
   tt.func public @dot_layout_emit_offset() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dot_operand_b>
-    // CHECK-COUNT-64: {{.*}} = llvm.extractvalue {{.*}}
-    // CHECK: %[[VAL_142:.*]] = llvm.mlir.constant(0 : i32) : i32
-
     // COM: Base index of the dot layout.
-    // CHECK: %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj(%[[VAL_142]])
+    // CHECK: %[[THREAD_ID_I64:.*]] = llvm.call spir_funccc @_Z12get_local_idj
     // CHECK: %[[THREAD_ID_I32:.*]] = llvm.trunc %[[THREAD_ID_I64]] : i64 to i32
     // CHECK: %[[VAL_145:.*]] = llvm.mlir.constant(16 : i32) : i32
     // CHECK: %[[LANE_ID:.*]] = llvm.urem %[[THREAD_ID_I32]], %[[VAL_145]] : i32

test/Conversion/intel/tritongpu_to_gen.mlir (+6 -12)

@@ -499,13 +499,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: basic_view_broadcast
   tt.func @basic_view_broadcast(%arg : tensor<256xf32,#blocked0>) {
-    // CHECK: [[ARG0_0:%.*]] = llvm.extractvalue %arg0[0]
-    // CHECK-NEXT: [[ARG0_1:%.*]] = llvm.extractvalue %arg0[1]
-    // CHECK-NEXT: [[STRUCT:%.*]] = llvm.mlir.undef : !llvm.struct<(f32, f32)>
-    // CHECK-NEXT: [[STRUCT1:%.*]] = llvm.insertvalue [[ARG0_0]], [[STRUCT]][0]
-    // CHECK-NEXT: [[STRUCT2:%.*]] = llvm.insertvalue [[ARG0_1]], [[STRUCT1]][1]
-    // CHECK-NEXT: [[T0:%.*]] = llvm.extractvalue [[STRUCT2]][0]
-    // CHECK-NEXT: [[T1:%.*]] = llvm.extractvalue [[STRUCT2]][1]
+    // CHECK: [[T0:%.*]] = llvm.extractvalue %arg0[0]
+    // CHECK-NEXT: [[T1:%.*]] = llvm.extractvalue %arg0[1]
     %0 = tt.reshape %arg allow_reorder : tensor<256xf32, #blocked0> -> tensor<256x1xf32,#blocked2>
     // CHECK: [[RES:%.*]] = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
     // CHECK-NEXT: [[RES1:%.*]] = llvm.insertvalue [[T0]], [[RES]][0]
@@ -1889,13 +1884,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 #blocked1 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: convert_single_element_and_add
-  // CHECK-NOT: llvm.store
-  // CHECK-NOT: llvm.load
-  // CHECK: llvm.insertvalue
-  // CHECK: llvm.extractvalue
+  // CHECK: llvm.mlir.constant(1.000000e+03 : f32) : f32
+  // CHECK: llvm.mlir.constant(2.000000e+03 : f32) : f32
+  // CHECK: llvm.fadd %{{.*}}, %{{.*}} : f32
   tt.func public @convert_single_element_and_add() attributes {noinline = false} {
     %cst = arith.constant dense<1.000000e+03> : tensor<1xf32, #blocked1>
-    %cst2 = arith.constant dense<1.000000e+03> : tensor<1xf32, #blocked>
+    %cst2 = arith.constant dense<2.000000e+03> : tensor<1xf32, #blocked>
     %0 = ttg.convert_layout %cst : tensor<1xf32, #blocked1> -> tensor<1xf32, #blocked>
     %1 = arith.addf %0, %cst2 : tensor<1xf32, #blocked>
     tt.return
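This hunk also changes %cst2 from 1.0e3 to 2.0e3, presumably so the two addends lower to distinguishable constants; the new checks then describe the lowered arithmetic directly instead of requiring an insertvalue/extractvalue round-trip. A hypothetical sketch of the shape the updated CHECK lines match (function and value names are illustrative):

llvm.func @single_element_add() -> f32 {
  // Each operand becomes its own scalar constant ...
  %c1 = llvm.mlir.constant(1.000000e+03 : f32) : f32
  %c2 = llvm.mlir.constant(2.000000e+03 : f32) : f32
  // ... and the add consumes them without any struct packing in between.
  %sum = llvm.fadd %c1, %c2 : f32
  llvm.return %sum : f32
}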

test/Conversion/tritongpu_to_llvm.mlir (+2 -2)

@@ -426,7 +426,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32} {
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: basic_view_broadcast
   tt.func @basic_view_broadcast(%arg : tensor<256xf32,#blocked0>) {
-    // CHECK: llvm.mlir.undef
     // CHECK: %[[T0:.*]] = llvm.extractvalue
     // CHECK: %[[T1:.*]] = llvm.extractvalue
     %0 = tt.reshape %arg allow_reorder : tensor<256xf32, #blocked0> -> tensor<256x1xf32,#blocked2>
@@ -1967,8 +1966,9 @@ module attributes {"ttg.target" = "cuda:75", "ttg.num-ctas" = 1 : i32, "ttg.num-
   // CHECK-LABEL: convert_single_element_and_add
   // CHECK-NOT: llvm.store
   // CHECK-NOT: llvm.load
+  // CHECK: llvm.fadd
+  // CHECK: llvm.mlir.undef
   // CHECK: llvm.insertvalue
-  // CHECK: llvm.extractvalue
   tt.func public @convert_single_element_and_add() attributes {noinline = false} {
     %cst = arith.constant dense<1.000000e+03> : tensor<1xf32, #blocked1>
     %cst2 = arith.constant dense<1.000000e+03> : tensor<1xf32, #blocked>
