[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. #83514

dtemirbulatov · 2024-03-01T01:30:52Z

Allow folding of or/and-and into the BSL instruction for scalable vectors.

llvmbot · 2024-03-01T01:31:22Z

@llvm/pr-subscribers-backend-aarch64

Author: Dinar Temirbulatov (dtemirbulatov)

Changes

Allow folding of or/and-and into the BSL instruction for scalable vectors.

Full diff: https://github.com/llvm/llvm-project/pull/83514.diff

2 Files Affected:

(modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+23-16)
(added) llvm/test/CodeGen/AArch64/sve2-bsl.ll (+21)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1677df56e1bea..7c922d9dd12412 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17594,16 +17594,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
 
   if (!VT.isVector())
     return SDValue();
 
-  // The combining code currently only works for NEON vectors. In particular,
-  // it does not work for SVE when dealing with vectors wider than 128 bits.
-  // It also doesn't work for streaming mode because it causes generating
-  // bsl instructions that are invalid in streaming mode.
-  if (TLI.useSVEForFixedLengthVectorVT(
-          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+  // The combining code works for NEON, SVE2 and SME.
+  if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+      (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
@@ -17660,23 +17658,32 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     for (int j = 1; j >= 0; --j) {
       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
-      if (!BVN0 || !BVN1)
+      APInt Val1, Val2;
+      if ((!BVN0 || !BVN1) &&
+          (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+           !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
         continue;
 
       bool FoundMatch = true;
-      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
-        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
-        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
-        if (!CN0 || !CN1 ||
-            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
-          FoundMatch = false;
-          break;
+      if (BVN0) {
+        for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+          ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+          ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+          if (!CN0 || !CN1 ||
+              CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+            FoundMatch = false;
+            break;
+          }
         }
+      } else {
+        FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
       }
 
-      if (FoundMatch)
-        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
+      if (FoundMatch) {
+        SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
+      }
     }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
new file mode 100644
index 00000000000000..00ace4ebdb91c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: bsl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  ret void
+}

sdesmalen-arm · 2024-03-01T09:37:53Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)


Can this use the fancy new splat syntax now?

Suggested change

%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)

%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)

%3 = and <vscale x 4 x i32> %1, <vscale x 4 x i32> splat(i32 2147483647)

%4 = and <vscale x 4 x i32> %2, <vscale x 4 x i32> splat(i32 -2147483648)

sdesmalen-arm · 2024-03-01T09:52:49Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %5 = or disjoint <vscale x 4 x i32> %3, %4


I see you've added disjoint here, which makes me wonder if it's sufficient in AArch64ISelLowering to simply check for the disjoint flag. It seems that the disjoint flag is already added automatically by one of the optimiser passes: https://godbolt.org/z/brGW79jzd

No, I don't think so. According to https://llvm.org/docs/LangRef.html: disjoint means that for each bit, that bit is zero in at least one of the inputs. This allows the Or to be treated as an Add since no carry can occur from any bit. If the disjoint keyword is present, the result value of the or is a poison value if both inputs have a one in the same bit position. For vectors, only the element containing the bit is poison.

Hmm, this flag was set by ScalarEvolution, and if our lowering procedure excludes that pass then we would not be able to find this opportunity.

Why do you think this is set by ScalarEvolution? From what I can see when trying out a simple example that contains no loops (i.e. not requiring scalar evolution), it is added by InstCombine which is always run (https://godbolt.org/z/zY9f7E97j)

sdesmalen-arm · 2024-03-01T09:52:58Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {


Can you also add a negative test?

paulwalker-arm · 2024-03-11T18:09:12Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  ret void
+}
+
+define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {


Please add either a more descriptive name or a function comment that documents why there should be no bsl emitted.

paulwalker-arm · 2024-03-11T18:16:22Z

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+  if (!N->getFlags().hasDisjoint() &&
+      (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND))


This looks weird to me. You're basically saying we can skip the early return when N has the disjoint flag? But isn't knowing that the operands are AND instructions fundamental to the algorithm?

paulwalker-arm · 2024-03-11T18:23:25Z

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+        SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),


Can SDValue(Arg, 0) just be N0->getOperand(i)?

paulwalker-arm · 2024-03-11T18:38:10Z

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+      APInt Val1, Val2;
+      if ((!BVN0 || !BVN1) &&
+          (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+           !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))


This is subjective so it's up to you but if we're dismissing the possibility of mixing build vectors and splat vectors then it would be simpler to just enter the loops with:

APInt Val1, Val2; if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) && ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) && ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue())) return DAG.getNode(AArch64ISD::BSP, ....

github-actions · 2024-03-25T20:20:33Z

✅ With the latest revision this PR passed the Python code formatter.

github-actions · 2024-03-25T20:20:33Z

✅ With the latest revision this PR passed the C/C++ code formatter.

paulwalker-arm

A couple of requests but otherwise the patch looks good to me.

paulwalker-arm · 2024-04-04T09:36:01Z

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));


These casts are not used by the new code so really belong just before their null checks.

paulwalker-arm · 2024-04-04T09:41:00Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  ret void
+}
+
+; we are not expecting bsl instruction here.


Whilst the comment is true I can get this from the function name. What is missing is the reason we don't expect a bsl instruction (i.e. what specifically is the test verifying? overlapping masks? extra uses of instruction results? wrong types?).

paulwalker-arm · 2024-04-04T09:49:19Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4


Are the loads and store necessary to test the combine? I'm guessing not, and if that's true then please simplify the new tests to remove any unnecessary IR.

paulwalker-arm

Sorry I missed something important during my previous review, plus I've a couple more suggestions to simplify the tests further.

paulwalker-arm · 2024-04-09T12:24:30Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %c = or disjoint <vscale x 4 x i32> %1, %2
+  store <vscale x 4 x i32> %c, ptr %ptr1, align 4


Is the store required? Why not just return %c?

paulwalker-arm · 2024-04-09T12:24:46Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %c = or disjoint <vscale x 4 x i32> %1, %2
+  store <vscale x 4 x i32> %c, ptr %ptr1, align 4


As above, can you just return %c here?

paulwalker-arm · 2024-04-09T12:28:32Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+; CHECK-NEXT:    ret
+  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %c = or disjoint <vscale x 4 x i32> %1, %2


Sorry I missed this during my previous review, but the use of disjoint here is bogus because I think it does present the opportunity to use a bsl instruction, which would invalidate the test. I think it's better to remove the disjoint keyword from this and your other test to be more specific about what we're testing.

paulwalker-arm · 2024-04-09T12:31:17Z

llvm/test/CodeGen/AArch64/sve2-bsl.ll

+; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  %1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)


Please can you follow Sander's suggestion of using splat() rather than the insertelement->shufflevector combo, because support for the latter will be removed in the future.

paulwalker-arm · 2024-04-09T16:10:19Z

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

-          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+  // The combining code works for NEON, SVE2 and SME.
+  if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+      (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))


Just a note to say we currently have an issue with using hasSVE2orSME like this because the presence of SME doesn't necessarily mean we can execute SVE instructions. I'm going to ignore it for this patch because it'll be fixed in the future and we'll need to check all uses of hasSVE2orSME at that point anyway.

If you're feeling paranoid you can also play it safe and just use hasSVE2 for now and we can enable the SME path once the new interfaces are available.

Allow folding of or/and-and into the BSL instruction for scalable vectors.

tryCombineToBSL().

…ubtarget.hasSVE2orSME() could support SME without SVE2.

llvmbot added the backend:AArch64 label Mar 1, 2024

dtemirbulatov requested review from davemgreen, paulwalker-arm, huntergr-arm and sdesmalen-arm March 1, 2024 08:55

sdesmalen-arm reviewed Mar 1, 2024

View reviewed changes

paulwalker-arm reviewed Mar 11, 2024

View reviewed changes

paulwalker-arm reviewed Apr 4, 2024

View reviewed changes

paulwalker-arm reviewed Apr 9, 2024

View reviewed changes

paulwalker-arm approved these changes Apr 9, 2024

View reviewed changes

Dinar Temirbulatov added 7 commits April 9, 2024 16:53

[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling.

d0c1517

Allow folding of or/and-and into the BSL instruction for scalable vectors.

Added negative test.

8399103

Removed disjoint flag from the test.

28b4607

Restored disjoint in the sve2-bsl.ll, Check disjoint first in

f868577

tryCombineToBSL().

Resolve comments.

73eba66

Resolve comments.

71f33c9

Resolve comments.

7540ebb

dtemirbulatov force-pushed the sve2-bsl branch from 816f60b to 7540ebb Compare April 9, 2024 17:18

Changed Subtarget.hasSVE2orSME() to Subtarget.hasSVE2(), since with S…

44cb8c2

…ubtarget.hasSVE2orSME() could support SME without SVE2.

dtemirbulatov merged commit 990c4bc into llvm:main Apr 10, 2024

dtemirbulatov deleted the sve2-bsl branch April 10, 2024 10:08

		%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
		%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)

		if (!N->getFlags().hasDisjoint() &&
		(N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND))

		SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
		return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),

		BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
		BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));

		%1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
		%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4

[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. #83514

[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. #83514

Uh oh!

Conversation

dtemirbulatov commented Mar 1, 2024

Uh oh!

llvmbot commented Mar 1, 2024

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Mar 25, 2024

Uh oh!

github-actions bot commented Mar 25, 2024

Uh oh!

paulwalker-arm left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

paulwalker-arm left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

paulwalker-arm Apr 9, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

paulwalker-arm Apr 9, 2024 •

edited

Loading