Skip to content

[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. #83514

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 10, 2024

Conversation

dtemirbulatov
Copy link
Contributor

Allow to fold or/and-and to BSL instuction for scalable vectors.

@llvmbot
Copy link
Member

llvmbot commented Mar 1, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Dinar Temirbulatov (dtemirbulatov)

Changes

Allow to fold or/and-and to BSL instuction for scalable vectors.


Full diff: https://github.com/llvm/llvm-project/pull/83514.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+23-16)
  • (added) llvm/test/CodeGen/AArch64/sve2-bsl.ll (+21)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1677df56e1bea..7c922d9dd12412 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17594,16 +17594,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
 
   if (!VT.isVector())
     return SDValue();
 
-  // The combining code currently only works for NEON vectors. In particular,
-  // it does not work for SVE when dealing with vectors wider than 128 bits.
-  // It also doesn't work for streaming mode because it causes generating
-  // bsl instructions that are invalid in streaming mode.
-  if (TLI.useSVEForFixedLengthVectorVT(
-          VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+  // The combining code works for NEON, SVE2 and SME.
+  if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+      (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
@@ -17660,23 +17658,32 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     for (int j = 1; j >= 0; --j) {
       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
-      if (!BVN0 || !BVN1)
+      APInt Val1, Val2;
+      if ((!BVN0 || !BVN1) &&
+          (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+           !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
         continue;
 
       bool FoundMatch = true;
-      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
-        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
-        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
-        if (!CN0 || !CN1 ||
-            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
-          FoundMatch = false;
-          break;
+      if (BVN0) {
+        for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+          ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+          ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+          if (!CN0 || !CN1 ||
+              CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+            FoundMatch = false;
+            break;
+          }
         }
+      } else {
+        FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
       }
 
-      if (FoundMatch)
-        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
+      if (FoundMatch) {
+        SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
+      }
     }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
new file mode 100644
index 00000000000000..00ace4ebdb91c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: bsl:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+  %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+  %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+  %5 = or disjoint <vscale x 4 x i32> %3, %4
+  store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+  ret void
+}

Comment on lines 16 to 17
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this use the fancy new splat syntax now?

Suggested change
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%3 = and <vscale x 4 x i32> %1, <vscale x 4 x i32> splat(i32 2147483647
%4 = and <vscale x 4 x i32> %2, <vscale x 4 x i32> splat(i32 -2147483648)

%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%5 = or disjoint <vscale x 4 x i32> %3, %4
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you've added disjoint here, which makes me wonder if it's sufficient in AArch64ISelLowering to simply check for the disjoint flag. It seems that the disjoint flag is already added automatically by one of the optimiser passes: https://godbolt.org/z/brGW79jzd

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. I don't think so accoding to https://llvm.org/docs/LangRef.html : disjoint means that for each bit, that bit is zero in at least one of the inputs. This allows the Or to be treated as an Add since no carry can occur from any bit. If the disjoint keyword is present, the result value of the or is a poison value if both inputs have a one in the same bit position. For vectors, only the element containing the bit is poison.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, This flag was set by ScalarEvaluation and if our lowering procedure excludes that pass then we would not be able to find this opportunity.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you think this is set by ScalarEvolution? From what I can see when trying out a simple example that contains no loops (i.e. not requiring scalar evolution), it is added by InstCombine which is always run (https://godbolt.org/z/zY9f7E97j)

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK

define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a negative test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

ret void
}

define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add either a more descriptive name or a function comment that documents why there should be no bsl emitted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Comment on lines 17609 to 17610
if (!N->getFlags().hasDisjoint() &&
(N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks weird to me. You're basically saying we can skip the early return when N has the disjoint flag? but knowing the operands are and instructions is fundamental to the algorithm?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Comment on lines 17681 to 17682
SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can SDValue(Arg, 0) just be N0->getOperand(i)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Comment on lines 17659 to 17662
APInt Val1, Val2;
if ((!BVN0 || !BVN1) &&
(!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
!ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is subjective so it's up to you but if we're dismissing the possibility of mixing build vectors and splat vectors then it would be simpler to just enter the loops with:

APInt Val1, Val2;
if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
    ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
    ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()))
  return DAG.getNode(AArch64ISD::BSP, ....
    

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link

✅ With the latest revision this PR passed the Python code formatter.

Copy link

✅ With the latest revision this PR passed the C/C++ code formatter.

Copy link
Collaborator

@paulwalker-arm paulwalker-arm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple of requests but otherwise the patch looks good to me.

Comment on lines 17659 to 17660
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These casts are not used by the new code so really belong just before their null checks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

ret void
}

; we are not expecting bsl instruction here.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whilst the comment is true I can get this from the function name. What is missing is the reason we don't expect a bsl instruction (i.e. what specifically is the test verifying? overlapping masks? extra uses of instruction results? wrong types?).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Comment on lines 14 to 15
%1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the loads and store are necessary to test the combine? I'm guessing not, which if true then please simplify the new tests to remove any unnecessary IR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Collaborator

@paulwalker-arm paulwalker-arm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I missed something important during my previous review, plus I've a couple more suggestions to simplify the tests further.

%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%c = or disjoint <vscale x 4 x i32> %1, %2
store <vscale x 4 x i32> %c, ptr %ptr1, align 4
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the store required? Why not just return %c?

%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%c = or disjoint <vscale x 4 x i32> %1, %2
store <vscale x 4 x i32> %c, ptr %ptr1, align 4
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above, can you just return %c here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

; CHECK-NEXT: ret
%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
%c = or disjoint <vscale x 4 x i32> %1, %2
Copy link
Collaborator

@paulwalker-arm paulwalker-arm Apr 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I missed this during my previous review but the use of disjoints here is bogus because I think it does presents the opportunity to use a bsl instruction, which would invalidate the test. I think it's better to remove the disjoint keyword from this and your other test to be more specific about what we're testing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please can you follow Sander's suggestion of using splat() rather that the insertelement->shufflevector combo, because support for the latter will be removed in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
// The combining code works for NEON, SVE2 and SME.
if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
(VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a note to say we currently have an issue with using hasSVE2orSME like this because the presence of SME doesn't necessarily mean we can execute SVE instructions. I'm going to ignore it for this patch because it'll be fixed in the future and we'll need to check all uses of hasSVE2orSME at that point anyway.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're feeling paranoid you can also play it safe and just use hasSVE2 for now and we can enable the SME path once the new interfaces are available.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

…ubtarget.hasSVE2orSME()

could support SME without SVE2.
@dtemirbulatov dtemirbulatov merged commit 990c4bc into llvm:main Apr 10, 2024
@dtemirbulatov dtemirbulatov deleted the sve2-bsl branch April 10, 2024 10:08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants