-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. #83514
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Dinar Temirbulatov (dtemirbulatov) ChangesAllow to fold or/and-and to BSL instuction for scalable vectors. Full diff: https://github.com/llvm/llvm-project/pull/83514.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b1677df56e1bea..7c922d9dd12412 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17594,16 +17594,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
if (!VT.isVector())
return SDValue();
- // The combining code currently only works for NEON vectors. In particular,
- // it does not work for SVE when dealing with vectors wider than 128 bits.
- // It also doesn't work for streaming mode because it causes generating
- // bsl instructions that are invalid in streaming mode.
- if (TLI.useSVEForFixedLengthVectorVT(
- VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
+ // The combining code works for NEON, SVE2 and SME.
+ if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
+ (VT.isScalableVector() && !Subtarget.hasSVE2orSME()))
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -17660,23 +17658,32 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
for (int j = 1; j >= 0; --j) {
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
- if (!BVN0 || !BVN1)
+ APInt Val1, Val2;
+ if ((!BVN0 || !BVN1) &&
+ (!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) ||
+ !ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2)))
continue;
bool FoundMatch = true;
- for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
- ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
- ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
- if (!CN0 || !CN1 ||
- CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
- FoundMatch = false;
- break;
+ if (BVN0) {
+ for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+ ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+ ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+ if (!CN0 || !CN1 ||
+ CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+ FoundMatch = false;
+ break;
+ }
}
+ } else {
+ FoundMatch = ((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue());
}
- if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
+ if (FoundMatch) {
+ SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode();
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
}
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-bsl.ll b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
new file mode 100644
index 00000000000000..00ace4ebdb91c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bsl.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
+
+define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) {
+; CHECK-LABEL: bsl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z0.s, #0x7fffffff
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT: bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT: st1w { z1.s }, p0, [x2]
+; CHECK-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %ptr1, align 4
+ %2 = load <vscale x 4 x i32>, ptr %ptr2, align 4
+ %3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+ %5 = or disjoint <vscale x 4 x i32> %3, %4
+ store <vscale x 4 x i32> %5, ptr %ptr3, align 4
+ ret void
+}
|
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this use the fancy new splat
syntax now?
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | |
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | |
%3 = and <vscale x 4 x i32> %1, <vscale x 4 x i32> splat(i32 2147483647 | |
%4 = and <vscale x 4 x i32> %2, <vscale x 4 x i32> splat(i32 -2147483648) |
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4 | ||
%3 = and <vscale x 4 x i32> %1, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%4 = and <vscale x 4 x i32> %2, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%5 = or disjoint <vscale x 4 x i32> %3, %4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see you've added disjoint
here, which makes me wonder if it's sufficient in AArch64ISelLowering to simply check for the disjoint
flag. It seems that the disjoint
flag is already added automatically by one of the optimiser passes: https://godbolt.org/z/brGW79jzd
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No. I don't think so accoding to https://llvm.org/docs/LangRef.html : disjoint means that for each bit, that bit is zero in at least one of the inputs. This allows the Or to be treated as an Add since no carry can occur from any bit. If the disjoint keyword is present, the result value of the or is a poison value if both inputs have a one in the same bit position. For vectors, only the element containing the bit is poison.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, This flag was set by ScalarEvaluation and if our lowering procedure excludes that pass then we would not be able to find this opportunity.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you think this is set by ScalarEvolution? From what I can see when trying out a simple example that contains no loops (i.e. not requiring scalar evolution), it is added by InstCombine which is always run (https://godbolt.org/z/zY9f7E97j)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK | ||
|
||
define void @bsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you also add a negative test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
ret void | ||
} | ||
|
||
define void @nobsl(ptr %ptr1, ptr %ptr2, ptr %ptr3) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add either a more descriptive name or a function comment that documents why there should be no bsl emitted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
if (!N->getFlags().hasDisjoint() && | ||
(N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks weird to me. You're basically saying we can skip the early return when N
has the disjoint
flag? but knowing the operands are and
instructions is fundamental to the algorithm?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
SDNode *Arg = (BVN0) ? BVN0 : N0->getOperand(i).getNode(); | ||
return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(Arg, 0), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can SDValue(Arg, 0)
just be N0->getOperand(i)
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
APInt Val1, Val2; | ||
if ((!BVN0 || !BVN1) && | ||
(!ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) || | ||
!ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is subjective so it's up to you but if we're dismissing the possibility of mixing build vectors and splat vectors then it would be simpler to just enter the loops with:
APInt Val1, Val2;
if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
((BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()))
return DAG.getNode(AArch64ISD::BSP, ....
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
✅ With the latest revision this PR passed the Python code formatter. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A couple of requests but otherwise the patch looks good to me.
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); | ||
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These casts are not used by the new code so really belong just before their null checks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
ret void | ||
} | ||
|
||
; we are not expecting bsl instruction here. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Whilst the comment is true I can get this from the function name. What is missing is the reason we don't expect a bsl instruction (i.e. what specifically is the test verifying? overlapping masks? extra uses of instruction results? wrong types?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
%1 = load <vscale x 4 x i32>, ptr %ptr1, align 4 | ||
%2 = load <vscale x 4 x i32>, ptr %ptr2, align 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are the loads and store are necessary to test the combine? I'm guessing not, which if true then please simplify the new tests to remove any unnecessary IR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I missed something important during my previous review, plus I've a couple more suggestions to simplify the tests further.
%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 -2147483648, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%c = or disjoint <vscale x 4 x i32> %1, %2 | ||
store <vscale x 4 x i32> %c, ptr %ptr1, align 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the store required? Why not just return %c
?
%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%c = or disjoint <vscale x 4 x i32> %1, %2 | ||
store <vscale x 4 x i32> %c, ptr %ptr1, align 4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As above, can you just return %c
here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
; CHECK-NEXT: ret | ||
%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%2 = and <vscale x 4 x i32> %b, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483646, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) | ||
%c = or disjoint <vscale x 4 x i32> %1, %2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I missed this during my previous review but the use of disjoints
here is bogus because I think it does presents the opportunity to use a bsl
instruction, which would invalidate the test. I think it's better to remove the disjoint
keyword from this and your other test to be more specific about what we're testing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d | ||
; CHECK-NEXT: st1w { z0.s }, p0, [x0] | ||
; CHECK-NEXT: ret | ||
%1 = and <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2147483647, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please can you follow Sander's suggestion of using splat()
rather that the insertelement->shufflevector
combo, because support for the latter will be removed in the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())) | ||
// The combining code works for NEON, SVE2 and SME. | ||
if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) || | ||
(VT.isScalableVector() && !Subtarget.hasSVE2orSME())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just a note to say we currently have an issue with using hasSVE2orSME
like this because the presence of SME doesn't necessarily mean we can execute SVE instructions. I'm going to ignore it for this patch because it'll be fixed in the future and we'll need to check all uses of hasSVE2orSME
at that point anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you're feeling paranoid you can also play it safe and just use hasSVE2
for now and we can enable the SME path once the new interfaces are available.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
Allow to fold or/and-and to BSL instuction for scalable vectors.
tryCombineToBSL().
…ubtarget.hasSVE2orSME() could support SME without SVE2.
Allow to fold or/and-and to BSL instuction for scalable vectors.