Skip to content

Commit 990c4bc

Browse files
author
Dinar Temirbulatov
authored
[AArch64][SVE2] Generate SVE2 BSL instruction in LLVM for bit-twiddling. (llvm#83514)
Allow folding or/and-and patterns into the BSL instruction for scalable vectors.
1 parent 0e7d14d commit 990c4bc

File tree

2 files changed

+41
-8
lines changed

2 files changed

+41
-8
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17930,16 +17930,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1793017930
EVT VT = N->getValueType(0);
1793117931
SelectionDAG &DAG = DCI.DAG;
1793217932
SDLoc DL(N);
17933+
const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
1793317934

1793417935
if (!VT.isVector())
1793517936
return SDValue();
1793617937

17937-
// The combining code currently only works for NEON vectors. In particular,
17938-
// it does not work for SVE when dealing with vectors wider than 128 bits.
17939-
// It also doesn't work for streaming mode because it causes generating
17940-
// bsl instructions that are invalid in streaming mode.
17941-
if (TLI.useSVEForFixedLengthVectorVT(
17942-
VT, !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()))
17938+
// The combining code works for NEON, SVE2 and SME.
17939+
if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
17940+
(VT.isScalableVector() && !Subtarget.hasSVE2()))
1794317941
return SDValue();
1794417942

1794517943
SDValue N0 = N->getOperand(0);
@@ -17994,6 +17992,14 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1799417992
uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
1799517993
for (int i = 1; i >= 0; --i)
1799617994
for (int j = 1; j >= 0; --j) {
17995+
APInt Val1, Val2;
17996+
17997+
if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
17998+
ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
17999+
(BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18000+
return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18001+
N0->getOperand(1 - i), N1->getOperand(1 - j));
18002+
}
1799718003
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
1799818004
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
1799918005
if (!BVN0 || !BVN1)
@@ -18009,9 +18015,8 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1800918015
break;
1801018016
}
1801118017
}
18012-
1801318018
if (FoundMatch)
18014-
return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
18019+
return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
1801518020
N0->getOperand(1 - i), N1->getOperand(1 - j));
1801618021
}
1801718022

llvm/test/CodeGen/AArch64/sve2-bsl.ll

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK
3+
4+
define <vscale x 4 x i32> @bsl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
5+
; CHECK-LABEL: bsl:
6+
; CHECK: // %bb.0:
7+
; CHECK-NEXT: mov z2.s, #0x7fffffff
8+
; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d
9+
; CHECK-NEXT: ret
10+
%1 = and <vscale x 4 x i32> %a, splat(i32 2147483647)
11+
%2 = and <vscale x 4 x i32> %b, splat(i32 -2147483648)
12+
%c = or <vscale x 4 x i32> %1, %2
13+
ret <vscale x 4 x i32> %c
14+
}
15+
16+
; We do not expect a bsl instruction here: the two splat masks are not bitwise complements of each other, so the or/and-and pattern cannot be folded to bsl.
17+
define <vscale x 4 x i32> @no_bsl_fold(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
18+
; CHECK-LABEL: no_bsl_fold:
19+
; CHECK: // %bb.0:
20+
; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff
21+
; CHECK-NEXT: and z1.s, z1.s, #0x7ffffffe
22+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
23+
; CHECK-NEXT: ret
24+
%1 = and <vscale x 4 x i32> %a, splat(i32 2147483647)
25+
%2 = and <vscale x 4 x i32> %b, splat(i32 2147483646)
26+
%c = or <vscale x 4 x i32> %1, %2
27+
ret <vscale x 4 x i32> %c
28+
}

0 commit comments

Comments
 (0)