-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[SLP]Remove operands upon marking instruction for deletion. #97409
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Remove operands upon marking instruction for deletion. #97409
Conversation
Created using spr 1.3.5
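@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) Changes: If an instruction is marked for deletion, it is better to drop all of its operands and mark them for deletion too (if allowed). This enables more vectorizable patterns and generates fewer useless extractelement instructions. Patch is 48.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97409.diff 21 Files Affected: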
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e233de89a33f1..2638bd0ef8720 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1163,6 +1163,15 @@ class BoUpSLP {
return VectorizableTree.front()->Scalars;
}
+ /// Checks if the root graph node can be emitted with narrower bitwidth at
+ /// codegen and returns it signedness, if so.
+ std::optional<bool> isSignedMinBitwidthRootNode() const {
+ auto It = MinBWs.find(VectorizableTree.front().get());
+ if (It == MinBWs.end())
+ return std::nullopt;
+ return It->second.second;
+ }
+
/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -3795,7 +3804,7 @@ class BoUpSLP {
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
- void scheduleBlock(BlockScheduling *BS);
+ void scheduleBlock(BlockScheduling *BS, BoUpSLP &R);
/// List of users to ignore during scheduling and that don't need extracting.
const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
@@ -13524,7 +13533,7 @@ Value *BoUpSLP::vectorizeTree(
Instruction *ReductionRoot) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
- scheduleBlock(BSIter.second.get());
+ scheduleBlock(BSIter.second.get(), *this);
}
// Clean Entry-to-LastInstruction table. It can be affected after scheduling,
// need to rebuild it.
@@ -14064,11 +14073,21 @@ Value *BoUpSLP::vectorizeTree(
}
#endif
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- eraseInstruction(cast<Instruction>(Scalar));
+ auto *I = cast<Instruction>(Scalar);
+ // Clear the operands, marking for deletion trivially dead operands.
+ for (unsigned Idx : seq<unsigned>(I->getNumOperands())) {
+ Value *Op = I->getOperand(Idx);
+ I->setOperand(Idx, PoisonValue::get(Op->getType()));
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (!isDeleted(OpI) && isInstructionTriviallyDead(OpI, TLI) &&
+ Entry->VectorizedValue != OpI)
+ eraseInstruction(OpI);
+ }
+ eraseInstruction(I);
// Retain to-be-deleted instructions for some debug-info
// bookkeeping. NOTE: eraseInstruction only marks the instruction for
// deletion - instructions are not deleted until later.
- RemovedInsts.push_back(cast<Instruction>(Scalar));
+ RemovedInsts.push_back(I);
}
}
@@ -14681,6 +14700,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
for (; DepDest; DepDest = DepDest->NextLoadStore) {
assert(isInSchedulingRegion(DepDest));
+ if (SLP->isDeleted(DepDest->Inst))
+ continue;
// We have two limits to reduce the complexity:
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -14750,7 +14771,7 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
ReadyInsts.clear();
}
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(BlockScheduling *BS, BoUpSLP &R) {
if (!BS->ScheduleStart)
return;
@@ -14807,6 +14828,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
for (ScheduleData *BundleMember = Picked; BundleMember;
BundleMember = BundleMember->NextInBundle) {
Instruction *PickedInst = BundleMember->Inst;
+ if (R.isDeleted(PickedInst))
+ continue;
if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
PickedInst->moveAfter(LastScheduledInst->getPrevNode());
LastScheduledInst = PickedInst;
@@ -17344,14 +17367,11 @@ class HorizontalReduction {
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (ReducedSubTree->getType() != VL.front()->getType()) {
- ReducedSubTree = Builder.CreateIntCast(
- ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
- KnownBits Known = computeKnownBits(
- R, cast<Instruction>(ReductionOps.front().front())
- ->getModule()
- ->getDataLayout());
- return !Known.isNonNegative();
- }));
+ assert(ReducedSubTree->getType() != VL.front()->getType() &&
+ "Expected different reduction type.");
+ ReducedSubTree =
+ Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
+ *V.isSignedMinBitwidthRootNode());
}
// Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17513,10 +17533,19 @@ class HorizontalReduction {
}
#endif
if (!Ignore->use_empty()) {
- Value *Undef = UndefValue::get(Ignore->getType());
- Ignore->replaceAllUsesWith(Undef);
+ Value *P = PoisonValue::get(Ignore->getType());
+ Ignore->replaceAllUsesWith(P);
+ }
+ auto *I = cast<Instruction>(Ignore);
+ // Clear the operands, marking for deletion trivially dead operands.
+ for (unsigned Idx : seq<unsigned>(I->getNumOperands())) {
+ Value *Op = I->getOperand(Idx);
+ I->setOperand(Idx, PoisonValue::get(Op->getType()));
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (!V.isDeleted(OpI) && isInstructionTriviallyDead(OpI, &TLI))
+ V.eraseInstruction(OpI);
}
- V.eraseInstruction(cast<Instruction>(Ignore));
+ V.eraseInstruction(I);
}
}
} else if (!CheckForReusedReductionOps) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index 24c5fcb068086..8c4903dbc92bb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -503,10 +503,10 @@ define void @add_v64i8() {
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SSE-NEXT: ret void
;
@@ -522,10 +522,10 @@ define void @add_v64i8() {
; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SLM-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index fab022d691c07..cb8d45b1a21a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -401,10 +401,10 @@ define void @add_v64i8() {
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SSE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
index dafed43e6e71c..a7ae2d9e02ff4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -439,10 +439,10 @@ define void @add_v64i8() {
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SSE-NEXT: ret void
;
@@ -458,10 +458,10 @@ define void @add_v64i8() {
; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SLM-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
index e4c76daddb02e..d4eafdeb50a47 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
@@ -520,10 +520,10 @@ define void @smul_v64i8() {
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SSE-NEXT: ret void
;
@@ -539,10 +539,10 @@ define void @smul_v64i8() {
; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SLM-NEXT: ret void
;
@@ -1323,10 +1323,10 @@ define void @umul_v64i8() {
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SSE-NEXT: ret void
;
@@ -1342,10 +1342,10 @@ define void @umul_v64i8() {
; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
; SLM-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 9b8480cd0088a..16977c025e3ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -480,10 +480,10 @@ define void @fshl_v64i8() {
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
; SSE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
index daf28b9a0bb4d..609a9024e5bf7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
@@ -575,21 +575,21 @@ define void @fshl_v64i8() {
; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1
; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1
; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]])
+; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.f...
[truncated]
|
Created using spr 1.3.5
"Expected different reduction type."); | ||
ReducedSubTree = | ||
Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(), | ||
*V.isSignedMinBitwidthRootNode()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is it safe to directly dereference an optional?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I reworked the function: if the types do not match, it means that the root node was resized by the minbitwidth analysis, so there is signedness info in the MinBWs map for the root node.
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one minor
if (auto *OpI = dyn_cast<Instruction>(Op)) | ||
if (!isDeleted(OpI) && isInstructionTriviallyDead(OpI, TLI) && | ||
Entry->VectorizedValue != OpI) | ||
eraseInstruction(OpI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similar code to the version in HorizontalReduction - pull out into a small helper?
If an instruction is marked for deletion, it is better to drop all of its operands and mark them for deletion too (if allowed). This enables more vectorizable patterns and generates fewer useless extractelement instructions. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: llvm#97409
If an instruction is marked for deletion, it is better to drop all of its operands and mark them for deletion too (if allowed). This enables more vectorizable patterns and generates fewer useless extractelement instructions. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #97409
Hi @alexey-bataev, we are seeing an assertion failure in internal tests which I bisected back to this change. What is odd is that it does not hit the assertion failure reliably, so there seems to be some element of randomness to it. I have put the repro and assertion failure stack trace in issue #98133. Can you take a look? |
This change has caused failed asserts. Strangely enough, the failed asserts aren't entirely deterministic though. Reduced testcase: int b;
float c;
int e;
void f(float p1[]) {
float *j = &b;
for (e = 0; e < 2;) {
float a;
float d;
{
a += p1[e] * 0;
d += e * j[e];
}
e = a;
c = d;
}
} To reproduce:
Please have a look, and consider reverting if fixing takes a long time. |
More info about the assert in case it's helpful: We typically see this assert when a pass sets an insertion point to the beginning of a block or the first non-phi using an instruction pointer rather than an iterator. DbgRecords and the new iterator bit are documented over here. I'm happy to help with any debug-info questions. |
Hi, thanks for the reports, will fix ASAP |
Must be fixed in 2cba218 |
Hi - Chrome is seeing crashes due to this PR as well (https://crbug.com/351866598). Unfortunately, 2cba218 does not fix the crash in our case. |
Can you provide a reproducer? |
See repro.tar.gz I'll see if I can get a reduced repro. |
Ok, able to reproduce, will try to investigate |
Here's a (much) reduced repro with CReduce: int a, b, c;
double d, e, f, g;
void h() {
do {
while (a)
;
d = f - (0 * f + e * g);
g = e * f - 0 * g + g;
f = d * g;
a = a - c;
} while (b);
} Reduced command: |
I reduced already, the fix will be ready soon |
Must be fixed in af21bc1 |
Confirmed. Thanks for the extremely quick fix! |
Another crash bisected to this patch: It still crashes on current top of tree 015526b . |
Fixed in 3742c2a |
Confirmed that it fixes the crash. |
We also see crashes after this commit. None of the fixes resolve the crashes, the crashes reproduce at ToT as of now (5b0dba13a5632d944d1eac8b39f44f65ec524e86). No assertion failures are visible, just a crash with this call stack:
In our case it's only reproducible when compiling internal code with thinlto, which makes producing an isolated test case more problematic. |
Need a reproducer! |
Just got to this, which crashes
|
Thanks, will fix ASAP |
Fixed in 9e261c5 |
Thanks! I'll let you know if I see any more related issues. |
If an instruction is marked for deletion, it is better to drop all of its
operands and mark them for deletion too (if allowed). This enables
more vectorizable patterns and generates fewer useless extractelement
instructions.