Skip to content

[SLP]Remove operands upon marking instruction for deletion. #97409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conversation

alexey-bataev
Copy link
Member

If the instruction is marked for deletion, better to drop all its
operands and mark them for deletion too (if allowed). It allows to have
more vectorizable patterns and generate less useless extractelement
instructions.

Created using spr 1.3.5
@llvmbot
Copy link
Member

llvmbot commented Jul 2, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

If the instruction is marked for deletion, better to drop all its
operands and mark them for deletion too (if allowed). It allows to have
more vectorizable patterns and generate less useless extractelement
instructions.


Patch is 48.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97409.diff

21 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+45-16)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll (+2-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll (+2-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll (+4-4)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll (+10-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll (+10-10)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll (+3-3)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll (+2-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll (+2-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll (+3-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll (+1-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll (+1-1)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e233de89a33f1..2638bd0ef8720 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1163,6 +1163,15 @@ class BoUpSLP {
     return VectorizableTree.front()->Scalars;
   }
 
+  /// Checks if the root graph node can be emitted with narrower bitwidth at
+  /// codegen and returns it signedness, if so.
+  std::optional<bool> isSignedMinBitwidthRootNode() const {
+    auto It = MinBWs.find(VectorizableTree.front().get());
+    if (It == MinBWs.end())
+      return std::nullopt;
+    return It->second.second;
+  }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -3795,7 +3804,7 @@ class BoUpSLP {
 
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
-  void scheduleBlock(BlockScheduling *BS);
+  void scheduleBlock(BlockScheduling *BS, BoUpSLP &R);
 
   /// List of users to ignore during scheduling and that don't need extracting.
   const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
@@ -13524,7 +13533,7 @@ Value *BoUpSLP::vectorizeTree(
     Instruction *ReductionRoot) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    scheduleBlock(BSIter.second.get(), *this);
   }
   // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
   // need to rebuild it.
@@ -14064,11 +14073,21 @@ Value *BoUpSLP::vectorizeTree(
       }
 #endif
       LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
-      eraseInstruction(cast<Instruction>(Scalar));
+      auto *I = cast<Instruction>(Scalar);
+      // Clear the operands, marking for deletion trivially dead operands.
+      for (unsigned Idx : seq<unsigned>(I->getNumOperands())) {
+        Value *Op = I->getOperand(Idx);
+        I->setOperand(Idx, PoisonValue::get(Op->getType()));
+        if (auto *OpI = dyn_cast<Instruction>(Op))
+          if (!isDeleted(OpI) && isInstructionTriviallyDead(OpI, TLI) &&
+              Entry->VectorizedValue != OpI)
+            eraseInstruction(OpI);
+      }
+      eraseInstruction(I);
       // Retain to-be-deleted instructions for some debug-info
       // bookkeeping. NOTE: eraseInstruction only marks the instruction for
       // deletion - instructions are not deleted until later.
-      RemovedInsts.push_back(cast<Instruction>(Scalar));
+      RemovedInsts.push_back(I);
     }
   }
 
@@ -14681,6 +14700,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
 
       for (; DepDest; DepDest = DepDest->NextLoadStore) {
         assert(isInSchedulingRegion(DepDest));
+        if (SLP->isDeleted(DepDest->Inst))
+          continue;
 
         // We have two limits to reduce the complexity:
         // 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -14750,7 +14771,7 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   ReadyInsts.clear();
 }
 
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(BlockScheduling *BS, BoUpSLP &R) {
   if (!BS->ScheduleStart)
     return;
 
@@ -14807,6 +14828,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     for (ScheduleData *BundleMember = Picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
       Instruction *PickedInst = BundleMember->Inst;
+      if (R.isDeleted(PickedInst))
+        continue;
       if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
@@ -17344,14 +17367,11 @@ class HorizontalReduction {
         Value *ReducedSubTree =
             emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
         if (ReducedSubTree->getType() != VL.front()->getType()) {
-          ReducedSubTree = Builder.CreateIntCast(
-              ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
-                KnownBits Known = computeKnownBits(
-                    R, cast<Instruction>(ReductionOps.front().front())
-                           ->getModule()
-                           ->getDataLayout());
-                return !Known.isNonNegative();
-              }));
+          assert(ReducedSubTree->getType() != VL.front()->getType() &&
+                 "Expected different reduction type.");
+          ReducedSubTree =
+              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
+                                    *V.isSignedMinBitwidthRootNode());
         }
 
         // Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17513,10 +17533,19 @@ class HorizontalReduction {
           }
 #endif
           if (!Ignore->use_empty()) {
-            Value *Undef = UndefValue::get(Ignore->getType());
-            Ignore->replaceAllUsesWith(Undef);
+            Value *P = PoisonValue::get(Ignore->getType());
+            Ignore->replaceAllUsesWith(P);
+          }
+          auto *I = cast<Instruction>(Ignore);
+          // Clear the operands, marking for deletion trivially dead operands.
+          for (unsigned Idx : seq<unsigned>(I->getNumOperands())) {
+            Value *Op = I->getOperand(Idx);
+            I->setOperand(Idx, PoisonValue::get(Op->getType()));
+            if (auto *OpI = dyn_cast<Instruction>(Op))
+              if (!V.isDeleted(OpI) && isInstructionTriviallyDead(OpI, &TLI))
+                V.eraseInstruction(OpI);
           }
-          V.eraseInstruction(cast<Instruction>(Ignore));
+          V.eraseInstruction(I);
         }
       }
     } else if (!CheckForReusedReductionOps) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index 24c5fcb068086..8c4903dbc92bb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -503,10 +503,10 @@ define void @add_v64i8() {
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
@@ -522,10 +522,10 @@ define void @add_v64i8() {
 ; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index fab022d691c07..cb8d45b1a21a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -401,10 +401,10 @@ define void @add_v64i8() {
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
index dafed43e6e71c..a7ae2d9e02ff4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -439,10 +439,10 @@ define void @add_v64i8() {
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
@@ -458,10 +458,10 @@ define void @add_v64i8() {
 ; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
index e4c76daddb02e..d4eafdeb50a47 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
@@ -520,10 +520,10 @@ define void @smul_v64i8() {
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
@@ -539,10 +539,10 @@ define void @smul_v64i8() {
 ; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT:    ret void
 ;
@@ -1323,10 +1323,10 @@ define void @umul_v64i8() {
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
@@ -1342,10 +1342,10 @@ define void @umul_v64i8() {
 ; SLM-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 9b8480cd0088a..16977c025e3ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -480,10 +480,10 @@ define void @fshl_v64i8() {
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT:    store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT:    store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
 ; SSE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
index daf28b9a0bb4d..609a9024e5bf7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
@@ -575,21 +575,21 @@ define void @fshl_v64i8() {
 ; SSE-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
 ; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1
 ; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+; SSE-NEXT:    store <16 x i8> [[TMP4]], ptr @d8, align 1
 ; SSE-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
 ; SSE-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
 ; SSE-NEXT:    [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
 ; SSE-NEXT:    [[TMP8:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]])
+; SSE-NEXT:    store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
 ; SSE-NEXT:    [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT:    [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT:    [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT:    [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT:    [[TMP15:%.*]] = call <16 x i8> @llvm.f...
[truncated]

Created using spr 1.3.5
"Expected different reduction type.");
ReducedSubTree =
Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
*V.isSignedMinBitwidthRootNode());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it safe to directly dereference an optional?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reworked the function, if the types do not match, it means that the root node is resized due to minbitwidth analysis and there is signedness info in MinBWs map for the root node

Created using spr 1.3.5
Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM with one minor

if (auto *OpI = dyn_cast<Instruction>(Op))
if (!isDeleted(OpI) && isInstructionTriviallyDead(OpI, TLI) &&
Entry->VectorizedValue != OpI)
eraseInstruction(OpI);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar code to the version in HorizontalReduction - pull out into a small helper?

Created using spr 1.3.5
@alexey-bataev alexey-bataev merged commit bbd52dd into main Jul 3, 2024
3 of 5 checks passed
@alexey-bataev alexey-bataev deleted the users/alexey-bataev/spr/slpremove-operands-upon-marking-instruction-for-deletion branch July 3, 2024 19:11
kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024
If the instruction is marked for deletion, better to drop all its
operands and mark them for deletion too (if allowed). It allows to have
more vectorizable patterns and generate less useless extractelement
instructions.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm#97409
@alexey-bataev alexey-bataev restored the users/alexey-bataev/spr/slpremove-operands-upon-marking-instruction-for-deletion branch July 8, 2024 13:15
@alexey-bataev alexey-bataev deleted the users/alexey-bataev/spr/slpremove-operands-upon-marking-instruction-for-deletion branch July 8, 2024 13:20
alexey-bataev added a commit that referenced this pull request Jul 8, 2024
If the instruction is marked for deletion, better to drop all its
operands and mark them for deletion too (if allowed). It allows to have
more vectorizable patterns and generate less useless extractelement
instructions.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #97409
@dyung
Copy link
Collaborator

dyung commented Jul 9, 2024

Hi @alexey-bataev, we are seeing an assertion failure in internal tests which I bisected back to this change. What is odd is that it does not hit the assertion failure reliably, so there seems to be some element of randomness to it. I have put the repro and assertion failure stack trace in issue #98133. Can you take a look?

@mstorsjo
Copy link
Member

mstorsjo commented Jul 9, 2024

This change has caused failed asserts. Strangely enough, the failed asserts aren't entirely deterministic though. Reduced testcase:

int b;
float c;
int e;
void f(float p1[]) {
  float *j = &b;
  for (e = 0; e < 2;) {
    float a;
    float d;
    {
      a += p1[e] * 0;
      d += e * j[e];
    }
    e = a;
    c = d;
  }
}

To reproduce:

$ clang -target i686-linux-gnu -c repro.c -O2 -g
clang: ../lib/IR/Instruction.cpp:153: void llvm::Instruction::insertBefore(llvm:
:BasicBlock&, llvm::iplist_impl<llvm::simple_ilist<llvm::Instruction, llvm::ilis
t_iterator_bits<true>, llvm::ilist_parent<llvm::BasicBlock> >, llvm::SymbolTable
ListTraits<llvm::Instruction, llvm::ilist_iterator_bits<true>, llvm::ilist_paren
t<llvm::BasicBlock> > >::iterator): Assertion `!isa<PHINode>(this) && "Inserting
 PHI after debug-records!"' failed.

Please have a look, and consider reverting if fixing takes a long time.

@OCHyams
Copy link
Contributor

OCHyams commented Jul 9, 2024

More info about the assert in case its helpful:

We typically see this assert when a pass sets an insertion point to the beginning of a block or the first non-phi using an Instruction * rather than iterator (which carries a bit that describes whether the insertion point comes before the instruction only or before the instruction and the debug records attached to it). Scanning it I couldn't see anything obvious in this patch that would cause that assertion to fire - it's possible that this patch is just exercising a latent bug in SLP. I also have no clues as to why it's nondeterministic.

DbgRecords and the new iterator bit are documented over here. I'm happy to help with any debug-info questions.

@alexey-bataev
Copy link
Member Author

Hi, thanks for the reports, will fix ASAP

@alexey-bataev
Copy link
Member Author

Must be fixed in 2cba218

@alanzhao1
Copy link
Contributor

Hi - Chrome is seeing crashes due to this PR as well (https://crbug.com/351866598).

Unfortunately, 2cba218 does not fix the crash in our case.

@alexey-bataev
Copy link
Member Author

Hi - Chrome is seeing crashes due to this PR as well (https://crbug.com/351866598).

Unfortunately, 2cba218 does not fix the crash in our case.

Can you provide a reproducer?

@alanzhao1
Copy link
Contributor

See repro.tar.gz

I'll see if I can get a reduced repro.

@alexey-bataev
Copy link
Member Author

See repro.tar.gz

I'll see if I can get a reduced repro.

Ok, able to reproduce, will try to investigate

@alanzhao1
Copy link
Contributor

See repro.tar.gz
I'll see if I can get a reduced repro.

Ok, able to reproduce, will try to investigate

Here's a (much) reduced repro with CReduce:

int a, b, c;
double d, e, f, g;
void h() {
  do {
    while (a)
      ;
    d = f - (0 * f + e * g);
    g = e * f - 0 * g + g;
    f = d * g;
    a = a - c;
  } while (b);
}

Reduced command: clang -cc1 -triple i386-pc-windows-msvc19.34.0 -target-feature +sse3 -O2 -vectorize-slp -emit-llvm fft-06e938.reduced.c

@alexey-bataev
Copy link
Member Author

I reduced already, the fix will be ready soon

@alexey-bataev
Copy link
Member Author

Must be fixed in af21bc1

@alanzhao1
Copy link
Contributor

Must be fixed in af21bc1

Confirmed. Thanks for the extremely quick fix!

@mikaelholmen
Copy link
Collaborator

Another crash bisected to this patch:
opt -passes=slp-vectorizer bbi-97294.ll -o /dev/null

It still crashes on current top of tree 015526b .

bbi-97294.ll.gz

@alexey-bataev
Copy link
Member Author

Fixed in 3742c2a

@mikaelholmen
Copy link
Collaborator

Fixed in 3742c2a

Confirmed that it fixes the crash.

@alexfh
Copy link
Contributor

alexfh commented Jul 12, 2024

We also see crashes after this commit. None of the fixes resolve the crashes, the crashes reproduce at ToT as of now (5b0dba13a5632d944d1eac8b39f44f65ec524e86). No assertion failures are visible, just a crash with this call stack:

 #3 0x000055bfbd8e5211 llvm::CastInst::isNoopCast(llvm::DataLayout const&) const (bin/clang+0x80e5211)
 #4 0x000055bfbd2d5bce llvm::salvageDebugInfoImpl(llvm::Instruction&, unsigned long, llvm::SmallVectorImpl<unsigned long>&, llvm::SmallVectorImpl<llvm::Value*>&) (bin/clang+0x7ad5bce)
 #5 0x000055bfbd2d57f8 llvm::salvageDebugInfoForDbgValues(llvm::Instruction&, llvm::ArrayRef<llvm::DbgVariableIntrinsic*>, llvm::ArrayRef<llvm::DbgVariableRecord*>) (bin/clang+0x7ad57f8)
 #6 0x000055bfbd2cecb9 llvm::salvageDebugInfo(llvm::Instruction&) (bin/clang+0x7acecb9)
 #7 0x000055bfbce8d5b4 void llvm::slpvectorizer::BoUpSLP::removeInstructionsAndOperands<llvm::Instruction>(llvm::ArrayRef<llvm::Instruction*>) (bin/clang+0x768d5b4)
 #8 0x000055bfbce8ba2b llvm::slpvectorizer::BoUpSLP::vectorizeTree(llvm::MapVector<llvm::Value*, llvm::SmallVector<llvm::Instruction*, 2u>, llvm::DenseMap<llvm::Value*, unsigned int, llvm::DenseMapInfo<llvm::Value*, void>, llvm::detail::DenseMapPair<llvm::Value*, unsigned int>>, llvm::SmallVector<std::__u::pair<llvm::Value*, llvm::SmallVector<llvm::Instruction*, 2u>>, 0u>> const&, llvm::SmallVectorImpl<std::__u::pair<llvm::Value*, llvm::Value*>>&, llvm::Instruction*) (bin/clang+0x768ba2b)
 #9 0x000055bfbce88dd7 llvm::slpvectorizer::BoUpSLP::vectorizeTree() (bin/clang+0x7688dd7)
#10 0x000055bfbce98eb2 llvm::SLPVectorizerPass::vectorizeStoreChain(llvm::ArrayRef<llvm::Value*>, llvm::slpvectorizer::BoUpSLP&, unsigned int, unsigned int, unsigned int&) (bin/clang+0x7698eb2)
#11 0x000055bfbce9a8d9 llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__u::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__u::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::$_0::operator()(std::__u::set<std::__u::pair<unsigned int, int>, llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__u::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__u::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&)::StoreDistCompare, std::__u::allocator<std::__u::pair<unsigned int, int>>> const&) const (bin/clang+0x769a8d9)
#12 0x000055bfbce99d1b llvm::SLPVectorizerPass::vectorizeStores(llvm::ArrayRef<llvm::StoreInst*>, llvm::slpvectorizer::BoUpSLP&, llvm::DenseSet<std::__u::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, llvm::DenseMapInfo<std::__u::tuple<llvm::Value*, llvm::Value*, llvm::Value*, llvm::Value*, unsigned int>, void>>&) (bin/clang+0x7699d1b)
#13 0x000055bfbce95c02 llvm::SLPVectorizerPass::vectorizeStoreChains(llvm::slpvectorizer::BoUpSLP&) (bin/clang+0x7695c02)
#14 0x000055bfbce9487d llvm::SLPVectorizerPass::runImpl(llvm::Function&, llvm::ScalarEvolution*, llvm::TargetTransformInfo*, llvm::TargetLibraryInfo*, llvm::AAResults*, llvm::LoopInfo*, llvm::DominatorTree*, llvm::AssumptionCache*, llvm::DemandedBits*, llvm::OptimizationRemarkEmitter*) (bin/clang+0x769487d)
#15 0x000055bfbce94395 llvm::SLPVectorizerPass::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (bin/clang+0x7694395)
#16 0x000055bfbc1c1e92 llvm::detail::PassModel<llvm::Function, llvm::SLPVectorizerPass, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (bin/clang+0x69c1e92)
#17 0x000055bfbd934a29 llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (bin/clang+0x8134a29)
#18 0x000055bfb8dd1552 llvm::detail::PassModel<llvm::Function, llvm::PassManager<llvm::Function, llvm::AnalysisManager<llvm::Function>>, llvm::AnalysisManager<llvm::Function>>::run(llvm::Function&, llvm::AnalysisManager<llvm::Function>&) (bin/clang+0x35d1552)
#19 0x000055bfbd9371c9 llvm::ModuleToFunctionPassAdaptor::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (bin/clang+0x81371c9)
#20 0x000055bfb8dcc352 llvm::detail::PassModel<llvm::Module, llvm::ModuleToFunctionPassAdaptor, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (bin/clang+0x35cc352)
#21 0x000055bfbd933e89 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (bin/clang+0x8133e89)
#22 0x000055bfb8f747f1 llvm::lto::opt(llvm::lto::Config const&, llvm::TargetMachine*, unsigned int, llvm::Module&, bool, llvm::ModuleSummaryIndex*, llvm::ModuleSummaryIndex const*, std::__u::vector<unsigned char, std::__u::allocator<unsigned char>> const&) (bin/clang+0x37747f1)
#23 0x000055bfb8f76685 llvm::lto::thinBackend(llvm::lto::Config const&, unsigned int, std::__u::function<llvm::Expected<std::__u::unique_ptr<llvm::CachedFileStream, std::__u::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::Module&, llvm::ModuleSummaryIndex const&, llvm::DenseMap<llvm::StringRef, std::__u::unordered_map<unsigned long, llvm::GlobalValueSummary::ImportKind, std::__u::hash<unsigned long>, std::__u::equal_to<unsigned long>, std::__u::allocator<std::__u::pair<unsigned long const, llvm::GlobalValueSummary::ImportKind>>>, llvm::DenseMapInfo<llvm::StringRef, void>, llvm::detail::DenseMapPair<llvm::StringRef, std::__u::unordered_map<unsigned long, llvm::GlobalValueSummary::ImportKind, std::__u::hash<unsigned long>, std::__u::equal_to<unsigned long>, std::__u::allocator<std::__u::pair<unsigned long const, llvm::GlobalValueSummary::ImportKind>>>>> const&, llvm::DenseMap<unsigned long, llvm::GlobalValueSummary*, llvm::DenseMapInfo<unsigned long, void>, llvm::detail::DenseMapPair<unsigned long, llvm::GlobalValueSummary*>> const&, llvm::MapVector<llvm::StringRef, llvm::BitcodeModule, llvm::DenseMap<llvm::StringRef, unsigned int, llvm::DenseMapInfo<llvm::StringRef, void>, llvm::detail::DenseMapPair<llvm::StringRef, unsigned int>>, llvm::SmallVector<std::__u::pair<llvm::StringRef, llvm::BitcodeModule>, 0u>>*, std::__u::vector<unsigned char, std::__u::allocator<unsigned char>> const&)::$_0::operator()(llvm::Module&, llvm::TargetMachine*, std::__u::unique_ptr<llvm::ToolOutputFile, std::__u::default_delete<llvm::ToolOutputFile>>) const (bin/clang+0x3776685)
#24 0x000055bfb8f76516 llvm::lto::thinBackend(llvm::lto::Config const&, unsigned int, std::__u::function<llvm::Expected<std::__u::unique_ptr<llvm::CachedFileStream, std::__u::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::Module&, llvm::ModuleSummaryIndex const&, llvm::DenseMap<llvm::StringRef, std::__u::unordered_map<unsigned long, llvm::GlobalValueSummary::ImportKind, std::__u::hash<unsigned long>, std::__u::equal_to<unsigned long>, std::__u::allocator<std::__u::pair<unsigned long const, llvm::GlobalValueSummary::ImportKind>>>, llvm::DenseMapInfo<llvm::StringRef, void>, llvm::detail::DenseMapPair<llvm::StringRef, std::__u::unordered_map<unsigned long, llvm::GlobalValueSummary::ImportKind, std::__u::hash<unsigned long>, std::__u::equal_to<unsigned long>, std::__u::allocator<std::__u::pair<unsigned long const, llvm::GlobalValueSummary::ImportKind>>>>> const&, llvm::DenseMap<unsigned long, llvm::GlobalValueSummary*, llvm::DenseMapInfo<unsigned long, void>, llvm::detail::DenseMapPair<unsigned long, llvm::GlobalValueSummary*>> const&, llvm::MapVector<llvm::StringRef, llvm::BitcodeModule, llvm::DenseMap<llvm::StringRef, unsigned int, llvm::DenseMapInfo<llvm::StringRef, void>, llvm::detail::DenseMapPair<llvm::StringRef, unsigned int>>, llvm::SmallVector<std::__u::pair<llvm::StringRef, llvm::BitcodeModule>, 0u>>*, std::__u::vector<unsigned char, std::__u::allocator<unsigned char>> const&) (bin/clang+0x3776516)
#25 0x000055bfb8dc1279 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::StringRef, llvm::Module*, clang::BackendAction, llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem>, std::__u::unique_ptr<llvm::raw_pwrite_stream, std::__u::default_delete<llvm::raw_pwrite_stream>>, clang::BackendConsumer*) (bin/clang+0x35c1279)
#26 0x000055bfb8a91173 clang::CodeGenAction::ExecuteAction() (bin/clang+0x3291173)
#27 0x000055bfb964b15a clang::FrontendAction::Execute() (bin/clang+0x3e4b15a)
#28 0x000055bfb95c2484 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (bin/clang+0x3dc2484)
#29 0x000055bfb87573cf clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (bin/clang+0x2f573cf)
#30 0x000055bfb874c526 cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (bin/clang+0x2f4c526)

In our case it's only reproducible when compiling internal code with thinlto, which makes producing an isolated test case more problematic.

@alexey-bataev
Copy link
Member Author

Need a reproducer!

@alexfh
Copy link
Contributor

alexfh commented Jul 12, 2024

Just got to this, which crashes opt -passes=slp-vectorizer:

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-generic-linux-gnu"

define void @_h() #0 {
entry:
  %arrayidx51.1.i = getelementptr i8, ptr null, i64 4
  %retval.sroa.3.0.insert.ext.i.i = zext i8 0 to i32
  %retval.sroa.2.0.insert.ext.i.i = zext i8 0 to i32
  br label %cond.end.i

cond.end.i:                                       ; preds = %entry
  %add46.i30 = or i32 0, %retval.sroa.2.0.insert.ext.i.i
  %add49.i = or i32 0, %retval.sroa.3.0.insert.ext.i.i
    #dbg_value(i32 %add49.i, !8, !DIExpression(), !16)
  %0 = tail call i32 @llvm.umin.i32(i32 %add46.i30, i32 0)
  %cmp.i14.i.i.i = icmp slt i32 %add49.i, 0
  %block_color.sroa.7.0.insert.ext.i = select i1 %cmp.i14.i.i.i, i32 0, i32 0
  %block_color.sroa.7.0.insert.shift.i = shl i32 %block_color.sroa.7.0.insert.ext.i, 16
  %block_color.sroa.5.0.insert.ext.i = select i1 false, i32 0, i32 %0
  %block_color.sroa.5.0.insert.shift.i = shl i32 %block_color.sroa.5.0.insert.ext.i, 0
  %block_color.sroa.7.0.insert.insert.i = or i32 %block_color.sroa.7.0.insert.shift.i, %block_color.sroa.5.0.insert.shift.i
  %block_color.sroa.5.0.insert.insert.i = or i32 %block_color.sroa.7.0.insert.insert.i, 0
  %block_color.sroa.0.0.insert.insert.i = or i32 %block_color.sroa.5.0.insert.insert.i, 0
  store i32 %block_color.sroa.0.0.insert.insert.i, ptr null, align 4
  %add46.1.i = or i32 0, %retval.sroa.2.0.insert.ext.i.i
  %add49.1.i = or i32 0, %retval.sroa.3.0.insert.ext.i.i
  %cmp.i11.i.i.1.i = icmp slt i32 %add46.1.i, 0
  %1 = tail call i32 @llvm.umin.i32(i32 %add49.1.i, i32 0)
  %block_color.sroa.7.0.insert.ext.1.i = select i1 false, i32 0, i32 %1
  %block_color.sroa.7.0.insert.shift.1.i = shl i32 %block_color.sroa.7.0.insert.ext.1.i, 16
  %block_color.sroa.5.0.insert.ext.1.i = select i1 %cmp.i11.i.i.1.i, i32 0, i32 0
  %block_color.sroa.5.0.insert.shift.1.i = shl i32 %block_color.sroa.5.0.insert.ext.1.i, 0
  %block_color.sroa.7.0.insert.insert.1.i = or i32 %block_color.sroa.7.0.insert.shift.1.i, %block_color.sroa.5.0.insert.shift.1.i
  %block_color.sroa.5.0.insert.insert.1.i = or i32 %block_color.sroa.7.0.insert.insert.1.i, 0
  %block_color.sroa.0.0.insert.insert.1.i = or i32 %block_color.sroa.5.0.insert.insert.1.i, 0
  store i32 %block_color.sroa.0.0.insert.insert.1.i, ptr %arrayidx51.1.i, align 4
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.umin.i32(i32, i32) #1

; uselistorder directives
uselistorder ptr @llvm.umin.i32, { 1, 0 }

attributes #0 = { "target-features"="+aes,+cmov,+crc32,+cx16,+cx8,+fxsr,+mmx,+pclmul,+popcnt,+prfchw,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.dbg.cu = !{!0, !3, !5}
!llvm.module.flags = !{!7}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang 9ddfe62f5c11e3f65f444209f514029ded2d58b9", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "q.pic.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: GNU)
!1 = !DIFile(filename: "q.cpp", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "7f405f2fa7a85732c62eeb5ad41e9a99")
!2 = !{}
!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !4, producer: "clang 9ddfe62f5c11e3f65f444209f514029ded2d58b9", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "q.pic.dwo", emissionKind: FullDebug, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: GNU)
!4 = !DIFile(filename: "q.cpp", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "b1f6e06104b120054aa37f5011b1165d")
!5 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !6, producer: "clang 9ddfe62f5c11e3f65f444209f514029ded2d58b9", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "q.pic.dwo", emissionKind: FullDebug, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: GNU)
!6 = !DIFile(filename: "q.cpp", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "1a3869a68f60205ccd08e442850492ac")
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !DILocalVariable(name: "sb", arg: 4, scope: !9, file: !10, line: 813, type: !15)
!9 = distinct !DISubprogram(name: "color_rgba", linkageName: "_i", scope: !11, file: !10, line: 813, type: !13, scopeLine: 814, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !14, retainedNodes: !2)
!10 = !DIFile(filename: "q.h", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "798e10c1d58d4180e9266db9b9a5137b")
!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "color_rgba", scope: !12, file: !10, line: 763, size: 32, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !2, identifier: "_ZTSN6basisu10color_rgbaE")
!12 = !DINamespace(name: "basisu", scope: null)
!13 = distinct !DISubroutineType(types: !2)
!14 = !DISubprogram(name: "color_rgba", scope: !11, file: !10, line: 813, type: !13, scopeLine: 813, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized)
!15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!16 = !DILocation(line: 0, scope: !9, inlinedAt: !17)
!17 = distinct !DILocation(line: 1320, column: 16, scope: !18, inlinedAt: !29)
!18 = distinct !DILexicalBlock(scope: !19, file: !1, line: 1318, column: 4)
!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 1317, column: 4)
!20 = distinct !DILexicalBlock(scope: !21, file: !1, line: 1317, column: 4)
!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 1312, column: 3)
!22 = distinct !DILexicalBlock(scope: !23, file: !1, line: 1311, column: 3)
!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 1311, column: 3)
!24 = distinct !DISubprogram(name: "evaluate_solution_fast", linkageName: "_f", scope: !25, file: !1, line: 1276, type: !27, scopeLine: 1277, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !28, retainedNodes: !2)
!25 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "u", scope: !12, file: !26, line: 924, size: 3072, flags: DIFlagFwdDecl | DIFlagNonTrivial, identifier: "_j")
!26 = !DIFile(filename: "q.h", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "fd8326e7945b2062220a64fd8351349c")
!27 = distinct !DISubroutineType(types: !2)
!28 = !DISubprogram(name: "evaluate_solution_fast", linkageName: "_f", scope: !25, file: !26, line: 1089, type: !27, scopeLine: 1089, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
!29 = distinct !DILocation(line: 1096, column: 12, scope: !30, inlinedAt: !33)
!30 = distinct !DILexicalBlock(scope: !31, file: !26, line: 1093, column: 8)
!31 = distinct !DISubprogram(name: "evaluate_solution", linkageName: "_g", scope: !25, file: !26, line: 1091, type: !27, scopeLine: 1092, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !32, retainedNodes: !2)
!32 = !DISubprogram(name: "evaluate_solution", linkageName: "_g", scope: !25, file: !26, line: 1091, type: !27, scopeLine: 1091, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
!33 = distinct !DILocation(line: 990, column: 4, scope: !34)
!34 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 2)
!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 958, column: 3)
!36 = distinct !DILexicalBlock(scope: !37, file: !1, line: 957, column: 3)
!37 = distinct !DILexicalBlock(scope: !38, file: !1, line: 957, column: 3)
!38 = distinct !DISubprogram(name: "y", linkageName: "_h", scope: !25, file: !1, line: 947, type: !39, scopeLine: 948, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !40, retainedNodes: !2)
!39 = distinct !DISubroutineType(types: !2)
!40 = !DISubprogram(name: "y", linkageName: "_h", scope: !25, file: !26, line: 1101, type: !39, scopeLine: 1101, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)

@alexey-bataev
Copy link
Member Author

Just got to this, which crashes opt -passes=slp-vectorizer:

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-generic-linux-gnu"

define void @_h() #0 {
entry:
  %arrayidx51.1.i = getelementptr i8, ptr null, i64 4
  %retval.sroa.3.0.insert.ext.i.i = zext i8 0 to i32
  %retval.sroa.2.0.insert.ext.i.i = zext i8 0 to i32
  br label %cond.end.i

cond.end.i:                                       ; preds = %entry
  %add46.i30 = or i32 0, %retval.sroa.2.0.insert.ext.i.i
  %add49.i = or i32 0, %retval.sroa.3.0.insert.ext.i.i
    #dbg_value(i32 %add49.i, !8, !DIExpression(), !16)
  %0 = tail call i32 @llvm.umin.i32(i32 %add46.i30, i32 0)
  %cmp.i14.i.i.i = icmp slt i32 %add49.i, 0
  %block_color.sroa.7.0.insert.ext.i = select i1 %cmp.i14.i.i.i, i32 0, i32 0
  %block_color.sroa.7.0.insert.shift.i = shl i32 %block_color.sroa.7.0.insert.ext.i, 16
  %block_color.sroa.5.0.insert.ext.i = select i1 false, i32 0, i32 %0
  %block_color.sroa.5.0.insert.shift.i = shl i32 %block_color.sroa.5.0.insert.ext.i, 0
  %block_color.sroa.7.0.insert.insert.i = or i32 %block_color.sroa.7.0.insert.shift.i, %block_color.sroa.5.0.insert.shift.i
  %block_color.sroa.5.0.insert.insert.i = or i32 %block_color.sroa.7.0.insert.insert.i, 0
  %block_color.sroa.0.0.insert.insert.i = or i32 %block_color.sroa.5.0.insert.insert.i, 0
  store i32 %block_color.sroa.0.0.insert.insert.i, ptr null, align 4
  %add46.1.i = or i32 0, %retval.sroa.2.0.insert.ext.i.i
  %add49.1.i = or i32 0, %retval.sroa.3.0.insert.ext.i.i
  %cmp.i11.i.i.1.i = icmp slt i32 %add46.1.i, 0
  %1 = tail call i32 @llvm.umin.i32(i32 %add49.1.i, i32 0)
  %block_color.sroa.7.0.insert.ext.1.i = select i1 false, i32 0, i32 %1
  %block_color.sroa.7.0.insert.shift.1.i = shl i32 %block_color.sroa.7.0.insert.ext.1.i, 16
  %block_color.sroa.5.0.insert.ext.1.i = select i1 %cmp.i11.i.i.1.i, i32 0, i32 0
  %block_color.sroa.5.0.insert.shift.1.i = shl i32 %block_color.sroa.5.0.insert.ext.1.i, 0
  %block_color.sroa.7.0.insert.insert.1.i = or i32 %block_color.sroa.7.0.insert.shift.1.i, %block_color.sroa.5.0.insert.shift.1.i
  %block_color.sroa.5.0.insert.insert.1.i = or i32 %block_color.sroa.7.0.insert.insert.1.i, 0
  %block_color.sroa.0.0.insert.insert.1.i = or i32 %block_color.sroa.5.0.insert.insert.1.i, 0
  store i32 %block_color.sroa.0.0.insert.insert.1.i, ptr %arrayidx51.1.i, align 4
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i32 @llvm.umin.i32(i32, i32) #1

; uselistorder directives
uselistorder ptr @llvm.umin.i32, { 1, 0 }

attributes #0 = { "target-features"="+aes,+cmov,+crc32,+cx16,+cx8,+fxsr,+mmx,+pclmul,+popcnt,+prfchw,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.dbg.cu = !{!0, !3, !5}
!llvm.module.flags = !{!7}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang 9ddfe62f5c11e3f65f444209f514029ded2d58b9", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "q.pic.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: GNU)
!1 = !DIFile(filename: "q.cpp", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "7f405f2fa7a85732c62eeb5ad41e9a99")
!2 = !{}
!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !4, producer: "clang 9ddfe62f5c11e3f65f444209f514029ded2d58b9", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "q.pic.dwo", emissionKind: FullDebug, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: GNU)
!4 = !DIFile(filename: "q.cpp", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "b1f6e06104b120054aa37f5011b1165d")
!5 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !6, producer: "clang 9ddfe62f5c11e3f65f444209f514029ded2d58b9", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "q.pic.dwo", emissionKind: FullDebug, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: GNU)
!6 = !DIFile(filename: "q.cpp", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "1a3869a68f60205ccd08e442850492ac")
!7 = !{i32 2, !"Debug Info Version", i32 3}
!8 = !DILocalVariable(name: "sb", arg: 4, scope: !9, file: !10, line: 813, type: !15)
!9 = distinct !DISubprogram(name: "color_rgba", linkageName: "_i", scope: !11, file: !10, line: 813, type: !13, scopeLine: 814, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !14, retainedNodes: !2)
!10 = !DIFile(filename: "q.h", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "798e10c1d58d4180e9266db9b9a5137b")
!11 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "color_rgba", scope: !12, file: !10, line: 763, size: 32, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !2, identifier: "_ZTSN6basisu10color_rgbaE")
!12 = !DINamespace(name: "basisu", scope: null)
!13 = distinct !DISubroutineType(types: !2)
!14 = !DISubprogram(name: "color_rgba", scope: !11, file: !10, line: 813, type: !13, scopeLine: 813, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized)
!15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!16 = !DILocation(line: 0, scope: !9, inlinedAt: !17)
!17 = distinct !DILocation(line: 1320, column: 16, scope: !18, inlinedAt: !29)
!18 = distinct !DILexicalBlock(scope: !19, file: !1, line: 1318, column: 4)
!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 1317, column: 4)
!20 = distinct !DILexicalBlock(scope: !21, file: !1, line: 1317, column: 4)
!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 1312, column: 3)
!22 = distinct !DILexicalBlock(scope: !23, file: !1, line: 1311, column: 3)
!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 1311, column: 3)
!24 = distinct !DISubprogram(name: "evaluate_solution_fast", linkageName: "_f", scope: !25, file: !1, line: 1276, type: !27, scopeLine: 1277, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !28, retainedNodes: !2)
!25 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "u", scope: !12, file: !26, line: 924, size: 3072, flags: DIFlagFwdDecl | DIFlagNonTrivial, identifier: "_j")
!26 = !DIFile(filename: "q.h", directory: "/proc/self/cwd", checksumkind: CSK_MD5, checksum: "fd8326e7945b2062220a64fd8351349c")
!27 = distinct !DISubroutineType(types: !2)
!28 = !DISubprogram(name: "evaluate_solution_fast", linkageName: "_f", scope: !25, file: !26, line: 1089, type: !27, scopeLine: 1089, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
!29 = distinct !DILocation(line: 1096, column: 12, scope: !30, inlinedAt: !33)
!30 = distinct !DILexicalBlock(scope: !31, file: !26, line: 1093, column: 8)
!31 = distinct !DISubprogram(name: "evaluate_solution", linkageName: "_g", scope: !25, file: !26, line: 1091, type: !27, scopeLine: 1092, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !32, retainedNodes: !2)
!32 = !DISubprogram(name: "evaluate_solution", linkageName: "_g", scope: !25, file: !26, line: 1091, type: !27, scopeLine: 1091, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
!33 = distinct !DILocation(line: 990, column: 4, scope: !34)
!34 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 2)
!35 = distinct !DILexicalBlock(scope: !36, file: !1, line: 958, column: 3)
!36 = distinct !DILexicalBlock(scope: !37, file: !1, line: 957, column: 3)
!37 = distinct !DILexicalBlock(scope: !38, file: !1, line: 957, column: 3)
!38 = distinct !DISubprogram(name: "y", linkageName: "_h", scope: !25, file: !1, line: 947, type: !39, scopeLine: 948, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !40, retainedNodes: !2)
!39 = distinct !DISubroutineType(types: !2)
!40 = !DISubprogram(name: "y", linkageName: "_h", scope: !25, file: !26, line: 1101, type: !39, scopeLine: 1101, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)

Thanks, will fix ASAP

@alexey-bataev
Copy link
Member Author

Fixed in 9e261c5

@alexfh
Copy link
Contributor

alexfh commented Jul 12, 2024

Thanks! I'll let you know if I see any more related issues.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

9 participants