diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 11f9ad70dc725..bba80c3f675d7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1163,6 +1163,12 @@ class BoUpSLP {
     return VectorizableTree.front()->Scalars;
   }
 
+  /// Checks if the root graph node can be emitted with narrower bitwidth at
+  /// codegen and returns its signedness, if so.
+  bool isSignedMinBitwidthRootNode() const {
+    return MinBWs.at(VectorizableTree.front().get()).second;
+  }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -2430,6 +2436,21 @@ class BoUpSLP {
     DeletedInstructions.insert(I);
   }
 
+  /// Clear the operands of \p I, marking for deletion trivially dead operands.
+  void clearOperands(Instruction *I, const TreeEntry *Entry = nullptr) {
+    for (unsigned Idx : seq(I->getNumOperands())) {
+      // Ignore pointer operand of stores to keep correct DIAssignID.
+      if (isa<StoreInst>(I) && Idx == 1)
+        continue;
+      Value *Op = I->getOperand(Idx);
+      I->setOperand(Idx, PoisonValue::get(Op->getType()));
+      if (auto *OpI = dyn_cast<Instruction>(Op))
+        if (!isDeleted(OpI) && isInstructionTriviallyDead(OpI, TLI) &&
+            (!Entry || Entry->VectorizedValue != OpI))
+          eraseInstruction(OpI);
+    }
+  }
+
   /// Checks if the instruction was already analyzed for being possible
   /// reduction root.
   bool isAnalyzedReductionRoot(Instruction *I) const {
@@ -3795,7 +3816,7 @@ class BoUpSLP {
 
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
-  void scheduleBlock(BlockScheduling *BS);
+  void scheduleBlock(BlockScheduling *BS, BoUpSLP &R);
 
   /// List of users to ignore during scheduling and that don't need extracting.
   const SmallDenseSet<Value *, 4> *UserIgnoreList = nullptr;
@@ -13524,7 +13545,7 @@ Value *BoUpSLP::vectorizeTree(
     Instruction *ReductionRoot) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    scheduleBlock(BSIter.second.get(), *this);
   }
   // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
   // need to rebuild it.
@@ -14064,11 +14085,14 @@ Value *BoUpSLP::vectorizeTree(
       }
 #endif
       LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
-      eraseInstruction(cast<Instruction>(Scalar));
+      auto *I = cast<Instruction>(Scalar);
+      // Clear the operands, marking for deletion trivially dead operands.
+      clearOperands(I, Entry);
+      eraseInstruction(I);
       // Retain to-be-deleted instructions for some debug-info
       // bookkeeping. NOTE: eraseInstruction only marks the instruction for
       // deletion - instructions are not deleted until later.
-      RemovedInsts.push_back(cast<Instruction>(Scalar));
+      RemovedInsts.push_back(I);
     }
   }
 
@@ -14681,6 +14705,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
         for (; DepDest; DepDest = DepDest->NextLoadStore) {
           assert(isInSchedulingRegion(DepDest));
 
+          if (SLP->isDeleted(DepDest->Inst))
+            continue;
           // We have two limits to reduce the complexity:
           // 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -14750,7 +14776,7 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   ReadyInsts.clear();
 }
 
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(BlockScheduling *BS, BoUpSLP &R) {
   if (!BS->ScheduleStart)
     return;
 
@@ -14807,6 +14833,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     for (ScheduleData *BundleMember = Picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
       Instruction *PickedInst = BundleMember->Inst;
+      if (R.isDeleted(PickedInst))
+        continue;
       if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
@@ -17344,14 +17372,11 @@ class HorizontalReduction {
         Value *ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
        if (ReducedSubTree->getType() != VL.front()->getType()) {
-         ReducedSubTree = Builder.CreateIntCast(
-             ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
-               KnownBits Known = computeKnownBits(
-                   R, cast<Instruction>(ReductionOps.front().front())
-                          ->getModule()
-                          ->getDataLayout());
-               return !Known.isNonNegative();
-             }));
+         assert(ReducedSubTree->getType() != VL.front()->getType() &&
+                "Expected different reduction type.");
+         ReducedSubTree =
+             Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
+                                   V.isSignedMinBitwidthRootNode());
        }
 
        // Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17513,10 +17538,13 @@ class HorizontalReduction {
         }
 #endif
         if (!Ignore->use_empty()) {
-          Value *Undef = UndefValue::get(Ignore->getType());
-          Ignore->replaceAllUsesWith(Undef);
+          Value *P = PoisonValue::get(Ignore->getType());
+          Ignore->replaceAllUsesWith(P);
         }
-        V.eraseInstruction(cast<Instruction>(Ignore));
+        auto *I = cast<Instruction>(Ignore);
+        // Clear the operands, marking for deletion trivially dead operands.
+ V.clearOperands(I); + V.eraseInstruction(I); } } } else if (!CheckForReusedReductionOps) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll index 24c5fcb068086..8c4903dbc92bb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -503,10 +503,10 @@ define void @add_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -522,10 +522,10 @@ define void @add_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll index fab022d691c07..cb8d45b1a21a2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -401,10 +401,10 @@ define void @add_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 
48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll index dafed43e6e71c..a7ae2d9e02ff4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -439,10 +439,10 @@ define void @add_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -458,10 +458,10 @@ define void @add_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]] -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll index e4c76daddb02e..d4eafdeb50a47 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll @@ -520,10 +520,10 @@ define void @smul_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x 
i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -539,10 +539,10 @@ define void @smul_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; @@ -1323,10 +1323,10 @@ define void @umul_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -1342,10 +1342,10 @@ define void @umul_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, 
i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll index 9b8480cd0088a..16977c025e3ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll @@ -480,10 +480,10 @@ define void @fshl_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll index daf28b9a0bb4d..609a9024e5bf7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll @@ -575,21 +575,21 @@ define void @fshl_v64i8() { ; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 ; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1 ; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]]) +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1 ; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]]) +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP9]], <16 x 
i8> [[TMP10]], <16 x i8> [[TMP13]]) -; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <16 x i8> [[TMP14]]) -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1 -; SSE-NEXT: store <16 x i8> [[TMP15]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i8> [[TMP15]]) ; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -598,11 +598,11 @@ define void @fshl_v64i8() { ; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 ; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr @c8, align 1 ; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1 ; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; AVX-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; AVX-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1 ; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll index f3e73d0e6840e..090a9daa6a113 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll @@ -480,10 +480,10 @@ define void @fshr_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> 
[[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll index fb7532768c4b3..3dc7d164f5bc9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll @@ -575,21 +575,21 @@ define void @fshr_v64i8() { ; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1 ; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1 ; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]]) +; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1 ; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]]) +; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1 ; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 -; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 -; SSE-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP13]]) -; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <16 x i8> [[TMP14]]) -; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1 -; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1 -; SSE-NEXT: store <16 x i8> [[TMP15]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 +; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP15:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 +; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i8> [[TMP15]]) ; SSE-NEXT: store <16 x i8> [[TMP16]], ptr 
getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -598,11 +598,11 @@ define void @fshr_v64i8() { ; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1 ; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr @c8, align 1 ; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i8> [[TMP3]]) +; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1 ; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; AVX-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; AVX-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> [[TMP7]]) -; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1 ; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll index 94976a8cdee25..51cf32242bfdf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -528,10 +528,10 @@ define void @mul_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -547,10 +547,10 @@ define void @mul_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; @@ -566,10 +566,10 @@ define void @mul_v64i8() { ; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; AVX128-NEXT: [[TMP8:%.*]] = load <16 x 
i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; AVX128-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; AVX128-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] -; AVX128-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; AVX128-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; AVX128-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll index c63b672f4187c..dd76992c2570b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll @@ -385,10 +385,10 @@ define void @smax_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll index 826f97f2a2d89..678477fa1e397 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll @@ -385,10 +385,10 @@ define void @smin_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll index afaab8b8ca642..65e2a011cc9a1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -503,10 +503,10 @@ define void @sub_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -522,10 +522,10 @@ define void @sub_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll index 3510863c88930..18df499c6646e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -401,10 +401,10 @@ define void @sub_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> 
[[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll index be54c1e04ca39..9d34edbb506c0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -439,10 +439,10 @@ define void @sub_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; @@ -458,10 +458,10 @@ define void @sub_v64i8() { ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SLM-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]] -; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SLM-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll index 3a187930055f0..a3f2b97a08a6e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll @@ -385,10 +385,10 @@ define void @umax_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x 
i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll index 15119a9628067..0c7688345ac48 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll @@ -385,10 +385,10 @@ define void @umin_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index e3dc67558af02..f036801865048 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -810,14 +810,9 @@ define float @extra_args_same_several_times(ptr nocapture readonly %x, i32 %a, i ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 ; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) -; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], -; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP3]], -; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01 +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]] ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll index 51798deae694a..88aafb2bf148b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -464,10 +464,10 @@ define void 
@ashr_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll index 7583561bbecf9..96977cd4fb7d7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -413,10 +413,10 @@ define void @lshr_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll index 5ec327c131fb7..789316ab33c43 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -461,10 +461,10 @@ define void @shl_v64i8() { ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1 ; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1 ; SSE-NEXT: ret void ;