
Commit bbd52dd

[SLP]Remove operands upon marking instruction for deletion.

If an instruction is marked for deletion, it is better to drop all of its operands and mark them for deletion too (when allowed). This enables more vectorizable patterns and generates fewer useless extractelement instructions.

Reviewers: RKSimon
Reviewed By: RKSimon
Pull Request: llvm#97409

1 parent d5f5dc9, commit bbd52dd

21 files changed (+93 / -70 lines)
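A hedged illustration of the extractelement point (hypothetical IR, not taken from this patch's tests; %ld and %mul are illustration names): once a vectorized scalar such as %mul is only marked for deletion, it previously kept its use of %ld alive until the delayed cleanup, so further vectorization attempts saw %ld as still having a scalar user and emitted an extractelement to feed it. With %mul's operands replaced by poison, %ld becomes trivially dead and is marked for deletion along with it.

  %ld  = load i32, ptr %p   ; becomes trivially dead once %mul drops its operands
  %mul = mul i32 %ld, 3     ; vectorized scalar, marked for deletion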

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+44 / -16)
@@ -1163,6 +1163,12 @@ class BoUpSLP {
     return VectorizableTree.front()->Scalars;
   }

+  /// Checks if the root graph node can be emitted with narrower bitwidth at
+  /// codegen and returns its signedness, if so.
+  bool isSignedMinBitwidthRootNode() const {
+    return MinBWs.at(VectorizableTree.front().get()).second;
+  }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -2430,6 +2436,21 @@ class BoUpSLP {
     DeletedInstructions.insert(I);
   }

+  /// Clear the operands of \p I, marking for deletion trivially dead operands.
+  void clearOperands(Instruction *I, const TreeEntry *Entry = nullptr) {
+    for (unsigned Idx : seq<unsigned>(I->getNumOperands())) {
+      // Ignore pointer operand of stores to keep correct DIAssignID.
+      if (isa<StoreInst>(I) && Idx == 1)
+        continue;
+      Value *Op = I->getOperand(Idx);
+      I->setOperand(Idx, PoisonValue::get(Op->getType()));
+      if (auto *OpI = dyn_cast<Instruction>(Op))
+        if (!isDeleted(OpI) && isInstructionTriviallyDead(OpI, TLI) &&
+            (!Entry || Entry->VectorizedValue != OpI))
+          eraseInstruction(OpI);
+    }
+  }
+
   /// Checks if the instruction was already analyzed for being possible
   /// reduction root.
   bool isAnalyzedReductionRoot(Instruction *I) const {
@@ -3795,7 +3816,7 @@ class BoUpSLP {

   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
-  void scheduleBlock(BlockScheduling *BS);
+  void scheduleBlock(BlockScheduling *BS, BoUpSLP &R);

   /// List of users to ignore during scheduling and that don't need extracting.
   const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
@@ -13524,7 +13545,7 @@ Value *BoUpSLP::vectorizeTree(
     Instruction *ReductionRoot) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    scheduleBlock(BSIter.second.get(), *this);
   }
   // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
   // need to rebuild it.
@@ -14064,11 +14085,14 @@ Value *BoUpSLP::vectorizeTree(
       }
 #endif
       LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
-      eraseInstruction(cast<Instruction>(Scalar));
+      auto *I = cast<Instruction>(Scalar);
+      // Clear the operands, marking for deletion trivially dead operands.
+      clearOperands(I, Entry);
+      eraseInstruction(I);
       // Retain to-be-deleted instructions for some debug-info
       // bookkeeping. NOTE: eraseInstruction only marks the instruction for
       // deletion - instructions are not deleted until later.
-      RemovedInsts.push_back(cast<Instruction>(Scalar));
+      RemovedInsts.push_back(I);
     }
   }

@@ -14681,6 +14705,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,

         for (; DepDest; DepDest = DepDest->NextLoadStore) {
           assert(isInSchedulingRegion(DepDest));
+          if (SLP->isDeleted(DepDest->Inst))
+            continue;

           // We have two limits to reduce the complexity:
           // 1) AliasedCheckLimit: It's a small limit to reduce calls to
@@ -14750,7 +14776,7 @@ void BoUpSLP::BlockScheduling::resetSchedule() {
   ReadyInsts.clear();
 }

-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+void BoUpSLP::scheduleBlock(BlockScheduling *BS, BoUpSLP &R) {
   if (!BS->ScheduleStart)
     return;

@@ -14807,6 +14833,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
     for (ScheduleData *BundleMember = Picked; BundleMember;
          BundleMember = BundleMember->NextInBundle) {
       Instruction *PickedInst = BundleMember->Inst;
+      if (R.isDeleted(PickedInst))
+        continue;
       if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
         PickedInst->moveAfter(LastScheduledInst->getPrevNode());
       LastScheduledInst = PickedInst;
@@ -17344,14 +17372,11 @@ class HorizontalReduction {
         Value *ReducedSubTree =
             emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
         if (ReducedSubTree->getType() != VL.front()->getType()) {
-          ReducedSubTree = Builder.CreateIntCast(
-              ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
-                KnownBits Known = computeKnownBits(
-                    R, cast<Instruction>(ReductionOps.front().front())
-                           ->getModule()
-                           ->getDataLayout());
-                return !Known.isNonNegative();
-              }));
+          assert(ReducedSubTree->getType() != VL.front()->getType() &&
+                 "Expected different reduction type.");
+          ReducedSubTree =
+              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
+                                    V.isSignedMinBitwidthRootNode());
         }

         // Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17513,10 +17538,13 @@ class HorizontalReduction {
         }
 #endif
         if (!Ignore->use_empty()) {
-          Value *Undef = UndefValue::get(Ignore->getType());
-          Ignore->replaceAllUsesWith(Undef);
+          Value *P = PoisonValue::get(Ignore->getType());
+          Ignore->replaceAllUsesWith(P);
         }
-        V.eraseInstruction(cast<Instruction>(Ignore));
+        auto *I = cast<Instruction>(Ignore);
+        // Clear the operands, marking for deletion trivially dead operands.
+        V.clearOperands(I);
+        V.eraseInstruction(I);
       }
     }
   } else if (!CheckForReusedReductionOps) {
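The HorizontalReduction change above replaces a per-element known-bits recomputation with the signedness that MinBWs already recorded for the root node. A hedged IR-level illustration (hypothetical values, assuming a reduction narrowed from i32 to i8) of why the flag passed to CreateIntCast matters:

  %red    = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
  %wide.s = sext i8 %red to i32   ; emitted when the root node is signed
  %wide.u = zext i8 %red to i32   ; emitted when it is unsigned

Reading the flag via isSignedMinBitwidthRootNode() keeps the cast consistent with the decision made when the tree was narrowed, instead of re-deriving it from known bits at reduction time.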

llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll (+2 / -2)

@@ -503,10 +503,10 @@ define void @add_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -522,10 +522,10 @@ define void @add_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;

llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll (+1 / -1)

@@ -401,10 +401,10 @@ define void @add_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;

llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll (+2 / -2)

@@ -439,10 +439,10 @@ define void @add_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -458,10 +458,10 @@ define void @add_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;

llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll (+4 / -4)

@@ -520,10 +520,10 @@ define void @smul_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -539,10 +539,10 @@ define void @smul_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
@@ -1323,10 +1323,10 @@ define void @umul_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -1342,10 +1342,10 @@ define void @umul_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;

llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll (+1 / -1)

@@ -480,10 +480,10 @@ define void @fshl_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
