Outline X86 autoupgrade patterns #97851
Conversation
@llvm/pr-subscribers-llvm-ir

Author: Justin Holewinski (jholewinski)

Changes

Outlining these patterns has a significant impact on the overall stack frame size of llvm::UpgradeIntrinsicCall. This is helpful for scenarios where compilation threads are stack-constrained. The overall impact is low when using clang as the host compiler, but very pronounced when using MSVC 2022 with release builds.

Clang: 1,624 -> 824 bytes
MSVC: 23,560 -> 6,120 bytes

Patch is 167.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97851.diff

1 file affected:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 5beefaa1ec701..de27b6848b20a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -2245,6 +2245,1698 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) {
}
}
+static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
+ IRBuilder<> &Builder) {
+ LLVMContext &C = F->getContext();
+ Value *Rep = nullptr;
+
+ if (Name.starts_with("sse4a.movnt.")) {
+ SmallVector<Metadata *, 1> Elts;
+ Elts.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+ MDNode *Node = MDNode::get(C, Elts);
+
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ // Nontemporal (unaligned) store of the 0'th element of the float/double
+ // vector.
+ Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType();
+ PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy);
+ Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast");
+ Value *Extract =
+ Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");
+
+ StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, Align(1));
+ SI->setMetadata(LLVMContext::MD_nontemporal, Node);
+ } else if ((Name.starts_with("avx.movnt.") ||
+ Name.starts_with("avx512.storent."))) {
+ SmallVector<Metadata *, 1> Elts;
+ Elts.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+ MDNode *Node = MDNode::get(C, Elts);
+
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ // Convert the type of the pointer to a pointer to the stored type.
+ Value *BC = Builder.CreateBitCast(
+ Arg0, PointerType::getUnqual(Arg1->getType()), "cast");
+ StoreInst *SI = Builder.CreateAlignedStore(
+ Arg1, BC,
+ Align(Arg1->getType()->getPrimitiveSizeInBits().getFixedValue() / 8));
+ SI->setMetadata(LLVMContext::MD_nontemporal, Node);
+ } else if (Name == "sse2.storel.dq") {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2);
+ Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
+ Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0);
+ Value *BC = Builder.CreateBitCast(
+ Arg0, PointerType::getUnqual(Elt->getType()), "cast");
+ Builder.CreateAlignedStore(Elt, BC, Align(1));
+ } else if ((Name.starts_with("sse.storeu.") ||
+ Name.starts_with("sse2.storeu.") ||
+ Name.starts_with("avx.storeu."))) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+
+ Arg0 = Builder.CreateBitCast(Arg0, PointerType::getUnqual(Arg1->getType()),
+ "cast");
+ Builder.CreateAlignedStore(Arg1, Arg0, Align(1));
+ } else if (Name == "avx512.mask.store.ss") {
+ Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1));
+ upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+ Mask, false);
+ } else if (Name.starts_with("avx512.mask.store")) {
+ // "avx512.mask.storeu." or "avx512.mask.store."
+ bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
+ upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), Aligned);
+ } else if ((Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp"))) {
+ // Upgrade packed integer vector compare intrinsics to compare instructions.
+ // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt."
+ bool CmpEq = Name[9] == 'e';
+ Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
+ CI->getArgOperand(0), CI->getArgOperand(1));
+ Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+ } else if ((Name.starts_with("avx512.broadcastm"))) {
+ Type *ExtTy = Type::getInt32Ty(C);
+ if (CI->getOperand(0)->getType()->isIntegerTy(8))
+ ExtTy = Type::getInt64Ty(C);
+ unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() /
+ ExtTy->getPrimitiveSizeInBits();
+ Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
+ Rep = Builder.CreateVectorSplat(NumElts, Rep);
+ } else if ((Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd")) {
+ Value *Vec = CI->getArgOperand(0);
+ Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0);
+ Function *Intr = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt,
+ Elt0->getType());
+ Elt0 = Builder.CreateCall(Intr, Elt0);
+ Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0);
+ } else if ((Name.starts_with("avx.sqrt.p") ||
+ Name.starts_with("sse2.sqrt.p") ||
+ Name.starts_with("sse.sqrt.p"))) {
+ Rep =
+ Builder.CreateCall(Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::sqrt, CI->getType()),
+ {CI->getArgOperand(0)});
+ } else if ((Name.starts_with("avx512.mask.sqrt.p"))) {
+ if (CI->arg_size() == 4 &&
+ (!isa<ConstantInt>(CI->getArgOperand(3)) ||
+ cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
+ Intrinsic::ID IID = Name[18] == 's' ? Intrinsic::x86_avx512_sqrt_ps_512
+ : Intrinsic::x86_avx512_sqrt_pd_512;
+
+ Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)};
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ } else {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::sqrt,
+ CI->getType()),
+ {CI->getArgOperand(0)});
+ }
+ Rep =
+ emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
+ } else if ((Name.starts_with("avx512.ptestm") ||
+ Name.starts_with("avx512.ptestnm"))) {
+ Value *Op0 = CI->getArgOperand(0);
+ Value *Op1 = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Rep = Builder.CreateAnd(Op0, Op1);
+ llvm::Type *Ty = Op0->getType();
+ Value *Zero = llvm::Constant::getNullValue(Ty);
+ ICmpInst::Predicate Pred = Name.starts_with("avx512.ptestm")
+ ? ICmpInst::ICMP_NE
+ : ICmpInst::ICMP_EQ;
+ Rep = Builder.CreateICmp(Pred, Rep, Zero);
+ Rep = applyX86MaskOn1BitsVec(Builder, Rep, Mask);
+ } else if ((Name.starts_with("avx512.mask.pbroadcast"))) {
+ unsigned NumElts = cast<FixedVectorType>(CI->getArgOperand(1)->getType())
+ ->getNumElements();
+ Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
+ Rep =
+ emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
+ } else if ((Name.starts_with("avx512.kunpck"))) {
+ unsigned NumElts = CI->getType()->getScalarSizeInBits();
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts);
+ int Indices[64];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+
+ // First extract half of each vector. This gives better codegen than
+ // doing it in a single shuffle.
+ LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
+ RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
+ // Concat the vectors.
+ // NOTE: Operands have to be swapped to match intrinsic definition.
+ Rep = Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (Name == "avx512.kand.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateAnd(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (Name == "avx512.kandn.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ LHS = Builder.CreateNot(LHS);
+ Rep = Builder.CreateAnd(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (Name == "avx512.kor.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateOr(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (Name == "avx512.kxor.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateXor(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (Name == "avx512.kxnor.w") {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ LHS = Builder.CreateNot(LHS);
+ Rep = Builder.CreateXor(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if (Name == "avx512.knot.w") {
+ Rep = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Rep = Builder.CreateNot(Rep);
+ Rep = Builder.CreateBitCast(Rep, CI->getType());
+ } else if ((Name == "avx512.kortestz.w" || Name == "avx512.kortestc.w")) {
+ Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+ Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+ Rep = Builder.CreateOr(LHS, RHS);
+ Rep = Builder.CreateBitCast(Rep, Builder.getInt16Ty());
+ Value *C;
+ if (Name[14] == 'c')
+ C = ConstantInt::getAllOnesValue(Builder.getInt16Ty());
+ else
+ C = ConstantInt::getNullValue(Builder.getInt16Ty());
+ Rep = Builder.CreateICmpEQ(Rep, C);
+ Rep = Builder.CreateZExt(Rep, Builder.getInt32Ty());
+ } else if ((Name == "sse.add.ss" || Name == "sse2.add.sd" ||
+ Name == "sse.sub.ss" || Name == "sse2.sub.sd" ||
+ Name == "sse.mul.ss" || Name == "sse2.mul.sd" ||
+ Name == "sse.div.ss" || Name == "sse2.div.sd")) {
+ Type *I32Ty = Type::getInt32Ty(C);
+ Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
+ ConstantInt::get(I32Ty, 0));
+ Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
+ ConstantInt::get(I32Ty, 0));
+ Value *EltOp;
+ if (Name.contains(".add."))
+ EltOp = Builder.CreateFAdd(Elt0, Elt1);
+ else if (Name.contains(".sub."))
+ EltOp = Builder.CreateFSub(Elt0, Elt1);
+ else if (Name.contains(".mul."))
+ EltOp = Builder.CreateFMul(Elt0, Elt1);
+ else
+ EltOp = Builder.CreateFDiv(Elt0, Elt1);
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), EltOp,
+ ConstantInt::get(I32Ty, 0));
+ } else if (Name.starts_with("avx512.mask.pcmp")) {
+ // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
+ bool CmpEq = Name[16] == 'e';
+ Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
+ } else if (Name.starts_with("avx512.mask.vpshufbitqmb.")) {
+ Type *OpTy = CI->getArgOperand(0)->getType();
+ unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+ Intrinsic::ID IID;
+ switch (VecWidth) {
+ default:
+ llvm_unreachable("Unexpected intrinsic");
+ case 128:
+ IID = Intrinsic::x86_avx512_vpshufbitqmb_128;
+ break;
+ case 256:
+ IID = Intrinsic::x86_avx512_vpshufbitqmb_256;
+ break;
+ case 512:
+ IID = Intrinsic::x86_avx512_vpshufbitqmb_512;
+ break;
+ }
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ {CI->getOperand(0), CI->getArgOperand(1)});
+ Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
+ } else if (Name.starts_with("avx512.mask.fpclass.p")) {
+ Type *OpTy = CI->getArgOperand(0)->getType();
+ unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+ unsigned EltWidth = OpTy->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_fpclass_ps_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_fpclass_ps_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_fpclass_ps_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_fpclass_pd_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_fpclass_pd_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_fpclass_pd_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ {CI->getOperand(0), CI->getArgOperand(1)});
+ Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
+ } else if (Name.starts_with("avx512.cmp.p")) {
+ SmallVector<Value *, 4> Args(CI->args());
+ Type *OpTy = Args[0]->getType();
+ unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+ unsigned EltWidth = OpTy->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
+ else if (VecWidth == 512 && EltWidth == 32)
+ IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
+ else if (VecWidth == 512 && EltWidth == 64)
+ IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Mask = Constant::getAllOnesValue(CI->getType());
+ if (VecWidth == 512)
+ std::swap(Mask, Args.back());
+ Args.push_back(Mask);
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ Args);
+ } else if (Name.starts_with("avx512.mask.cmp.")) {
+ // Integer compare intrinsics.
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
+ } else if (Name.starts_with("avx512.mask.ucmp.")) {
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ Rep = upgradeMaskedCompare(Builder, *CI, Imm, false);
+ } else if ((Name.starts_with("avx512.cvtb2mask.") ||
+ Name.starts_with("avx512.cvtw2mask.") ||
+ Name.starts_with("avx512.cvtd2mask.") ||
+ Name.starts_with("avx512.cvtq2mask."))) {
+ Value *Op = CI->getArgOperand(0);
+ Value *Zero = llvm::Constant::getNullValue(Op->getType());
+ Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero);
+ Rep = applyX86MaskOn1BitsVec(Builder, Rep, nullptr);
+ } else if ((Name == "ssse3.pabs.b.128" || Name == "ssse3.pabs.w.128" ||
+ Name == "ssse3.pabs.d.128" || Name.starts_with("avx2.pabs") ||
+ Name.starts_with("avx512.mask.pabs"))) {
+ Rep = upgradeAbs(Builder, *CI);
+ } else if ((Name == "sse41.pmaxsb" || Name == "sse2.pmaxs.w" ||
+ Name == "sse41.pmaxsd" || Name.starts_with("avx2.pmaxs") ||
+ Name.starts_with("avx512.mask.pmaxs"))) {
+ Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax);
+ } else if ((Name == "sse2.pmaxu.b" || Name == "sse41.pmaxuw" ||
+ Name == "sse41.pmaxud" || Name.starts_with("avx2.pmaxu") ||
+ Name.starts_with("avx512.mask.pmaxu"))) {
+ Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax);
+ } else if ((Name == "sse41.pminsb" || Name == "sse2.pmins.w" ||
+ Name == "sse41.pminsd" || Name.starts_with("avx2.pmins") ||
+ Name.starts_with("avx512.mask.pmins"))) {
+ Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin);
+ } else if ((Name == "sse2.pminu.b" || Name == "sse41.pminuw" ||
+ Name == "sse41.pminud" || Name.starts_with("avx2.pminu") ||
+ Name.starts_with("avx512.mask.pminu"))) {
+ Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin);
+ } else if ((Name == "sse2.pmulu.dq" || Name == "avx2.pmulu.dq" ||
+ Name == "avx512.pmulu.dq.512" ||
+ Name.starts_with("avx512.mask.pmulu.dq."))) {
+ Rep = upgradePMULDQ(Builder, *CI, /*Signed*/ false);
+ } else if ((Name == "sse41.pmuldq" || Name == "avx2.pmul.dq" ||
+ Name == "avx512.pmul.dq.512" ||
+ Name.starts_with("avx512.mask.pmul.dq."))) {
+ Rep = upgradePMULDQ(Builder, *CI, /*Signed*/ true);
+ } else if ((Name == "sse.cvtsi2ss" || Name == "sse2.cvtsi2sd" ||
+ Name == "sse.cvtsi642ss" || Name == "sse2.cvtsi642sd")) {
+ Rep =
+ Builder.CreateSIToFP(CI->getArgOperand(1),
+ cast<VectorType>(CI->getType())->getElementType());
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+ } else if (Name == "avx512.cvtusi2sd") {
+ Rep =
+ Builder.CreateUIToFP(CI->getArgOperand(1),
+ cast<VectorType>(CI->getType())->getElementType());
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+ } else if (Name == "sse2.cvtss2sd") {
+ Rep = Builder.CreateExtractElement(CI->getArgOperand(1), (uint64_t)0);
+ Rep = Builder.CreateFPExt(
+ Rep, cast<VectorType>(CI->getType())->getElementType());
+ Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+ } else if ((Name == "sse2.cvtdq2pd" || Name == "sse2.cvtdq2ps" ||
+ Name == "avx.cvtdq2.pd.256" || Name == "avx.cvtdq2.ps.256" ||
+ Name.starts_with("avx512.mask.cvtdq2pd.") ||
+ Name.starts_with("avx512.mask.cvtudq2pd.") ||
+ Name.starts_with("avx512.mask.cvtdq2ps.") ||
+ Name.starts_with("avx512.mask.cvtudq2ps.") ||
+ Name.starts_with("avx512.mask.cvtqq2pd.") ||
+ Name.starts_with("avx512.mask.cvtuqq2pd.") ||
+ Name == "avx512.mask.cvtqq2ps.256" ||
+ Name == "avx512.mask.cvtqq2ps.512" ||
+ Name == "avx512.mask.cvtuqq2ps.256" ||
+ Name == "avx512.mask.cvtuqq2ps.512" || Name == "sse2.cvtps2pd" ||
+ Name == "avx.cvt.ps2.pd.256" ||
+ Name == "avx512.mask.cvtps2pd.128" ||
+ Name == "avx512.mask.cvtps2pd.256")) {
+ auto *DstTy = cast<FixedVectorType>(CI->getType());
+ Rep = CI->getArgOperand(0);
+ auto *SrcTy = cast<FixedVectorType>(Rep->getType());
+
+ unsigned NumDstElts = DstTy->getNumElements();
+ if (NumDstElts < SrcTy->getNumElements()) {
+ assert(NumDstElts == 2 && "Unexpected vector size");
+ Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef<int>{0, 1});
+ }
+
+ bool IsPS2PD = SrcTy->getElementType()->isFloatTy();
+ bool IsUnsigned = Name.contains("cvtu");
+ if (IsPS2PD)
+ Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
+ else if (CI->arg_size() == 4 &&
+ (!isa<ConstantInt>(CI->getArgOperand(3)) ||
+ cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
+ Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round
+ : Intrinsic::x86_avx512_sitofp_round;
+ Function *F =
+ Intrinsic::getDeclaration(CI->getModule(), IID, {DstTy, SrcTy});
+ Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)});
+ } else {
+ Rep = IsUnsigned ? Builder.CreateUIToFP(Rep, DstTy, "cvt")
+ : Builder.CreateSIToFP(Rep, DstTy, "cvt");
+ }
+
+ if (CI->arg_size() >= 3)
+ Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
+ } else if ((Name.starts_with("avx512.mask.vcvtph2ps.") ||
+ ...
[truncated]
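
For context on why the outlining above shrinks the frame: some compilers, MSVC in particular, reserve stack in the enclosing function's frame for the locals of every arm of a large if/else chain, even though only one arm executes per call. Outlining each arm into a separate non-inlined helper means an arm's locals occupy stack only while that helper is live. The sketch below is a minimal, self-contained illustration of the general technique, not the patch itself; every name in it (handleCaseA, handleCaseB, dispatch, the NOINLINE macro) is invented for the example. Where the compiler supports it, a flag such as -fstack-usage makes the per-function frame sizes easy to compare.

#include <cstdio>
#include <cstring>

// NOINLINE keeps the helpers out of the dispatcher's frame in this small
// example; it is part of the illustration, not taken from the patch.
#if defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((noinline))
#endif

// Each handler owns a large local buffer. Outlined, the 512 bytes are on
// the stack only while that handler is executing.
NOINLINE static int handleCaseA(const char *Arg) {
  char Buf[512];
  std::snprintf(Buf, sizeof(Buf), "A:%s", Arg);
  return static_cast<int>(std::strlen(Buf));
}

NOINLINE static int handleCaseB(const char *Arg) {
  char Buf[512];
  std::snprintf(Buf, sizeof(Buf), "B:%s", Arg);
  return static_cast<int>(std::strlen(Buf));
}

// The dispatcher's frame stays small no matter how many cases are added.
// Written inline instead, the handler bodies could grow the dispatcher's
// frame by, in the worst case, the sum of all the handlers' locals.
int dispatch(int Kind, const char *Arg) {
  switch (Kind) {
  case 0:
    return handleCaseA(Arg);
  case 1:
    return handleCaseB(Arg);
  default:
    return -1;
  }
}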
Review comment on llvm/lib/IR/AutoUpgrade.cpp (outdated):
    bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
    upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
                       CI->getArgOperand(2), Aligned);
  } else if ((Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp"))) {
Suggested change:
- } else if ((Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp"))) {
+ } else if (Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp")) {
Can you please fix the cases that have redundant parens now?
Makes sense, updated.
Force-pushed from 5085998 to 638eb82.
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 638eb82 to 5115830.
LGTM
Outlining these patterns has a significant impact on the overall stack frame size of llvm::UpgradeIntrinsicCall. This is helpful for scenarios where compilation threads are stack-constrained. The overall impact is low when using clang as the host compiler, but very pronounced when using MSVC 2022 with release builds.
Clang: 1,624 -> 824 bytes
MSVC: 23,560 -> 6,120 bytes