Outline X86 autoupgrade patterns #97851


Merged

Conversation

jholewinski
Contributor

Outlining these patterns has a significant impact on the overall stack frame size of llvm::UpgradeIntrinsicCall. This is helpful for scenarios where compilation threads are stack-constrained. The overall impact is low when using clang as the host compiler, but very pronounced when using MSVC 2022 with release builds.

Clang: 1,624 -> 824 bytes
MSVC: 23,560 -> 6,120 bytes
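
The change is essentially an outlining refactor: the long else-if chain handling the X86 patterns (with its per-branch SmallVectors, index arrays, and other locals) moves out of llvm::UpgradeIntrinsicCall into a static upgradeX86IntrinsicCall helper, so the dispatcher's own frame no longer has to hold every branch's locals at once. Below is a minimal standalone sketch of the effect; it uses made-up names and plain std::array buffers instead of LLVM types, and is only meant to illustrate why outlining can shrink the caller's frame.

// Standalone illustration only (hypothetical names, not LLVM code). Each
// branch of the dispatcher needs sizable scratch storage; some compilers
// reserve space for every branch's locals in the enclosing function's frame,
// so hoisting the chain into a separate, non-inlined helper keeps that space
// out of the caller.
#include <array>
#include <cstdio>
#include <string>

#if defined(_MSC_VER)
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((noinline))
#endif

// Outlined helper: the large per-branch buffers live in this function's
// frame, which is only allocated while the helper itself is running.
NOINLINE static int handleX86Pattern(const std::string &Name) {
  if (Name == "pattern.a") {
    std::array<int, 64> Scratch{}; // branch-local scratch buffer
    Scratch[0] = 1;
    return Scratch[0];
  } else if (Name == "pattern.b") {
    std::array<int, 64> Indices{}; // a different branch with its own buffer
    for (int I = 0; I < 64; ++I)
      Indices[I] = I;
    return Indices[63];
  }
  return -1;
}

// Dispatcher analogous in shape to UpgradeIntrinsicCall: after outlining it
// only needs a small frame of its own.
int upgradeCall(const std::string &Name) {
  if (Name.rfind("x86.", 0) == 0) // starts_with("x86.")
    return handleX86Pattern(Name.substr(4));
  return -1;
}

int main() {
  std::printf("%d\n", upgradeCall("x86.pattern.b")); // prints 63
  return 0;
}

Whether per-branch locals get coalesced into one small frame or accumulated is compiler-dependent, which is consistent with the very different savings measured for Clang and MSVC above.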

@llvmbot llvmbot added the llvm:ir label Jul 5, 2024
@llvmbot
Member

llvmbot commented Jul 5, 2024

@llvm/pr-subscribers-llvm-ir

Author: Justin Holewinski (jholewinski)

Changes

Outlining these patterns has a significant impact on the overall stack frame size of llvm::UpgradeIntrinsicCall. This is helpful for scenarios where compilation threads are stack-constrained. The overall impact is low when using clang as the host compiler, but very pronounced when using MSVC 2022 with release builds.

Clang: 1,624 -> 824 bytes
MSVC: 23,560 -> 6,120 bytes


Patch is 167.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/97851.diff

1 File Affected:

  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+1695-1757)
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 5beefaa1ec701..de27b6848b20a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -2245,6 +2245,1698 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) {
   }
 }
 
+static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
+                                      IRBuilder<> &Builder) {
+  LLVMContext &C = F->getContext();
+  Value *Rep = nullptr;
+
+  if (Name.starts_with("sse4a.movnt.")) {
+    SmallVector<Metadata *, 1> Elts;
+    Elts.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+    MDNode *Node = MDNode::get(C, Elts);
+
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+
+    // Nontemporal (unaligned) store of the 0'th element of the float/double
+    // vector.
+    Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType();
+    PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy);
+    Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast");
+    Value *Extract =
+        Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");
+
+    StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, Align(1));
+    SI->setMetadata(LLVMContext::MD_nontemporal, Node);
+  } else if ((Name.starts_with("avx.movnt.") ||
+              Name.starts_with("avx512.storent."))) {
+    SmallVector<Metadata *, 1> Elts;
+    Elts.push_back(
+        ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+    MDNode *Node = MDNode::get(C, Elts);
+
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+
+    // Convert the type of the pointer to a pointer to the stored type.
+    Value *BC = Builder.CreateBitCast(
+        Arg0, PointerType::getUnqual(Arg1->getType()), "cast");
+    StoreInst *SI = Builder.CreateAlignedStore(
+        Arg1, BC,
+        Align(Arg1->getType()->getPrimitiveSizeInBits().getFixedValue() / 8));
+    SI->setMetadata(LLVMContext::MD_nontemporal, Node);
+  } else if (Name == "sse2.storel.dq") {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+
+    auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2);
+    Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
+    Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0);
+    Value *BC = Builder.CreateBitCast(
+        Arg0, PointerType::getUnqual(Elt->getType()), "cast");
+    Builder.CreateAlignedStore(Elt, BC, Align(1));
+  } else if ((Name.starts_with("sse.storeu.") ||
+              Name.starts_with("sse2.storeu.") ||
+              Name.starts_with("avx.storeu."))) {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+
+    Arg0 = Builder.CreateBitCast(Arg0, PointerType::getUnqual(Arg1->getType()),
+                                 "cast");
+    Builder.CreateAlignedStore(Arg1, Arg0, Align(1));
+  } else if (Name == "avx512.mask.store.ss") {
+    Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1));
+    upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+                       Mask, false);
+  } else if (Name.starts_with("avx512.mask.store")) {
+    // "avx512.mask.storeu." or "avx512.mask.store."
+    bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
+    upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
+                       CI->getArgOperand(2), Aligned);
+  } else if ((Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp"))) {
+    // Upgrade packed integer vector compare intrinsics to compare instructions.
+    // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt."
+    bool CmpEq = Name[9] == 'e';
+    Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
+                             CI->getArgOperand(0), CI->getArgOperand(1));
+    Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+  } else if ((Name.starts_with("avx512.broadcastm"))) {
+    Type *ExtTy = Type::getInt32Ty(C);
+    if (CI->getOperand(0)->getType()->isIntegerTy(8))
+      ExtTy = Type::getInt64Ty(C);
+    unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() /
+                       ExtTy->getPrimitiveSizeInBits();
+    Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
+    Rep = Builder.CreateVectorSplat(NumElts, Rep);
+  } else if ((Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd")) {
+    Value *Vec = CI->getArgOperand(0);
+    Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0);
+    Function *Intr = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt,
+                                               Elt0->getType());
+    Elt0 = Builder.CreateCall(Intr, Elt0);
+    Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0);
+  } else if ((Name.starts_with("avx.sqrt.p") ||
+              Name.starts_with("sse2.sqrt.p") ||
+              Name.starts_with("sse.sqrt.p"))) {
+    Rep =
+        Builder.CreateCall(Intrinsic::getDeclaration(
+                               F->getParent(), Intrinsic::sqrt, CI->getType()),
+                           {CI->getArgOperand(0)});
+  } else if ((Name.starts_with("avx512.mask.sqrt.p"))) {
+    if (CI->arg_size() == 4 &&
+        (!isa<ConstantInt>(CI->getArgOperand(3)) ||
+         cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
+      Intrinsic::ID IID = Name[18] == 's' ? Intrinsic::x86_avx512_sqrt_ps_512
+                                          : Intrinsic::x86_avx512_sqrt_pd_512;
+
+      Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)};
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+                               Args);
+    } else {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                         Intrinsic::sqrt,
+                                                         CI->getType()),
+                               {CI->getArgOperand(0)});
+    }
+    Rep =
+        emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
+  } else if ((Name.starts_with("avx512.ptestm") ||
+              Name.starts_with("avx512.ptestnm"))) {
+    Value *Op0 = CI->getArgOperand(0);
+    Value *Op1 = CI->getArgOperand(1);
+    Value *Mask = CI->getArgOperand(2);
+    Rep = Builder.CreateAnd(Op0, Op1);
+    llvm::Type *Ty = Op0->getType();
+    Value *Zero = llvm::Constant::getNullValue(Ty);
+    ICmpInst::Predicate Pred = Name.starts_with("avx512.ptestm")
+                                   ? ICmpInst::ICMP_NE
+                                   : ICmpInst::ICMP_EQ;
+    Rep = Builder.CreateICmp(Pred, Rep, Zero);
+    Rep = applyX86MaskOn1BitsVec(Builder, Rep, Mask);
+  } else if ((Name.starts_with("avx512.mask.pbroadcast"))) {
+    unsigned NumElts = cast<FixedVectorType>(CI->getArgOperand(1)->getType())
+                           ->getNumElements();
+    Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
+    Rep =
+        emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1));
+  } else if ((Name.starts_with("avx512.kunpck"))) {
+    unsigned NumElts = CI->getType()->getScalarSizeInBits();
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts);
+    int Indices[64];
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = i;
+
+    // First extract half of each vector. This gives better codegen than
+    // doing it in a single shuffle.
+    LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
+    RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
+    // Concat the vectors.
+    // NOTE: Operands have to be swapped to match intrinsic definition.
+    Rep = Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if (Name == "avx512.kand.w") {
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+    Rep = Builder.CreateAnd(LHS, RHS);
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if (Name == "avx512.kandn.w") {
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+    LHS = Builder.CreateNot(LHS);
+    Rep = Builder.CreateAnd(LHS, RHS);
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if (Name == "avx512.kor.w") {
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+    Rep = Builder.CreateOr(LHS, RHS);
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if (Name == "avx512.kxor.w") {
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+    Rep = Builder.CreateXor(LHS, RHS);
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if (Name == "avx512.kxnor.w") {
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+    LHS = Builder.CreateNot(LHS);
+    Rep = Builder.CreateXor(LHS, RHS);
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if (Name == "avx512.knot.w") {
+    Rep = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Rep = Builder.CreateNot(Rep);
+    Rep = Builder.CreateBitCast(Rep, CI->getType());
+  } else if ((Name == "avx512.kortestz.w" || Name == "avx512.kortestc.w")) {
+    Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16);
+    Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16);
+    Rep = Builder.CreateOr(LHS, RHS);
+    Rep = Builder.CreateBitCast(Rep, Builder.getInt16Ty());
+    Value *C;
+    if (Name[14] == 'c')
+      C = ConstantInt::getAllOnesValue(Builder.getInt16Ty());
+    else
+      C = ConstantInt::getNullValue(Builder.getInt16Ty());
+    Rep = Builder.CreateICmpEQ(Rep, C);
+    Rep = Builder.CreateZExt(Rep, Builder.getInt32Ty());
+  } else if ((Name == "sse.add.ss" || Name == "sse2.add.sd" ||
+              Name == "sse.sub.ss" || Name == "sse2.sub.sd" ||
+              Name == "sse.mul.ss" || Name == "sse2.mul.sd" ||
+              Name == "sse.div.ss" || Name == "sse2.div.sd")) {
+    Type *I32Ty = Type::getInt32Ty(C);
+    Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
+                                               ConstantInt::get(I32Ty, 0));
+    Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
+                                               ConstantInt::get(I32Ty, 0));
+    Value *EltOp;
+    if (Name.contains(".add."))
+      EltOp = Builder.CreateFAdd(Elt0, Elt1);
+    else if (Name.contains(".sub."))
+      EltOp = Builder.CreateFSub(Elt0, Elt1);
+    else if (Name.contains(".mul."))
+      EltOp = Builder.CreateFMul(Elt0, Elt1);
+    else
+      EltOp = Builder.CreateFDiv(Elt0, Elt1);
+    Rep = Builder.CreateInsertElement(CI->getArgOperand(0), EltOp,
+                                      ConstantInt::get(I32Ty, 0));
+  } else if (Name.starts_with("avx512.mask.pcmp")) {
+    // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
+    bool CmpEq = Name[16] == 'e';
+    Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
+  } else if (Name.starts_with("avx512.mask.vpshufbitqmb.")) {
+    Type *OpTy = CI->getArgOperand(0)->getType();
+    unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+    Intrinsic::ID IID;
+    switch (VecWidth) {
+    default:
+      llvm_unreachable("Unexpected intrinsic");
+    case 128:
+      IID = Intrinsic::x86_avx512_vpshufbitqmb_128;
+      break;
+    case 256:
+      IID = Intrinsic::x86_avx512_vpshufbitqmb_256;
+      break;
+    case 512:
+      IID = Intrinsic::x86_avx512_vpshufbitqmb_512;
+      break;
+    }
+
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                             {CI->getOperand(0), CI->getArgOperand(1)});
+    Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
+  } else if (Name.starts_with("avx512.mask.fpclass.p")) {
+    Type *OpTy = CI->getArgOperand(0)->getType();
+    unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+    unsigned EltWidth = OpTy->getScalarSizeInBits();
+    Intrinsic::ID IID;
+    if (VecWidth == 128 && EltWidth == 32)
+      IID = Intrinsic::x86_avx512_fpclass_ps_128;
+    else if (VecWidth == 256 && EltWidth == 32)
+      IID = Intrinsic::x86_avx512_fpclass_ps_256;
+    else if (VecWidth == 512 && EltWidth == 32)
+      IID = Intrinsic::x86_avx512_fpclass_ps_512;
+    else if (VecWidth == 128 && EltWidth == 64)
+      IID = Intrinsic::x86_avx512_fpclass_pd_128;
+    else if (VecWidth == 256 && EltWidth == 64)
+      IID = Intrinsic::x86_avx512_fpclass_pd_256;
+    else if (VecWidth == 512 && EltWidth == 64)
+      IID = Intrinsic::x86_avx512_fpclass_pd_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
+
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                             {CI->getOperand(0), CI->getArgOperand(1)});
+    Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
+  } else if (Name.starts_with("avx512.cmp.p")) {
+    SmallVector<Value *, 4> Args(CI->args());
+    Type *OpTy = Args[0]->getType();
+    unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+    unsigned EltWidth = OpTy->getScalarSizeInBits();
+    Intrinsic::ID IID;
+    if (VecWidth == 128 && EltWidth == 32)
+      IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
+    else if (VecWidth == 256 && EltWidth == 32)
+      IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
+    else if (VecWidth == 512 && EltWidth == 32)
+      IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
+    else if (VecWidth == 128 && EltWidth == 64)
+      IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
+    else if (VecWidth == 256 && EltWidth == 64)
+      IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
+    else if (VecWidth == 512 && EltWidth == 64)
+      IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
+
+    Value *Mask = Constant::getAllOnesValue(CI->getType());
+    if (VecWidth == 512)
+      std::swap(Mask, Args.back());
+    Args.push_back(Mask);
+
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                             Args);
+  } else if (Name.starts_with("avx512.mask.cmp.")) {
+    // Integer compare intrinsics.
+    unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+    Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
+  } else if (Name.starts_with("avx512.mask.ucmp.")) {
+    unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+    Rep = upgradeMaskedCompare(Builder, *CI, Imm, false);
+  } else if ((Name.starts_with("avx512.cvtb2mask.") ||
+              Name.starts_with("avx512.cvtw2mask.") ||
+              Name.starts_with("avx512.cvtd2mask.") ||
+              Name.starts_with("avx512.cvtq2mask."))) {
+    Value *Op = CI->getArgOperand(0);
+    Value *Zero = llvm::Constant::getNullValue(Op->getType());
+    Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero);
+    Rep = applyX86MaskOn1BitsVec(Builder, Rep, nullptr);
+  } else if ((Name == "ssse3.pabs.b.128" || Name == "ssse3.pabs.w.128" ||
+              Name == "ssse3.pabs.d.128" || Name.starts_with("avx2.pabs") ||
+              Name.starts_with("avx512.mask.pabs"))) {
+    Rep = upgradeAbs(Builder, *CI);
+  } else if ((Name == "sse41.pmaxsb" || Name == "sse2.pmaxs.w" ||
+              Name == "sse41.pmaxsd" || Name.starts_with("avx2.pmaxs") ||
+              Name.starts_with("avx512.mask.pmaxs"))) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax);
+  } else if ((Name == "sse2.pmaxu.b" || Name == "sse41.pmaxuw" ||
+              Name == "sse41.pmaxud" || Name.starts_with("avx2.pmaxu") ||
+              Name.starts_with("avx512.mask.pmaxu"))) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax);
+  } else if ((Name == "sse41.pminsb" || Name == "sse2.pmins.w" ||
+              Name == "sse41.pminsd" || Name.starts_with("avx2.pmins") ||
+              Name.starts_with("avx512.mask.pmins"))) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin);
+  } else if ((Name == "sse2.pminu.b" || Name == "sse41.pminuw" ||
+              Name == "sse41.pminud" || Name.starts_with("avx2.pminu") ||
+              Name.starts_with("avx512.mask.pminu"))) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin);
+  } else if ((Name == "sse2.pmulu.dq" || Name == "avx2.pmulu.dq" ||
+              Name == "avx512.pmulu.dq.512" ||
+              Name.starts_with("avx512.mask.pmulu.dq."))) {
+    Rep = upgradePMULDQ(Builder, *CI, /*Signed*/ false);
+  } else if ((Name == "sse41.pmuldq" || Name == "avx2.pmul.dq" ||
+              Name == "avx512.pmul.dq.512" ||
+              Name.starts_with("avx512.mask.pmul.dq."))) {
+    Rep = upgradePMULDQ(Builder, *CI, /*Signed*/ true);
+  } else if ((Name == "sse.cvtsi2ss" || Name == "sse2.cvtsi2sd" ||
+              Name == "sse.cvtsi642ss" || Name == "sse2.cvtsi642sd")) {
+    Rep =
+        Builder.CreateSIToFP(CI->getArgOperand(1),
+                             cast<VectorType>(CI->getType())->getElementType());
+    Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+  } else if (Name == "avx512.cvtusi2sd") {
+    Rep =
+        Builder.CreateUIToFP(CI->getArgOperand(1),
+                             cast<VectorType>(CI->getType())->getElementType());
+    Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+  } else if (Name == "sse2.cvtss2sd") {
+    Rep = Builder.CreateExtractElement(CI->getArgOperand(1), (uint64_t)0);
+    Rep = Builder.CreateFPExt(
+        Rep, cast<VectorType>(CI->getType())->getElementType());
+    Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
+  } else if ((Name == "sse2.cvtdq2pd" || Name == "sse2.cvtdq2ps" ||
+              Name == "avx.cvtdq2.pd.256" || Name == "avx.cvtdq2.ps.256" ||
+              Name.starts_with("avx512.mask.cvtdq2pd.") ||
+              Name.starts_with("avx512.mask.cvtudq2pd.") ||
+              Name.starts_with("avx512.mask.cvtdq2ps.") ||
+              Name.starts_with("avx512.mask.cvtudq2ps.") ||
+              Name.starts_with("avx512.mask.cvtqq2pd.") ||
+              Name.starts_with("avx512.mask.cvtuqq2pd.") ||
+              Name == "avx512.mask.cvtqq2ps.256" ||
+              Name == "avx512.mask.cvtqq2ps.512" ||
+              Name == "avx512.mask.cvtuqq2ps.256" ||
+              Name == "avx512.mask.cvtuqq2ps.512" || Name == "sse2.cvtps2pd" ||
+              Name == "avx.cvt.ps2.pd.256" ||
+              Name == "avx512.mask.cvtps2pd.128" ||
+              Name == "avx512.mask.cvtps2pd.256")) {
+    auto *DstTy = cast<FixedVectorType>(CI->getType());
+    Rep = CI->getArgOperand(0);
+    auto *SrcTy = cast<FixedVectorType>(Rep->getType());
+
+    unsigned NumDstElts = DstTy->getNumElements();
+    if (NumDstElts < SrcTy->getNumElements()) {
+      assert(NumDstElts == 2 && "Unexpected vector size");
+      Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef<int>{0, 1});
+    }
+
+    bool IsPS2PD = SrcTy->getElementType()->isFloatTy();
+    bool IsUnsigned = Name.contains("cvtu");
+    if (IsPS2PD)
+      Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
+    else if (CI->arg_size() == 4 &&
+             (!isa<ConstantInt>(CI->getArgOperand(3)) ||
+              cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
+      Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round
+                                     : Intrinsic::x86_avx512_sitofp_round;
+      Function *F =
+          Intrinsic::getDeclaration(CI->getModule(), IID, {DstTy, SrcTy});
+      Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)});
+    } else {
+      Rep = IsUnsigned ? Builder.CreateUIToFP(Rep, DstTy, "cvt")
+                       : Builder.CreateSIToFP(Rep, DstTy, "cvt");
+    }
+
+    if (CI->arg_size() >= 3)
+      Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep,
+                          CI->getArgOperand(1));
+  } else if ((Name.starts_with("avx512.mask.vcvtph2ps.") ||
+             ...
[truncated]

    bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
    upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
                       CI->getArgOperand(2), Aligned);
  } else if ((Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp"))) {
Contributor


Suggested change
} else if ((Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp"))) {
} else if (Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp")) {

Can you please fix the cases that have redundant parens now?

Contributor Author


Makes sense, updated.

@jholewinski jholewinski force-pushed the dev/jholewinski/autoupgrade-stack-fix branch from 5085998 to 638eb82 on July 5, 2024 22:05

github-actions bot commented Jul 5, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@jholewinski jholewinski force-pushed the dev/jholewinski/autoupgrade-stack-fix branch from 638eb82 to 5115830 on July 5, 2024 22:11
Contributor

@nikic nikic left a comment


LGTM

@jholewinski jholewinski merged commit 9374f83 into llvm:main Jul 6, 2024
7 checks passed
@jholewinski jholewinski deleted the dev/jholewinski/autoupgrade-stack-fix branch on July 6, 2024 13:24