-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[MemProf] Disable alloc context in combined summary for ndebug builds #139161
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[MemProf] Disable alloc context in combined summary for ndebug builds #139161
Conversation
Since we currently only use the context information in the alloc info summary in the LTO backend for assertion checking, there is no need to write this into the combined summary index for distributed ThinLTO for NDEBUG builds. Put this under a new -combined-index-memprof-context option which is off by default for NDEBUG. The advantage is that we save time (not having to sort in preparation for building the radix trees), and space in the generated bitcode files. We could also do so for the callsite info records, but those are smaller and less expensive to prepare.
@llvm/pr-subscribers-lto Author: Teresa Johnson (teresajohnson) ChangesSince we currently only use the context information in the alloc info The advantage is that we save time (not having to sort in preparation We could also do so for the callsite info records, but those are smaller Full diff: https://github.com/llvm/llvm-project/pull/139161.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 73e74f3836ca4..b362a88963f6c 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -335,6 +335,11 @@ enum GlobalValueSummarySymtabCodes {
// CallStackRadixTreeBuilder class in ProfileData/MemProf.h for format.
// [n x entry]
FS_CONTEXT_RADIX_TREE_ARRAY = 32,
+ // Summary of combined index allocation memprof metadata, without context.
+ // [nummib, numver,
+ // nummib x alloc type,
+ // numver x version]
+ FS_COMBINED_ALLOC_INFO_NO_CONTEXT = 33,
};
enum MetadataCodes {
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 96f86eb52f15c..2dfc8254d5885 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -10781,12 +10781,15 @@ bool LLParser::parseMemProfs(std::vector<MIBInfo> &MIBs) {
return true;
SmallVector<unsigned> StackIdIndices;
- do {
- uint64_t StackId = 0;
- if (parseUInt64(StackId))
- return true;
- StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId));
- } while (EatIfPresent(lltok::comma));
+ // Combined index alloc records may not have a stack id list.
+ if (Lex.getKind() != lltok::rparen) {
+ do {
+ uint64_t StackId = 0;
+ if (parseUInt64(StackId))
+ return true;
+ StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId));
+ } while (EatIfPresent(lltok::comma));
+ }
if (parseToken(lltok::rparen, "expected ')' in stackIds"))
return true;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 032c0de3c7a00..fe9e0ddca7091 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -330,6 +330,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
STRINGIFY_CODE(FS, STACK_IDS)
STRINGIFY_CODE(FS, ALLOC_CONTEXT_IDS)
STRINGIFY_CODE(FS, CONTEXT_RADIX_TREE_ARRAY)
+ STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO_NO_CONTEXT)
}
case bitc::METADATA_ATTACHMENT_ID:
switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index de7e9bbe69bd4..a7fbb0c74cb1e 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8178,7 +8178,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
break;
}
- case bitc::FS_COMBINED_ALLOC_INFO: {
+ case bitc::FS_COMBINED_ALLOC_INFO:
+ case bitc::FS_COMBINED_ALLOC_INFO_NO_CONTEXT: {
unsigned I = 0;
std::vector<MIBInfo> MIBs;
unsigned NumMIBs = Record[I++];
@@ -8187,7 +8188,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
while (MIBsRead++ < NumMIBs) {
assert(Record.size() - I >= 2);
AllocationType AllocType = (AllocationType)Record[I++];
- auto StackIdList = parseAllocInfoContext(Record, I);
+ SmallVector<unsigned> StackIdList;
+ if (BitCode == bitc::FS_COMBINED_ALLOC_INFO)
+ StackIdList = parseAllocInfoContext(Record, I);
MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
}
assert(Record.size() - I >= NumVersions);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 27ada0ddcd831..4c28146e1e69d 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -99,6 +99,21 @@ static cl::opt<bool> WriteRelBFToSummary(
"write-relbf-to-summary", cl::Hidden, cl::init(false),
cl::desc("Write relative block frequency to function summary "));
+// Since we only use the context information in the memprof summary records in
+// the LTO backends to do assertion checking, save time and space by only
+// serializing the context for non-NDEBUG builds.
+// TODO: Currently this controls writing context of the allocation info records,
+// which are larger and more expensive, but we should do this for the callsite
+// records as well.
+static cl::opt<bool>
+ CombinedIndexMemProfContext("combined-index-memprof-context", cl::Hidden,
+#ifndef NDEBUG
+ cl::init(true),
+#else
+ cl::init(false),
+#endif
+ cl::desc(""));
+
namespace llvm {
extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold;
}
@@ -528,10 +543,12 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
for (auto Idx : CI.StackIdIndices)
RecordStackIdReference(Idx);
}
- for (auto &AI : FS->allocs())
- for (auto &MIB : AI.MIBs)
- for (auto Idx : MIB.StackIdIndices)
- RecordStackIdReference(Idx);
+ if (CombinedIndexMemProfContext) {
+ for (auto &AI : FS->allocs())
+ for (auto &MIB : AI.MIBs)
+ for (auto Idx : MIB.StackIdIndices)
+ RecordStackIdReference(Idx);
+ }
});
}
@@ -4349,9 +4366,11 @@ static void writeFunctionHeapProfileRecords(
Record.push_back(AI.Versions.size());
for (auto &MIB : AI.MIBs) {
Record.push_back((uint8_t)MIB.AllocType);
- // Record the index into the radix tree array for this context.
- assert(CallStackCount <= CallStackPos.size());
- Record.push_back(CallStackPos[CallStackCount++]);
+ if (PerModule || CombinedIndexMemProfContext) {
+ // Record the index into the radix tree array for this context.
+ assert(CallStackCount <= CallStackPos.size());
+ Record.push_back(CallStackPos[CallStackCount++]);
+ }
}
if (!PerModule)
llvm::append_range(Record, AI.Versions);
@@ -4384,8 +4403,11 @@ static void writeFunctionHeapProfileRecords(
Stream.EmitRecord(bitc::FS_ALLOC_CONTEXT_IDS, ContextIds,
ContextIdAbbvId);
}
- Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
- : bitc::FS_COMBINED_ALLOC_INFO,
+ Stream.EmitRecord(PerModule
+ ? bitc::FS_PERMODULE_ALLOC_INFO
+ : (CombinedIndexMemProfContext
+ ? bitc::FS_COMBINED_ALLOC_INFO
+ : bitc::FS_COMBINED_ALLOC_INFO_NO_CONTEXT),
Record, AllocAbbrev);
}
}
@@ -4847,7 +4869,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv));
Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO));
+ Abbv->Add(BitCodeAbbrevOp(CombinedIndexMemProfContext
+ ? bitc::FS_COMBINED_ALLOC_INFO
+ : bitc::FS_COMBINED_ALLOC_INFO_NO_CONTEXT));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
// nummib x (alloc type, context radix tree index),
@@ -4857,13 +4881,6 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
- Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_RADIX_TREE_ARRAY));
- // n x entry
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
- unsigned RadixAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
auto shouldImportValueAsDecl = [&](GlobalValueSummary *GVS) -> bool {
if (DecSummaries == nullptr)
return false;
@@ -4900,44 +4917,54 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.clear();
};
- // First walk through all the functions and collect the allocation contexts in
- // their associated summaries, for use in constructing a radix tree of
- // contexts. Note that we need to do this in the same order as the functions
- // are processed further below since the call stack positions in the resulting
- // radix tree array are identified based on this order.
- MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> CallStacks;
- forEachSummary([&](GVInfo I, bool IsAliasee) {
- // Don't collect this when invoked for an aliasee, as it is not needed for
- // the alias summary. If the aliasee is to be imported, we will invoke this
- // separately with IsAliasee=false.
- if (IsAliasee)
- return;
- GlobalValueSummary *S = I.second;
- assert(S);
- auto *FS = dyn_cast<FunctionSummary>(S);
- if (!FS)
- return;
- collectMemProfCallStacks(
- FS,
- /*GetStackIndex*/
- [&](unsigned I) {
- // Get the corresponding index into the list of StackIds actually
- // being written for this combined index (which may be a subset in
- // the case of distributed indexes).
- assert(StackIdIndicesToIndex.contains(I));
- return StackIdIndicesToIndex[I];
- },
- CallStacks);
- });
- // Finalize the radix tree, write it out, and get the map of positions in the
- // linearized tree array.
DenseMap<CallStackId, LinearCallStackId> CallStackPos;
- if (!CallStacks.empty()) {
- CallStackPos =
- writeMemoryProfileRadixTree(std::move(CallStacks), Stream, RadixAbbrev);
+ if (CombinedIndexMemProfContext) {
+ Abbv = std::make_shared<BitCodeAbbrev>();
+ Abbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_RADIX_TREE_ARRAY));
+ // n x entry
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ unsigned RadixAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+ // First walk through all the functions and collect the allocation contexts
+ // in their associated summaries, for use in constructing a radix tree of
+ // contexts. Note that we need to do this in the same order as the functions
+ // are processed further below since the call stack positions in the
+ // resulting radix tree array are identified based on this order.
+ MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> CallStacks;
+ forEachSummary([&](GVInfo I, bool IsAliasee) {
+ // Don't collect this when invoked for an aliasee, as it is not needed for
+ // the alias summary. If the aliasee is to be imported, we will invoke
+ // this separately with IsAliasee=false.
+ if (IsAliasee)
+ return;
+ GlobalValueSummary *S = I.second;
+ assert(S);
+ auto *FS = dyn_cast<FunctionSummary>(S);
+ if (!FS)
+ return;
+ collectMemProfCallStacks(
+ FS,
+ /*GetStackIndex*/
+ [&](unsigned I) {
+ // Get the corresponding index into the list of StackIds actually
+ // being written for this combined index (which may be a subset in
+ // the case of distributed indexes).
+ assert(StackIdIndicesToIndex.contains(I));
+ return StackIdIndicesToIndex[I];
+ },
+ CallStacks);
+ });
+ // Finalize the radix tree, write it out, and get the map of positions in
+ // the linearized tree array.
+ if (!CallStacks.empty()) {
+ CallStackPos = writeMemoryProfileRadixTree(std::move(CallStacks), Stream,
+ RadixAbbrev);
+ }
}
- // Keep track of the current index into the CallStackPos map.
+ // Keep track of the current index into the CallStackPos map. Not used if
+ // CombinedIndexMemProfContext is false.
CallStackId CallStackCount = 0;
DenseSet<GlobalValue::GUID> DefOrUseGUIDs;
diff --git a/llvm/test/Assembler/thinlto-memprof-summary.ll b/llvm/test/Assembler/thinlto-memprof-summary.ll
index 69eafc967c2a3..4e4a928c1f024 100644
--- a/llvm/test/Assembler/thinlto-memprof-summary.ll
+++ b/llvm/test/Assembler/thinlto-memprof-summary.ll
@@ -1,5 +1,12 @@
;; Test memprof summary parsing (tests all types/fields in various combinations).
-; RUN: llvm-as %s -o - | llvm-dis -o - | FileCheck %s
+
+;; Force enable -combined-index-memprof-context to get the allocation context
+;; stack ids even in release builds.
+; RUN: llvm-as %s -o - -combined-index-memprof-context | llvm-dis -o - | FileCheck %s --check-prefixes=CHECK,CONTEXT
+
+;; Force disable -combined-index-memprof-context to block the allocation context
+;; stack ids even in non-release builds.
+; RUN: llvm-as %s -o - -combined-index-memprof-context=false | llvm-dis -o - | FileCheck %s --check-prefixes=CHECK,NOCONTEXT
; ModuleID = 'thinlto-memprof-summary.thinlto.bc'
@@ -17,8 +24,10 @@
; Make sure we get back from llvm-dis what we put in via llvm-as.
; CHECK: ^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
-; CHECK: ^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: hot, stackIds: (987654321))))))))
+; CONTEXT: ^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: hot, stackIds: (987654321))))))))
+; NOCONTEXT: ^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: ()), (type: cold, stackIds: ()), (type: hot, stackIds: ())))))))
; CHECK: ^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789))))))
-; CHECK: ^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789))))))))
+; CONTEXT: ^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789))))))))
+; NOCONTEXT: ^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: ()), (type: cold, stackIds: ()))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: ()), (type: notcold, stackIds: ())))))))
; CHECK: ^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789))))))
; CHECK: ^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0, importType: definition), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414))))))
diff --git a/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll b/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll
index 63139cacd8fba..fc7d04fcf4d58 100644
--- a/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll
+++ b/llvm/test/ThinLTO/X86/memprof_direct_recursion.ll
@@ -33,6 +33,7 @@
; RUN: llvm-lto2 run %t/b.o %t/a.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
; RUN: -thinlto-distributed-indexes \
+; RUN: -combined-index-memprof-context \
; RUN: -r=%t/b.o,_Z3fooi,plx \
; RUN: -r=%t/b.o,aliasee,plx \
; RUN: -r=%t/b.o,a \
|
Since we currently only use the context information in the alloc info
summary in the LTO backend for assertion checking, there is no need to
write this into the combined summary index for distributed ThinLTO for
NDEBUG builds. Put this under a new -combined-index-memprof-context
option which is off by default for NDEBUG.
The advantage is that we save time (not having to sort in preparation
for building the radix trees), and space in the generated bitcode files.
We could also do so for the callsite info records, but those are smaller
and less expensive to prepare.