Skip to content

Commit 2718596

Browse files
authored
Support tracing thread locks with perfetto (#143)
- remove sampling and roctracer flat/timeline options
- unused/unnecessary clutter
- start pthread_gotcha before perfetto
- remove pthread_mutex_gotcha validate
- update timemory submodule with tid fix
1 parent e67afd3 commit 2718596

File tree

15 files changed

+122
-204
lines changed

15 files changed

+122
-204
lines changed

source/docs/runtime.md

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,6 @@ OMNITRACE_CRITICAL_TRACE_PER_ROW = 0
196196
OMNITRACE_CRITICAL_TRACE_SERIALIZE_NAMES = false
197197
OMNITRACE_DEBUG = false
198198
OMNITRACE_DL_VERBOSE = 0
199-
OMNITRACE_FLAT_SAMPLING = false
200199
OMNITRACE_INSTRUMENTATION_INTERVAL = 1
201200
OMNITRACE_KOKKOS_KERNEL_LOGGER = false
202201
OMNITRACE_PAPI_EVENTS = PAPI_TOT_CYC
@@ -206,17 +205,14 @@ OMNITRACE_PERFETTO_COMBINE_TRACES = true
206205
OMNITRACE_PERFETTO_FILE = perfetto-trace.proto
207206
OMNITRACE_PERFETTO_FILL_POLICY = discard
208207
OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB = 4096
209-
OMNITRACE_ROCTRACER_FLAT_PROFILE = false
210208
OMNITRACE_ROCTRACER_HSA_ACTIVITY = false
211209
OMNITRACE_ROCTRACER_HSA_API = false
212210
OMNITRACE_ROCTRACER_HSA_API_TYPES =
213-
OMNITRACE_ROCTRACER_TIMELINE_PROFILE = false
214211
OMNITRACE_SAMPLING_CPUS =
215212
OMNITRACE_SAMPLING_DELAY = 0.5
216213
OMNITRACE_SAMPLING_FREQ = 10
217214
OMNITRACE_SAMPLING_GPUS = all
218215
OMNITRACE_TIME_OUTPUT = true
219-
OMNITRACE_TIMELINE_SAMPLING = false
220216
OMNITRACE_TIMEMORY_COMPONENTS = wall_clock
221217
OMNITRACE_TRACE_THREAD_LOCKS = false
222218
OMNITRACE_VERBOSE = 0
@@ -297,7 +293,6 @@ $ omnitrace-avail -S -bd
297293
| OMNITRACE_ENABLE_SIGNAL_HANDLER | Enable signals in timemory_init |
298294
| OMNITRACE_FILE_OUTPUT | Write output to files |
299295
| OMNITRACE_FLAT_PROFILE | Set the label hierarchy mode to defa... |
300-
| OMNITRACE_FLAT_SAMPLING | Ignore hierarchy in all statistical ... |
301296
| OMNITRACE_INPUT_EXTENSIONS | File extensions used when searching ... |
302297
| OMNITRACE_INPUT_PATH | Explicitly specify the input folder ... |
303298
| OMNITRACE_INPUT_PREFIX | Explicitly specify the prefix for in... |
@@ -328,11 +323,9 @@ $ omnitrace-avail -S -bd
328323
| OMNITRACE_PERFETTO_FILL_POLICY | Behavior when perfetto buffer is ful... |
329324
| OMNITRACE_PERFETTO_SHMEM_SIZE_HINT_KB | Hint for shared-memory buffer size i... |
330325
| OMNITRACE_PRECISION | Set the global output precision for ... |
331-
| OMNITRACE_ROCTRACER_FLAT_PROFILE | Ignore hierarchy in all kernels entr... |
332326
| OMNITRACE_ROCTRACER_HSA_ACTIVITY | Enable HSA activity tracing support |
333327
| OMNITRACE_ROCTRACER_HSA_API | Enable HSA API tracing support |
334328
| OMNITRACE_ROCTRACER_HSA_API_TYPES | HSA API type to collect |
335-
| OMNITRACE_ROCTRACER_TIMELINE_PROFILE | Create unique entries for every kern... |
336329
| OMNITRACE_SAMPLING_CPUS | CPUs to collect frequency informatio... |
337330
| OMNITRACE_SAMPLING_DELAY | Number of seconds to wait before the... |
338331
| OMNITRACE_SAMPLING_FREQ | Number of software interrupts per se... |
@@ -343,7 +336,6 @@ $ omnitrace-avail -S -bd
343336
| OMNITRACE_SUPPRESS_PARSING | Disable parsing environment |
344337
| OMNITRACE_TEXT_OUTPUT | Write text output files |
345338
| OMNITRACE_TIMELINE_PROFILE | Set the label hierarchy mode to defa... |
346-
| OMNITRACE_TIMELINE_SAMPLING | Create unique entries for every samp... |
347339
| OMNITRACE_TIMEMORY_COMPONENTS | List of components to collect via ti... |
348340
| OMNITRACE_TIME_FORMAT | Customize the folder generation when... |
349341
| OMNITRACE_TIME_OUTPUT | Output data to subfolder w/ a timest... |

source/lib/omnitrace/library.cpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,8 @@ omnitrace_set_env_hidden(const char* env_name, const char* env_val)
183183

184184
namespace
185185
{
186-
bool _set_mpi_called = false;
187-
std::function<void()> _start_gotcha_callback = []() {};
186+
bool _set_mpi_called = false;
187+
std::function<void()> _preinit_callback = []() {};
188188
} // namespace
189189

190190
extern "C" void
@@ -223,7 +223,7 @@ omnitrace_set_mpi_hidden(bool use, bool attached)
223223
std::to_string(use).c_str(), std::to_string(attached).c_str(),
224224
std::to_string(get_state()).c_str());
225225

226-
_start_gotcha_callback();
226+
_preinit_callback();
227227
}
228228

229229
//======================================================================================//
@@ -356,6 +356,9 @@ omnitrace_init_tooling_hidden()
356356

357357
OMNITRACE_SCOPED_SAMPLING_ON_CHILD_THREADS(false);
358358

359+
// start these gotchas once settings have been initialized
360+
get_init_bundle()->start();
361+
359362
if(get_use_sampling()) sampling::block_signals();
360363

361364
if(get_use_critical_trace())
@@ -554,11 +557,11 @@ omnitrace_init_hidden(const char* _mode, bool _is_binary_rewrite, const char* _a
554557

555558
if(!_set_mpi_called)
556559
{
557-
_start_gotcha_callback = []() { get_gotcha_bundle()->start(); };
560+
_preinit_callback = []() { get_preinit_bundle()->start(); };
558561
}
559562
else
560563
{
561-
get_gotcha_bundle()->start();
564+
get_preinit_bundle()->start();
562565
}
563566
}
564567

@@ -615,7 +618,7 @@ omnitrace_finalize_hidden(void)
615618
if(_debug_init) config::set_setting_value("OMNITRACE_DEBUG", _debug_value);
616619
} };
617620

618-
auto& _thread_bundle = thread_data<omnitrace_thread_bundle_t>::instance();
621+
auto& _thread_bundle = thread_data<thread_bundle_t>::instance();
619622
if(_thread_bundle) _thread_bundle->stop();
620623

621624
if(dmp::rank() == 0 && get_verbose() >= 0) fprintf(stderr, "\n");
@@ -644,7 +647,7 @@ omnitrace_finalize_hidden(void)
644647
}
645648
}
646649

647-
// stop the main bundle which shuts down the pthread gotchas
650+
// stop the main bundle which has stats for run
648651
if(get_main_bundle())
649652
{
650653
OMNITRACE_DEBUG_F("Stopping main bundle...\n");
@@ -690,12 +693,18 @@ omnitrace_finalize_hidden(void)
690693
}
691694
}
692695

696+
// stop the main gotcha which shuts down the pthread gotchas
697+
if(get_init_bundle())
698+
{
699+
OMNITRACE_DEBUG_F("Stopping main gotcha...\n");
700+
get_init_bundle()->stop();
701+
}
702+
693703
// stop the gotcha bundle
694-
if(get_gotcha_bundle())
704+
if(get_preinit_bundle())
695705
{
696706
OMNITRACE_VERBOSE_F(1, "Shutting down miscellaneous gotchas...\n");
697-
get_gotcha_bundle()->stop();
698-
get_gotcha_bundle().reset();
707+
get_preinit_bundle()->stop();
699708
component::mpi_gotcha::shutdown();
700709
}
701710

@@ -746,7 +755,7 @@ omnitrace_finalize_hidden(void)
746755
// if they are still running (e.g. thread-pool still alive), the
747756
// thread-specific data will be wrong if we try to stop them from
748757
// the main thread.
749-
for(auto& itr : thread_data<omnitrace_thread_bundle_t>::instances())
758+
for(auto& itr : thread_data<thread_bundle_t>::instances())
750759
{
751760
if(itr && itr->get<comp::wall_clock>() &&
752761
!itr->get<comp::wall_clock>()->get_is_running())

source/lib/omnitrace/library/components/pthread_create_gotcha.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ pthread_create_gotcha::wrapper::wrapper(routine_t _routine, void* _arg,
144144
void*
145145
pthread_create_gotcha::wrapper::operator()() const
146146
{
147-
using thread_bundle_data_t = thread_data<omnitrace_thread_bundle_t>;
147+
using thread_bundle_data_t = thread_data<thread_bundle_t>;
148148

149149
if(is_shutdown && *is_shutdown)
150150
{
@@ -195,7 +195,7 @@ pthread_create_gotcha::wrapper::operator()() const
195195
threading::set_thread_name(TIMEMORY_JOIN(" ", "Thread", _tid).c_str());
196196
if(!thread_bundle_data_t::instances().at(_tid))
197197
{
198-
thread_data<omnitrace_thread_bundle_t>::construct(
198+
thread_data<thread_bundle_t>::construct(
199199
TIMEMORY_JOIN('/', "omnitrace/process", process::get_id(), "thread",
200200
_tid),
201201
quirk::config<quirk::auto_start>{});

source/lib/omnitrace/library/components/pthread_mutex_gotcha.cpp

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "library/debug.hpp"
2828
#include "library/runtime.hpp"
2929
#include "library/sampling.hpp"
30+
#include "library/thread_info.hpp"
3031
#include "library/utility.hpp"
3132

3233
#include <timemory/backends/threading.hpp>
@@ -98,8 +99,6 @@ pthread_mutex_gotcha::configure()
9899
pthread_mutex_gotcha_t::get_initializer() = []() {
99100
if(config::get_trace_thread_locks())
100101
{
101-
validate();
102-
103102
pthread_mutex_gotcha_t::configure(
104103
comp::gotcha_config<0, int, pthread_mutex_t*>{ "pthread_mutex_lock" });
105104

@@ -161,31 +160,6 @@ pthread_mutex_gotcha::shutdown()
161160
pthread_mutex_gotcha_t::disable();
162161
}
163162

164-
void
165-
pthread_mutex_gotcha::validate()
166-
{
167-
if(config::get_trace_thread_locks() && config::get_use_perfetto())
168-
{
169-
OMNITRACE_PRINT_F("\n");
170-
OMNITRACE_PRINT_F("\n");
171-
OMNITRACE_PRINT_F("\n");
172-
OMNITRACE_PRINT_F(
173-
"The overhead of all the mutex locking internally by perfetto is\n")
174-
OMNITRACE_PRINT_F(
175-
"so significant that all timing data is rendered meaningless.\n");
176-
OMNITRACE_PRINT_F(
177-
"However, mutex locking is effectively non-existant in timemory.\n");
178-
OMNITRACE_PRINT_F("If you want to trace the mutex locking:\n")
179-
OMNITRACE_PRINT_F(" OMNITRACE_USE_TIMEMORY=ON\n");
180-
OMNITRACE_PRINT_F(" OMNITRACE_USE_PERFETTO=OFF\n");
181-
OMNITRACE_PRINT_F("\n");
182-
OMNITRACE_PRINT_F("\n");
183-
OMNITRACE_PRINT_F("\n");
184-
OMNITRACE_FAIL_F("OMNITRACE_USE_PERFETTO and OMNITRACE_TRACE_THREAD_LOCKS cannot "
185-
"both be enabled.\n");
186-
}
187-
}
188-
189163
pthread_mutex_gotcha::pthread_mutex_gotcha(const gotcha_data_t& _data)
190164
: m_data{ &_data }
191165
{}
@@ -290,9 +264,9 @@ pthread_mutex_gotcha::operator()(int (*_callee)(pthread_t, void**), pthread_t _t
290264
bool
291265
pthread_mutex_gotcha::is_disabled()
292266
{
293-
return (get_state() != ::omnitrace::State::Active ||
294-
get_thread_state() != ThreadState::Enabled ||
295-
(get_use_sampling() && !sampling_enabled_on_child_threads()));
267+
static thread_local const auto& _info = thread_info::get();
268+
return (!_info || _info->is_offset || get_state() != ::omnitrace::State::Active ||
269+
get_thread_state() != ThreadState::Enabled);
296270
}
297271
} // namespace component
298272
} // namespace omnitrace

source/lib/omnitrace/library/components/pthread_mutex_gotcha.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ struct pthread_mutex_gotcha : comp::base<pthread_mutex_gotcha, void>
5454
// generate the gotcha wrappers
5555
static void configure();
5656
static void shutdown();
57-
static void validate();
5857

5958
int operator()(int (*)(pthread_mutex_t*), pthread_mutex_t*) const;
6059
int operator()(int (*)(pthread_spinlock_t*), pthread_spinlock_t*) const;

source/lib/omnitrace/library/config.cpp

Lines changed: 0 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -491,29 +491,6 @@ configure_settings(bool _init)
491491
std::to_string(_sigrt_range),
492492
0, "sampling", "advanced");
493493

494-
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_FLAT_SAMPLING",
495-
"Ignore hierarchy in all statistical sampling entries",
496-
_config->get_flat_profile(), "timemory", "sampling",
497-
"data_layout", "advanced");
498-
499-
OMNITRACE_CONFIG_SETTING(
500-
bool, "OMNITRACE_TIMELINE_SAMPLING",
501-
"Create unique entries for every sample when statistical sampling is enabled",
502-
_config->get_timeline_profile(), "timemory", "sampling", "data_layout",
503-
"advanced");
504-
505-
OMNITRACE_CONFIG_SETTING(
506-
bool, "OMNITRACE_ROCTRACER_FLAT_PROFILE",
507-
"Ignore hierarchy in all kernels entries with timemory backend",
508-
_config->get_flat_profile(), "timemory", "roctracer", "data_layout", "rocm",
509-
"advanced");
510-
511-
OMNITRACE_CONFIG_SETTING(
512-
bool, "OMNITRACE_ROCTRACER_TIMELINE_PROFILE",
513-
"Create unique entries for every kernel with timemory backend",
514-
_config->get_timeline_profile(), "timemory", "roctracer", "data_layout", "rocm",
515-
"advanced");
516-
517494
OMNITRACE_CONFIG_SETTING(bool, "OMNITRACE_ROCTRACER_HSA_ACTIVITY",
518495
"Enable HSA activity tracing support", true, "roctracer",
519496
"rocm", "advanced");
@@ -1615,34 +1592,6 @@ get_sampling_rtoffset()
16151592
return static_cast<tim::tsettings<int>&>(*_v->second).get();
16161593
}
16171594

1618-
bool
1619-
get_timeline_sampling()
1620-
{
1621-
static auto _v = get_config()->find("OMNITRACE_TIMELINE_SAMPLING");
1622-
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
1623-
}
1624-
1625-
bool
1626-
get_flat_sampling()
1627-
{
1628-
static auto _v = get_config()->find("OMNITRACE_FLAT_SAMPLING");
1629-
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
1630-
}
1631-
1632-
bool
1633-
get_roctracer_timeline_profile()
1634-
{
1635-
static auto _v = get_config()->find("OMNITRACE_ROCTRACER_TIMELINE_PROFILE");
1636-
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
1637-
}
1638-
1639-
bool
1640-
get_roctracer_flat_profile()
1641-
{
1642-
static auto _v = get_config()->find("OMNITRACE_ROCTRACER_FLAT_PROFILE");
1643-
return static_cast<tim::tsettings<bool>&>(*_v->second).get();
1644-
}
1645-
16461595
bool
16471596
get_trace_hsa_api()
16481597
{

source/lib/omnitrace/library/config.hpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -219,18 +219,6 @@ get_sampling_rtoffset();
219219
bool
220220
get_use_rcclp();
221221

222-
bool
223-
get_timeline_sampling();
224-
225-
bool
226-
get_flat_sampling();
227-
228-
bool
229-
get_roctracer_timeline_profile();
230-
231-
bool
232-
get_roctracer_flat_profile();
233-
234222
bool
235223
get_trace_hsa_api();
236224

source/lib/omnitrace/library/roctracer.cpp

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -238,12 +238,6 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
238238
(data->phase == ACTIVITY_API_PHASE_ENTER) ? "on-enter" : "on-exit");
239239

240240
static thread_local int64_t begin_timestamp = 0;
241-
static auto _scope = []() {
242-
auto _v = scope::config{};
243-
if(get_roctracer_timeline_profile()) _v += scope::timeline{};
244-
if(get_roctracer_flat_profile()) _v += scope::flat{};
245-
return _v;
246-
}();
247241

248242
switch(cid)
249243
{
@@ -320,7 +314,7 @@ hsa_api_callback(uint32_t domain, uint32_t cid, const void* callback_data, void*
320314
if(tasking::roctracer::get_task_group().pool())
321315
tasking::roctracer::get_task_group().exec(
322316
[_name, _beg_ns, _end_ns]() {
323-
roctracer_hsa_bundle_t _bundle{ _name, _scope };
317+
roctracer_hsa_bundle_t _bundle{ _name };
324318
_bundle.start()
325319
.store(std::plus<double>{},
326320
static_cast<double>(_end_ns - _beg_ns))
@@ -374,14 +368,8 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
374368

375369
if(!_name) return;
376370

377-
auto _beg_ns = record->begin_ns + get_clock_skew();
378-
auto _end_ns = record->end_ns + get_clock_skew();
379-
static auto _scope = []() {
380-
auto _v = scope::config{};
381-
if(get_roctracer_timeline_profile()) _v += scope::timeline{};
382-
if(get_roctracer_flat_profile()) _v += scope::flat{};
383-
return _v;
384-
}();
371+
auto _beg_ns = record->begin_ns + get_clock_skew();
372+
auto _end_ns = record->end_ns + get_clock_skew();
385373

386374
if(get_use_perfetto())
387375
{
@@ -394,7 +382,7 @@ hsa_activity_callback(uint32_t op, activity_record_t* record, void* arg)
394382
auto _func = [_beg_ns, _end_ns, _name]() {
395383
if(get_use_timemory())
396384
{
397-
roctracer_hsa_bundle_t _bundle{ *_name, _scope };
385+
roctracer_hsa_bundle_t _bundle{ *_name };
398386
_bundle.start()
399387
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
400388
.stop();
@@ -836,16 +824,10 @@ hip_activity_callback(const char* begin, const char* end, void*)
836824

837825
const char* op_name =
838826
roctracer_op_string(record->domain, record->op, record->kind);
839-
auto _ns_skew = get_clock_skew();
840-
uint64_t _beg_ns = record->begin_ns + _ns_skew;
841-
uint64_t _end_ns = record->end_ns + _ns_skew;
842-
auto _corr_id = record->correlation_id;
843-
static auto _scope = []() {
844-
auto _v = scope::config{};
845-
if(get_roctracer_timeline_profile()) _v += scope::timeline{};
846-
if(get_roctracer_flat_profile()) _v += scope::flat{};
847-
return _v;
848-
}();
827+
auto _ns_skew = get_clock_skew();
828+
uint64_t _beg_ns = record->begin_ns + _ns_skew;
829+
uint64_t _end_ns = record->end_ns + _ns_skew;
830+
auto _corr_id = record->correlation_id;
849831

850832
auto& _keys = get_roctracer_key_data();
851833
auto& _tids = get_roctracer_tid_data();
@@ -936,7 +918,7 @@ hip_activity_callback(const char* begin, const char* end, void*)
936918
if(_found && _name != nullptr && get_use_timemory())
937919
{
938920
auto _func = [_beg_ns, _end_ns, _name]() {
939-
roctracer_bundle_t _bundle{ _name, _scope };
921+
roctracer_bundle_t _bundle{ _name };
940922
_bundle.start()
941923
.store(std::plus<double>{}, static_cast<double>(_end_ns - _beg_ns))
942924
.stop()

0 commit comments

Comments
 (0)