Skip to content

Added minimal support to do some timing of OM Runtime functionality #3095

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 38 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
ee16dee
merge from remote branch
AlexandreEichenberger Dec 19, 2024
5b6b918
added files
AlexandreEichenberger Dec 19, 2024
5e7e21f
fix tests
AlexandreEichenberger Dec 19, 2024
97d871a
update
AlexandreEichenberger Dec 20, 2024
903fcb4
update
AlexandreEichenberger Jan 9, 2025
0cb084d
update
AlexandreEichenberger Jan 9, 2025
7cc9a95
update
AlexandreEichenberger Jan 31, 2025
fb39de3
update
AlexandreEichenberger Feb 3, 2025
122c804
update
AlexandreEichenberger Feb 3, 2025
a4e3d3b
update
AlexandreEichenberger Feb 4, 2025
3092abb
update
AlexandreEichenberger Feb 4, 2025
e6806ee
update
AlexandreEichenberger Feb 6, 2025
2e1310e
update
AlexandreEichenberger Feb 7, 2025
bd52017
update
AlexandreEichenberger Feb 10, 2025
b98f5b2
update
AlexandreEichenberger Feb 13, 2025
b91b36d
update
AlexandreEichenberger Feb 14, 2025
706bd81
update
AlexandreEichenberger Feb 17, 2025
f60b856
update
AlexandreEichenberger Feb 19, 2025
14c98ff
update
AlexandreEichenberger Mar 3, 2025
e423712
update
AlexandreEichenberger Mar 5, 2025
f04f00c
update
AlexandreEichenberger Mar 7, 2025
9c9da7f
update
AlexandreEichenberger Mar 13, 2025
cae8d3c
added support for driver timing
AlexandreEichenberger Mar 13, 2025
a91e59e
update
AlexandreEichenberger Mar 13, 2025
16836b7
update
AlexandreEichenberger Mar 13, 2025
cd47c79
disable by default
AlexandreEichenberger Mar 13, 2025
76e957f
updates
AlexandreEichenberger Mar 14, 2025
3ae2950
add custom free
AlexandreEichenberger Mar 14, 2025
53f0139
cleanup of code
AlexandreEichenberger Mar 14, 2025
e1201dc
added old for value comparison
AlexandreEichenberger Mar 14, 2025
efd696a
added old for value comparison
AlexandreEichenberger Mar 14, 2025
bc9f2bb
update
AlexandreEichenberger Mar 14, 2025
6f8e36e
update
AlexandreEichenberger Mar 14, 2025
a890af6
remove printout for debugging
AlexandreEichenberger Mar 14, 2025
e8fd828
disable timing by default
AlexandreEichenberger Mar 14, 2025
04476a5
fix case where we return a constant
AlexandreEichenberger Mar 14, 2025
b0b24ef
leave copy for the constant output case (rare)
AlexandreEichenberger Mar 17, 2025
eaa85ed
responce to comments
AlexandreEichenberger Mar 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions include/onnx-mlir/Runtime/OMTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,18 @@ OM_EXTERNAL_VISIBILITY void omTensorDestroy(OMTensor *tensor);
*/
OM_EXTERNAL_VISIBILITY void *omTensorGetDataPtr(const OMTensor *tensor);

/**
* \brief OMTensor allocated data pointer getter.
*
* @param tensor pointer to the OMTensor
* @return pointer to the allocated memory buffer of the OMTensor,
* This should only be used when needing to create
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two needing.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tx, added some additional comments on the difference between data and allocated pointers.

* cross-language operation to free OMTensor buffers allocated by the
* model and freed in a different language environment.
* NULL if the numerical data buffer is not set.
*/
OM_EXTERNAL_VISIBILITY void *omTensorGetAllocatedPtr(const OMTensor *tensor);

/**
* \brief OMTensor data shape getter.
*
Expand Down
16 changes: 11 additions & 5 deletions src/Runtime/OMInstrument.inc
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
* SPDX-License-Identifier: Apache-2.0
*/

//===--------- OMTensor.inc - C/C++ Neutral OMTensor Implementation--------===//
//===--- OMInstrument.inc - C/C++ Neutral Instrumentation Implementation---===//
//
// Copyright 2019-2020 The IBM Research Authors.
// Copyright 2019-2025 The IBM Research Authors.
//
// =============================================================================
//
// This file contains implementations of OMTensor data structures
// and helper functions.
// This file contains implementations of the OMInstrument calls.
//
//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -67,6 +66,12 @@ static char instrumentReportOpName[INSTRUMENT_OP_NAME_MASK + 1];
static char instrumentReportNodeName[INSTRUMENT_NODE_NAME_MASK + 1];
static FILE *fout = 0;

// Shared state for the timing macros declared in OMInstrumentHelper.h.
// timing_nest_level: current nesting depth of timers; TIMING_START increments
// it and TIMING_STOP decrements it.
// timing_nest_strings: prefix strings selected by TIMING_PRINT using a timer's
// nesting level (clamped to the last entry, index 5).
int timing_nest_level = 0;
char timing_nest_strings[6][20] = {
"", " ", " ", " ", " ", " "};

#ifdef __MVS__
#define timersub(a, b, result) \
do { \
Expand Down Expand Up @@ -146,7 +151,8 @@ static void ReportMemory() {
char memOutput[200];
FILE *memPipe;
mypid = getpid();
int num_chars_written = snprintf(memCommand, sizeof(memCommand), "ps -o vsz='' -p %d", mypid);
int num_chars_written =
snprintf(memCommand, sizeof(memCommand), "ps -o vsz='' -p %d", mypid);
assert(num_chars_written >= 0 && "snprintf write error to memCommand");
memPipe = popen(memCommand, "r");
if (!memPipe) {
Expand Down
108 changes: 108 additions & 0 deletions src/Runtime/OMInstrumentHelper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* SPDX-License-Identifier: Apache-2.0
*/

//===--------- OMInstrumentHelper.h - Helper for Instrumentation ----------===//
//
// Copyright 2019-2025 The IBM Research Authors.
//
// =============================================================================
//
// This file contains helpers for gathering timing, enabled by setting:
//
// #define OM_DRIVER_TIMING 1
// #include "src/Runtime/OMInstrumentHelper.h"
//
// When not defined, all macros are empty, i.e., they generate no code and
// therefore add no overhead.
//
// In Linux: uses gettimeofday and timersub.
//
// TIMING_INIT(var) defines a timing var (context must hold for all timing
// operations).
// TIMING_START(var) starts the timer named var. TIMING_STOP(var)
// adds to the timer named var the difference between now and the last start.
// TIMING_PRINT(var) prints the cumulative time of timer named var.
//
// TIMING_INIT_START does both init and start.
// TIMING_STOP_PRINT does both stop and print.
//
//===----------------------------------------------------------------------===//

#ifndef OM_INSTRUMENT_HELPER_H
#define OM_INSTRUMENT_HELPER_H 1

// Set to 1 to disable all timing regardless of other flags.
#define OM_DRIVER_TIMING_DISABLE_ALL 1 /* 1 (unless when debugging perf) */

//===----------------------------------------------------------------------===//
// Timing support for MVS

#ifdef __MVS__
/*
 * Fallback definition of timersub, compiled only under __MVS__.
 * Computes *result = *a - *b for operands exposing tv_sec / tv_usec fields,
 * borrowing one second whenever the microsecond difference is negative.
 *
 * The expansion deliberately has no trailing ';' so that the macro behaves
 * like a single statement: the caller supplies the ';', which keeps
 * `if (c) timersub(x, y, z); else ...` well formed.
 */
#define timersub(a, b, result)                                                 \
  do {                                                                         \
    (result)->tv_sec = (a)->tv_sec - (b)->tv_sec;                              \
    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec;                           \
    if ((result)->tv_usec < 0) {                                               \
      --(result)->tv_sec;                                                      \
      (result)->tv_usec += 1000000;                                            \
    }                                                                          \
  } while (0)
#endif

//===----------------------------------------------------------------------===//
// Timing functions

#if OM_DRIVER_TIMING && !OM_DRIVER_TIMING_DISABLE_ALL
#include <stdio.h>
#include <sys/time.h>

// Global variable to help OMInstrumentHelper.h to keep track of nesting level
// of timing operations.
extern int timing_nest_level;
extern char timing_nest_strings[6][20];

/*
 * TIMING_INIT(name): declares, in the current scope, the state of one timer:
 *   name             - cumulative time, starting at zero;
 *   name_tmp         - time stamp slot written by TIMING_START;
 *   name_nest_level  - nesting level, -1 until the timer is first started.
 * The expansion ends with ';', so no ';' is required at the call site.
 */
#define TIMING_INIT(_var_name)                                                 \
  /* Zeroed accumulator plus the slot for the latest start stamp. */           \
  struct timeval _var_name = {0, 0}, _var_name##_tmp;                          \
  /* Negative level means "never started". */                                  \
  int _var_name##_nest_level = -1;

/*
 * TIMING_START(name): starts (or restarts) the timer declared by
 * TIMING_INIT(name). Remembers the nesting level at which the timer was
 * started, bumps the global nesting level, and stamps the current time into
 * name_tmp. The expansion ends with ';'.
 */
#define TIMING_START(_var_name)                                                \
  /* Capture the level on entry, then go one level deeper. */                  \
  _var_name##_nest_level = timing_nest_level++;                                \
  gettimeofday(&_var_name##_tmp, NULL);

/*
 * TIMING_STOP(name): adds the time elapsed since the last TIMING_START(name)
 * to the cumulative timer `name` and leaves one nesting level.
 *
 * The elapsed time is computed inline (stop stamp minus name_tmp) and the
 * accumulator is normalized afterwards so that tv_usec always stays within
 * [0, 1000000); without the carry, repeated start/stop cycles would let
 * tv_usec grow past one second and corrupt the "%ld.%06ld" output of
 * TIMING_PRINT. The scratch variable is derived from the timer name so it
 * cannot shadow a timer that happens to be called e.g. start_time.
 */
#define TIMING_STOP(_var_name)                                                 \
  { /* Scratch variable lives in its own scope. */                             \
    struct timeval _var_name##_stop;                                           \
    gettimeofday(&_var_name##_stop, NULL);                                     \
    _var_name.tv_sec += _var_name##_stop.tv_sec - _var_name##_tmp.tv_sec;      \
    _var_name.tv_usec += _var_name##_stop.tv_usec - _var_name##_tmp.tv_usec;   \
    /* Sum lies in (-1000000, 2000000): one borrow or one carry suffices. */   \
    if (_var_name.tv_usec < 0) {                                               \
      --_var_name.tv_sec;                                                      \
      _var_name.tv_usec += 1000000;                                            \
    } else if (_var_name.tv_usec >= 1000000) {                                 \
      ++_var_name.tv_sec;                                                      \
      _var_name.tv_usec -= 1000000;                                            \
    }                                                                          \
    --timing_nest_level;                                                       \
  }

#define TIMING_PRINT(_var_name) \
if (_var_name##_nest_level >= 0) { /* was started at least once */ \
int l = _var_name##_nest_level <= 5 ? _var_name##_nest_level : 5; \
fprintf(stderr, "@OM_DRIVER, %s%s, %ld.%06ld\n", timing_nest_strings[l], \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can avoid using timing_nest_string by using %* to print out "" l times, e.g.

fprintf(stderr, "@OM_DRIVER, %*s%s, %ld.%06ld\n", l, "",
           #_var_name, (long int)_var_name.tv_sec, (long int)_var_name.tv_usec);

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will give this pattern a try.

#_var_name, (long int)_var_name.tv_sec, (long int)_var_name.tv_usec); \
}

#else
// Timing is disabled (OM_DRIVER_TIMING unset/zero, or
// OM_DRIVER_TIMING_DISABLE_ALL set): every macro expands to nothing, so the
// call sites generate no code.
#define TIMING_INIT(_var_name)
#define TIMING_START(_var_name)
#define TIMING_STOP(_var_name)
#define TIMING_PRINT(_var_name)
#endif

// Combined calls.
// TIMING_INIT_START(name): declare the timer `name` in the current scope and
// start it right away (TIMING_INIT followed by TIMING_START).
#define TIMING_INIT_START(_var_name) \
TIMING_INIT(_var_name) TIMING_START(_var_name)
// TIMING_STOP_PRINT(name): stop the timer `name`, then print its cumulative
// time (TIMING_STOP followed by TIMING_PRINT).
#define TIMING_STOP_PRINT(_var_name) \
TIMING_STOP(_var_name) TIMING_PRINT(_var_name)

#endif
30 changes: 16 additions & 14 deletions src/Runtime/OMTensor.inc
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,13 @@ int *__attribute__((__weak__)) __errno_location(void) { return &errno; }
#ifdef ENABLE_PYRUNTIME_LIGHT
// The implementation depends on src/Utility and llvm. Will be solved in
// another PR. Here are just dummy definitions for compilation
float om_f16_to_f32(uint16_t a) {
return (float) 0;
}
float om_f16_to_f32(uint16_t a) { return (float)0; }

uint16_t om_f32_to_f16(uint16_t a) {
return (uint16_t) 0;
}
uint16_t om_f32_to_f16(uint16_t a) { return (uint16_t)0; }
#endif


#define OM_DRIVER_TIMING 1
#include "src/Runtime/OMInstrumentHelper.h"

// On some platforms LLVM generates f16 conversion code that calls some
// of these C runtime functions.
Expand Down Expand Up @@ -267,11 +264,15 @@ void omTensorDestroy(OMTensor *tensor) {
if (!tensor)
return;
if (tensor->_owning) {
TIMING_INIT_START(tensor_destroy_ptr);
free(tensor->_allocatedPtr);
TIMING_STOP_PRINT(tensor_destroy_ptr);
}
TIMING_INIT_START(tensor_destroy_struct);
free(tensor->_shape);
free(tensor->_strides);
free(tensor);
TIMING_STOP_PRINT(tensor_destroy_struct);
}

/* OMTensor data getter */
Expand Down Expand Up @@ -496,7 +497,7 @@ static void printData(FILE *fout, const OMTensor *tensor) {
/* Helper macros to print data for 1-4D tensors */
#define LOOP_1(INDEX, IV, UB) \
fprintf(fout, "["); \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
if (IV) \
fprintf(fout, ", "); \
indexes[(INDEX)] = (IV); \
Expand All @@ -507,7 +508,7 @@ static void printData(FILE *fout, const OMTensor *tensor) {

#define LOOP_2(INDEX, IV, UB, ...) \
fprintf(fout, "["); \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
if (IV) \
fprintf(fout, ", "); \
indexes[(INDEX)] = (IV); \
Expand All @@ -517,7 +518,7 @@ static void printData(FILE *fout, const OMTensor *tensor) {

#define LOOP_3(INDEX, IV, UB, ...) \
fprintf(fout, "["); \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
if (IV) \
fprintf(fout, ", "); \
indexes[(INDEX)] = (IV); \
Expand All @@ -527,7 +528,7 @@ static void printData(FILE *fout, const OMTensor *tensor) {

#define LOOP_4(INDEX, IV, UB, ...) \
fprintf(fout, "["); \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
if (IV) \
fprintf(fout, ", "); \
indexes[(INDEX)] = (IV); \
Expand All @@ -537,7 +538,7 @@ static void printData(FILE *fout, const OMTensor *tensor) {

#define LOOP_5(INDEX, IV, UB, ...) \
fprintf(fout, "["); \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
if (IV) \
fprintf(fout, ", "); \
indexes[(INDEX)] = (IV); \
Expand All @@ -547,7 +548,7 @@ static void printData(FILE *fout, const OMTensor *tensor) {

#define LOOP_6(INDEX, IV, UB, ...) \
fprintf(fout, "["); \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
for (int64_t IV = 0; (IV) < (UB); ++(IV)) { \
if (IV) \
fprintf(fout, ", "); \
indexes[(INDEX)] = (IV); \
Expand Down Expand Up @@ -587,7 +588,8 @@ static void printData(FILE *fout, const OMTensor *tensor) {
} break;
case 6: {
int64_t indexes[6];
LOOP_6(0, i, shape[0], j, shape[1], k, shape[2], l, shape[3], m, shape[4], n, shape[5])
LOOP_6(0, i, shape[0], j, shape[1], k, shape[2], l, shape[3], m, shape[4],
n, shape[5])
} break;
default:
assert(false && "not implemented");
Expand Down
7 changes: 6 additions & 1 deletion src/Runtime/OMTensorList.inc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include "onnx-mlir/Runtime/OMTensorList.h"
#include <string.h>

#define OM_DRIVER_TIMING 1
#include "src/Runtime/OMInstrumentHelper.h"

struct OMTensorList {
#ifdef __cplusplus
Expand Down Expand Up @@ -74,6 +76,7 @@ OMTensorList *omTensorListCreate(OMTensor **tensors, int64_t n) {
if (!list)
return NULL;

TIMING_INIT_START(tensor_list_create)
size_t omts_bytes = sizeof(OMTensor *) * n;
list->_size = n;
list->_omts = (OMTensor **)malloc(omts_bytes);
Expand All @@ -84,18 +87,20 @@ OMTensorList *omTensorListCreate(OMTensor **tensors, int64_t n) {
}
// Copy the given OMTensors pointers to an array owned by OMTensorList
memcpy(list->_omts, tensors, omts_bytes);

TIMING_STOP_PRINT(tensor_list_create);
return list;
}

/* OMTensorList destroyer */
void omTensorListDestroy(OMTensorList *list) {
  // A NULL list has nothing to release.
  if (list == NULL)
    return;
  TIMING_INIT_START(tensor_list_destroy)
  // First destroy each tensor referenced by the list.
  int64_t idx = 0;
  while (idx < list->_size) {
    omTensorDestroy(list->_omts[idx]);
    ++idx;
  }
  // Then release the list itself along with its array of OMTensor pointers.
  omTensorListDestroyShallow(list);
  TIMING_STOP_PRINT(tensor_list_destroy)
}

/* OMTensorList destroyer which does not destroy the tensors.
Expand Down
2 changes: 1 addition & 1 deletion src/Runtime/python/PyCompileAndRuntime.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Copyright 2021-2024 The IBM Research Authors.
#
################################################################################
# commom class `PyOMRuntime` called by python scripts
# Common class `PyOMRuntime` called by python scripts
################################################################################
import numpy as np

Expand Down
Loading
Loading