
Commit 1d8279d

Reduce qualx logging noise (#1603)
This PR fixes #1535 by reducing logging noise from qualx: most of the noisy log messages are demoted from info/warning to debug level.

Signed-off-by: Lee Yang <[email protected]>
1 parent 9df6fa3 · commit 1d8279d
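For anyone who still wants to see these messages after this change, here is a minimal sketch (not part of this commit) of raising the relevant logger to DEBUG from a calling script. It assumes the qualx modules obtain their loggers under the spark_rapids_tools package hierarchy (e.g. via logging.getLogger(__name__)); the exact logger name below is an assumption for illustration only.

import logging

# Assumed logger name for illustration; qualx modules are expected to log under a
# 'spark_rapids_tools...' hierarchy if they use logging.getLogger(__name__).
logging.basicConfig(format='%(asctime)s %(levelname)s %(name)s: %(message)s')
logging.getLogger('spark_rapids_tools.tools.qualx').setLevel(logging.DEBUG)

With a configuration like this, the messages demoted to logger.debug(...) in this commit would still be emitted, while the default configuration keeps them hidden.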

4 files changed: +30 -29 lines changed

user_tools/src/spark_rapids_tools/tools/qualx/model.py (+5 -5)

@@ -77,7 +77,7 @@ def train(
         drop=True
     )
     if cpu_aug_tbl.shape[0] < original_num_rows:
-        logger.warning(
+        logger.debug(
             'Removed %d rows with NaN label values', original_num_rows - cpu_aug_tbl.shape[0]
         )

@@ -156,7 +156,7 @@ def predict(
     if missing:
         raise ValueError(f'Input is missing model features: {missing}')
     if extra:
-        logger.warning('Input had extra features not present in model: %s', extra)
+        logger.debug('Input had extra features not present in model: %s', extra)

     x = cpu_aug_tbl[model_features]
     y = cpu_aug_tbl[label_col] if label_col else None
@@ -248,7 +248,7 @@ def extract_model_features(
     gpu_aug_tbl = df[df['runType'] == 'GPU']
     if gpu_aug_tbl.shape[0] > 0:
         if gpu_aug_tbl.shape[0] != cpu_aug_tbl.shape[0]:
-            logger.warning(
+            logger.debug(
                 'Number of GPU rows (%d) does not match number of CPU rows (%d)',
                 gpu_aug_tbl.shape[0],
                 cpu_aug_tbl.shape[0],
@@ -276,7 +276,7 @@ def extract_model_features(
         if (
             num_na / num_rows > 0.05
         ): # arbitrary threshold, misaligned sqlIDs still may 'match' most of the time
-            logger.warning(
+            logger.debug(
                 'Percentage of NaN GPU durations is high: %d / %d. Per-sql actual speedups may be inaccurate.',
                 num_na,
                 num_rows,
@@ -315,7 +315,7 @@ def extract_model_features(
         raise ValueError(f'Input data is missing model features: {missing}')
     if extra:
         # remove extra columns
-        logger.warning('Input data has extra features (removed): %s', extra)
+        logger.debug('Input data has extra features (removed): %s', extra)
         feature_cols = [c for c in feature_cols if c not in extra]

     # add train/val/test split column, if split function(s) provided

user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py (+9 -8)

@@ -760,19 +760,19 @@ def impute(full_tbl: pd.DataFrame) -> pd.DataFrame:
     """Impute missing columns and delete extra columns."""
     actual_features = set(full_tbl.columns)
     if actual_features == expected_raw_features:
-        logger.info('Dataset has all expected features')
+        logger.debug('Dataset has all expected features')
     else:
         missing = sorted(expected_raw_features - actual_features)
         extra = sorted(actual_features - expected_raw_features)
         if missing:
-            logger.warning('Imputing missing features: %s', missing)
+            logger.debug('Imputing missing features: %s', missing)
             if 'fraction_supported' in missing:
                 full_tbl['fraction_supported'] = 1.0
                 missing.remove('fraction_supported')
             full_tbl.loc[:, missing] = 0

         if extra:
-            logger.warning('Removing extra features: %s', extra)
+            logger.debug('Removing extra features: %s', extra)
             full_tbl = full_tbl.drop(columns=extra)

     # one last check after modifications (update expected_raw_features if needed)
@@ -806,7 +806,7 @@ def scan_tbl(
         )
     except Exception as ex: # pylint: disable=broad-except
         if warn_on_error or abort_on_error:
-            logger.warning('Failed to load %s for %s.', tb_name, app_id)
+            logger.debug('Failed to load %s for %s.', tb_name, app_id)
         if abort_on_error:
             raise ScanTblError() from ex
         scan_result = pd.DataFrame()
@@ -1030,7 +1030,6 @@ def scan_tbl(
     stage_times = total_stage_time.merge(
         failed_stage_time, on='sqlID', how='inner'
     )
-    stage_times.info()
     sqls_to_drop = set(
         stage_times.loc[
             stage_times.Duration_y
@@ -1039,7 +1038,7 @@ def scan_tbl(
     )

     if sqls_to_drop:
-        logger.warning('Ignoring sqlIDs %s due to excessive failed/cancelled stage duration.', sqls_to_drop)
+        logger.debug('Ignoring sqlIDs %s due to excessive failed/cancelled stage duration.', sqls_to_drop)

     if node_level_supp is not None and (qualtool_filter == 'stage'):
         job_stage_agg_tbl = job_stage_agg_tbl[
@@ -1118,13 +1117,15 @@ def scan_tbl(
     aborted_sql_ids = set()

     if aborted_sql_ids:
-        logger.warning('Ignoring sqlIDs %s due to aborted jobs.', aborted_sql_ids)
+        logger.debug('Ignoring sqlIDs %s due to aborted jobs.', aborted_sql_ids)

     sqls_to_drop = sqls_to_drop.union(aborted_sql_ids)

     if sqls_to_drop:
         logger.warning(
-            'Ignoring a total of %s sqlIDs due to stage/job failures.', len(sqls_to_drop)
+            'Ignoring a total of %s sqlIDs due to stage/job failures for %s.',
+            len(sqls_to_drop),
+            app_id
         )
     app_info_mg = app_info_mg.loc[~app_info_mg.sqlID.isin(sqls_to_drop)]

user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py (+12 -12)

@@ -146,7 +146,7 @@ def _get_model(platform: str,
         xgb.Booster model file.
     """
     model_path = _get_model_path(platform, model, variant)
-    logger.info('Loading model from: %s', model_path)
+    logger.debug('Loading model from: %s', model_path)
     xgb_model = xgb.Booster()
     xgb_model.load_model(model_path)
     return xgb_model
@@ -290,7 +290,7 @@ def _predict(
         if any(input_df['fraction_supported'] != 1.0)
         else 'raw'
     )
-    logger.info('Predicting dataset (%s): %s', filter_str, dataset)
+    logger.debug('Predicting dataset (%s): %s', filter_str, dataset)
     features, feature_cols, label_col = extract_model_features(input_df, {'default': split_fn})
     # note: dataset name is already stored in the 'appName' field
     try:
@@ -345,7 +345,7 @@ def _read_dataset_scores(
             nan_df['model'] + '/' + nan_df['platform'] + '/' + nan_df['dataset']
         )
         keys = list(nan_df['key'].unique())
-        logger.warning('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)
+        logger.debug('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)

     return df

@@ -395,7 +395,7 @@ def _read_platform_scores(
             nan_df['model'] + '/' + nan_df['platform'] + '/' + nan_df['dataset']
         )
         keys = list(nan_df['key'].unique())
-        logger.warning('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)
+        logger.debug('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)

     # compute accuracy by platform
     scores = {}
@@ -507,7 +507,7 @@ def train(
     for ds_name, ds_meta in datasets.items():
         if 'split_function' in ds_meta:
             plugin_path = ds_meta['split_function']
-            logger.info('Using split function for %s dataset from plugin: %s', ds_name, plugin_path)
+            logger.debug('Using split function for %s dataset from plugin: %s', ds_name, plugin_path)
             plugin = load_plugin(plugin_path)
             split_functions[ds_name] = plugin.split_function

@@ -613,7 +613,7 @@ def predict(
         'platform': platform,
     }

-    logger.info('Loading dataset: %s', dataset_name)
+    logger.debug('Loading dataset: %s', dataset_name)
     profile_df = load_profiles(
         datasets=datasets,
         node_level_supp=node_level_supp,
@@ -655,7 +655,7 @@ def predict(
         if node_level_supp is not None and any(profile_df['fraction_supported'] != 1.0)
         else 'raw'
     )
-    logger.info('Predicting dataset (%s): %s', filter_str, dataset_name)
+    logger.debug('Predicting dataset (%s): %s', filter_str, dataset_name)

     try:
         features_list = []
@@ -684,17 +684,17 @@ def predict(
     if output_info:
         # save features for troubleshooting
         output_file = output_info['features']['path']
-        logger.info('Writing features to: %s', output_file)
+        logger.debug('Writing features to: %s', output_file)
         features.to_csv(output_file, index=False)

         feature_importance, shapley_values = compute_shapley_values(xgb_model, features)

         output_file = output_info['featureImportance']['path']
-        logger.info('Writing shapley feature importances to: %s', output_file)
+        logger.debug('Writing shapley feature importances to: %s', output_file)
         feature_importance.to_csv(output_file)

         output_file = output_info['shapValues']['path']
-        logger.info('Writing shapley values to: %s', output_file)
+        logger.debug('Writing shapley values to: %s', output_file)
         shapley_values.to_csv(output_file, index=False)

         # compute per-app speedups
@@ -853,10 +853,10 @@ def evaluate(
         plugin = load_plugin(plugin_path)
         split_fn = plugin.split_function

-    logger.info('Loading qualification tool CSV files.')
+    logger.debug('Loading qualification tool CSV files.')
     node_level_supp, qual_tool_output, _ = _get_qual_data(qual_dir)

-    logger.info('Loading profiler tool CSV files.')
+    logger.debug('Loading profiler tool CSV files.')
     profile_df = load_profiles(datasets, profile_dir) # w/ GPU rows
     filtered_profile_df = load_profiles(
         datasets, profile_dir, node_level_supp, qual_tool_filter, qual_tool_output

user_tools/src/spark_rapids_tools/tools/qualx/util.py (+4 -4)

@@ -364,8 +364,8 @@ def run_command(command: str) -> subprocess.CompletedProcess:
     try:
         result = future.result()
         logger.debug('Command completed: %s', command)
-        logger.info(result.stdout)
-        logger.info(result.stderr)
+        logger.debug(result.stdout)
+        logger.debug(result.stderr)
     except Exception as e: # pylint: disable=broad-except
         logger.error('Command failed: %s', command)
         logger.error(e)
@@ -435,15 +435,15 @@ def write_csv_reports(per_sql: pd.DataFrame, per_app: pd.DataFrame, output_info:
     try:
         if per_sql is not None:
             sql_predictions_path = output_info['perSql']['path']
-            logger.info('Writing per-SQL predictions to: %s', sql_predictions_path)
+            logger.debug('Writing per-SQL predictions to: %s', sql_predictions_path)
             per_sql.to_csv(sql_predictions_path)
     except Exception as e: # pylint: disable=broad-except
         logger.error('Error writing per-SQL predictions. Reason: %s', e)

     try:
         if per_app is not None:
             app_predictions_path = output_info['perApp']['path']
-            logger.info('Writing per-application predictions to: %s', app_predictions_path)
+            logger.debug('Writing per-application predictions to: %s', app_predictions_path)
             per_app.to_csv(app_predictions_path)
     except Exception as e: # pylint: disable=broad-except
         logger.error('Error writing per-app predictions. Reason: %s', e)
