Reduce qualx logging noise #1603

Merged
1 commit merged on Apr 1, 2025
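This change demotes a number of routine `logger.warning` and `logger.info` calls in the qualx modules (`model.py`, `preprocess.py`, `qualx_main.py`, `util.py`) to `logger.debug`, removes a `stage_times.info()` call that printed a DataFrame summary on every run, and makes one remaining warning more specific by including the app ID. The demoted messages are still available at debug level. As a minimal sketch of how to surface them again (assuming the qualx modules obtain their loggers with module-qualified names via the standard `logging` package; the project's actual logging setup may differ):

```python
import logging

# Hypothetical: the logger names are assumed to follow the module paths touched
# in this PR; adjust if the project configures logging differently.
logging.basicConfig(level=logging.INFO)
logging.getLogger('spark_rapids_tools.tools.qualx').setLevel(logging.DEBUG)
```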
10 changes: 5 additions & 5 deletions user_tools/src/spark_rapids_tools/tools/qualx/model.py
@@ -77,7 +77,7 @@ def train(
drop=True
)
if cpu_aug_tbl.shape[0] < original_num_rows:
- logger.warning(
+ logger.debug(
'Removed %d rows with NaN label values', original_num_rows - cpu_aug_tbl.shape[0]
)

@@ -156,7 +156,7 @@ def predict(
if missing:
raise ValueError(f'Input is missing model features: {missing}')
if extra:
- logger.warning('Input had extra features not present in model: %s', extra)
+ logger.debug('Input had extra features not present in model: %s', extra)

x = cpu_aug_tbl[model_features]
y = cpu_aug_tbl[label_col] if label_col else None
@@ -248,7 +248,7 @@ def extract_model_features(
gpu_aug_tbl = df[df['runType'] == 'GPU']
if gpu_aug_tbl.shape[0] > 0:
if gpu_aug_tbl.shape[0] != cpu_aug_tbl.shape[0]:
- logger.warning(
+ logger.debug(
'Number of GPU rows (%d) does not match number of CPU rows (%d)',
gpu_aug_tbl.shape[0],
cpu_aug_tbl.shape[0],
@@ -276,7 +276,7 @@ def extract_model_features(
if (
num_na / num_rows > 0.05
): # arbitrary threshold, misaligned sqlIDs still may 'match' most of the time
- logger.warning(
+ logger.debug(
'Percentage of NaN GPU durations is high: %d / %d. Per-sql actual speedups may be inaccurate.',
num_na,
num_rows,
@@ -315,7 +315,7 @@ def extract_model_features(
raise ValueError(f'Input data is missing model features: {missing}')
if extra:
# remove extra columns
- logger.warning('Input data has extra features (removed): %s', extra)
+ logger.debug('Input data has extra features (removed): %s', extra)
feature_cols = [c for c in feature_cols if c not in extra]

# add train/val/test split column, if split function(s) provided
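The model.py changes above demote feature- and row-consistency messages in `train`, `predict`, and `extract_model_features` to debug level. One of them guards a NaN-fraction check on GPU durations after CPU and GPU rows are joined; the sketch below illustrates that pattern under assumed column names and is not the project's code:

```python
import pandas as pd

# Illustrative version of the 5% threshold check: after joining CPU and GPU rows
# on sqlID, a high fraction of unmatched (NaN) GPU durations suggests the sqlIDs
# are misaligned and per-sql speedups may be inaccurate.
def gpu_duration_nan_fraction_too_high(merged: pd.DataFrame,
                                       duration_col: str = 'Duration_gpu',
                                       threshold: float = 0.05) -> bool:
    num_rows = len(merged)
    num_na = int(merged[duration_col].isna().sum())
    return num_rows > 0 and num_na / num_rows > threshold
```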
17 changes: 9 additions & 8 deletions user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py
@@ -760,19 +760,19 @@ def impute(full_tbl: pd.DataFrame) -> pd.DataFrame:
"""Impute missing columns and delete extra columns."""
actual_features = set(full_tbl.columns)
if actual_features == expected_raw_features:
- logger.info('Dataset has all expected features')
+ logger.debug('Dataset has all expected features')
else:
missing = sorted(expected_raw_features - actual_features)
extra = sorted(actual_features - expected_raw_features)
if missing:
- logger.warning('Imputing missing features: %s', missing)
+ logger.debug('Imputing missing features: %s', missing)
if 'fraction_supported' in missing:
full_tbl['fraction_supported'] = 1.0
missing.remove('fraction_supported')
full_tbl.loc[:, missing] = 0

if extra:
- logger.warning('Removing extra features: %s', extra)
+ logger.debug('Removing extra features: %s', extra)
full_tbl = full_tbl.drop(columns=extra)

# one last check after modifications (update expected_raw_features if needed)
@@ -806,7 +806,7 @@ def scan_tbl(
)
except Exception as ex: # pylint: disable=broad-except
if warn_on_error or abort_on_error:
- logger.warning('Failed to load %s for %s.', tb_name, app_id)
+ logger.debug('Failed to load %s for %s.', tb_name, app_id)
if abort_on_error:
raise ScanTblError() from ex
scan_result = pd.DataFrame()
@@ -1030,7 +1030,6 @@ def scan_tbl(
stage_times = total_stage_time.merge(
failed_stage_time, on='sqlID', how='inner'
)
- stage_times.info()
sqls_to_drop = set(
stage_times.loc[
stage_times.Duration_y
@@ -1039,7 +1038,7 @@
)

if sqls_to_drop:
- logger.warning('Ignoring sqlIDs %s due to excessive failed/cancelled stage duration.', sqls_to_drop)
+ logger.debug('Ignoring sqlIDs %s due to excessive failed/cancelled stage duration.', sqls_to_drop)

if node_level_supp is not None and (qualtool_filter == 'stage'):
job_stage_agg_tbl = job_stage_agg_tbl[
@@ -1118,13 +1117,15 @@ def scan_tbl(
aborted_sql_ids = set()

if aborted_sql_ids:
- logger.warning('Ignoring sqlIDs %s due to aborted jobs.', aborted_sql_ids)
+ logger.debug('Ignoring sqlIDs %s due to aborted jobs.', aborted_sql_ids)

sqls_to_drop = sqls_to_drop.union(aborted_sql_ids)

if sqls_to_drop:
logger.warning(
- 'Ignoring a total of %s sqlIDs due to stage/job failures.', len(sqls_to_drop)
+ 'Ignoring a total of %s sqlIDs due to stage/job failures for %s.',
+ len(sqls_to_drop),
+ app_id
)
app_info_mg = app_info_mg.loc[~app_info_mg.sqlID.isin(sqls_to_drop)]

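The preprocess.py changes quiet the `impute` messages and the per-table scan/drop warnings, and remove the `stage_times.info()` console dump. The `impute` pattern from the first hunk boils down to the following sketch (assuming `expected_raw_features` is a set of column names; this is not the project's exact code):

```python
import pandas as pd

def impute(full_tbl: pd.DataFrame, expected_raw_features: set) -> pd.DataFrame:
    """Fill in expected-but-missing feature columns and drop unexpected ones."""
    actual = set(full_tbl.columns)
    missing = sorted(expected_raw_features - actual)
    extra = sorted(actual - expected_raw_features)
    for col in missing:
        # fraction_supported defaults to fully supported; other features default to 0
        full_tbl[col] = 1.0 if col == 'fraction_supported' else 0
    if extra:
        full_tbl = full_tbl.drop(columns=extra)
    return full_tbl
```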
24 changes: 12 additions & 12 deletions user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py
@@ -146,7 +146,7 @@ def _get_model(platform: str,
xgb.Booster model file.
"""
model_path = _get_model_path(platform, model, variant)
- logger.info('Loading model from: %s', model_path)
+ logger.debug('Loading model from: %s', model_path)
xgb_model = xgb.Booster()
xgb_model.load_model(model_path)
return xgb_model
@@ -290,7 +290,7 @@ def _predict(
if any(input_df['fraction_supported'] != 1.0)
else 'raw'
)
- logger.info('Predicting dataset (%s): %s', filter_str, dataset)
+ logger.debug('Predicting dataset (%s): %s', filter_str, dataset)
features, feature_cols, label_col = extract_model_features(input_df, {'default': split_fn})
# note: dataset name is already stored in the 'appName' field
try:
@@ -345,7 +345,7 @@ def _read_dataset_scores(
nan_df['model'] + '/' + nan_df['platform'] + '/' + nan_df['dataset']
)
keys = list(nan_df['key'].unique())
- logger.warning('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)
+ logger.debug('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)

return df

@@ -395,7 +395,7 @@ def _read_platform_scores(
nan_df['model'] + '/' + nan_df['platform'] + '/' + nan_df['dataset']
)
keys = list(nan_df['key'].unique())
- logger.warning('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)
+ logger.debug('Dropped rows w/ NaN values from: %s: %s', eval_dir, keys)

# compute accuracy by platform
scores = {}
@@ -507,7 +507,7 @@ def train(
for ds_name, ds_meta in datasets.items():
if 'split_function' in ds_meta:
plugin_path = ds_meta['split_function']
- logger.info('Using split function for %s dataset from plugin: %s', ds_name, plugin_path)
+ logger.debug('Using split function for %s dataset from plugin: %s', ds_name, plugin_path)
plugin = load_plugin(plugin_path)
split_functions[ds_name] = plugin.split_function

@@ -613,7 +613,7 @@ def predict(
'platform': platform,
}

- logger.info('Loading dataset: %s', dataset_name)
+ logger.debug('Loading dataset: %s', dataset_name)
profile_df = load_profiles(
datasets=datasets,
node_level_supp=node_level_supp,
@@ -655,7 +655,7 @@ def predict(
if node_level_supp is not None and any(profile_df['fraction_supported'] != 1.0)
else 'raw'
)
- logger.info('Predicting dataset (%s): %s', filter_str, dataset_name)
+ logger.debug('Predicting dataset (%s): %s', filter_str, dataset_name)

try:
features_list = []
@@ -684,17 +684,17 @@ def predict(
if output_info:
# save features for troubleshooting
output_file = output_info['features']['path']
- logger.info('Writing features to: %s', output_file)
+ logger.debug('Writing features to: %s', output_file)
features.to_csv(output_file, index=False)

feature_importance, shapley_values = compute_shapley_values(xgb_model, features)

output_file = output_info['featureImportance']['path']
- logger.info('Writing shapley feature importances to: %s', output_file)
+ logger.debug('Writing shapley feature importances to: %s', output_file)
feature_importance.to_csv(output_file)

output_file = output_info['shapValues']['path']
- logger.info('Writing shapley values to: %s', output_file)
+ logger.debug('Writing shapley values to: %s', output_file)
shapley_values.to_csv(output_file, index=False)

# compute per-app speedups
@@ -853,10 +853,10 @@ def evaluate(
plugin = load_plugin(plugin_path)
split_fn = plugin.split_function

- logger.info('Loading qualification tool CSV files.')
+ logger.debug('Loading qualification tool CSV files.')
node_level_supp, qual_tool_output, _ = _get_qual_data(qual_dir)

- logger.info('Loading profiler tool CSV files.')
+ logger.debug('Loading profiler tool CSV files.')
profile_df = load_profiles(datasets, profile_dir) # w/ GPU rows
filtered_profile_df = load_profiles(
datasets, profile_dir, node_level_supp, qual_tool_filter, qual_tool_output
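Several of the demoted messages in qualx_main.py report where prediction artifacts (features, Shapley feature importances, Shapley values) are written. For reference, the sketch below shows one way per-feature contribution values can be obtained from an XGBoost booster using its built-in TreeSHAP support; the project's `compute_shapley_values` helper may be implemented differently.

```python
import pandas as pd
import xgboost as xgb

def shap_contributions(booster: xgb.Booster, features: pd.DataFrame):
    """Return (mean |contribution| per feature, per-row contribution matrix)."""
    dmat = xgb.DMatrix(features)
    # pred_contribs=True returns one column per feature plus a trailing bias column
    contribs = booster.predict(dmat, pred_contribs=True)
    shap_df = pd.DataFrame(contribs[:, :-1], columns=features.columns)
    importance = shap_df.abs().mean().sort_values(ascending=False)
    return importance, shap_df
```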
8 changes: 4 additions & 4 deletions user_tools/src/spark_rapids_tools/tools/qualx/util.py
@@ -364,8 +364,8 @@ def run_command(command: str) -> subprocess.CompletedProcess:
try:
result = future.result()
logger.debug('Command completed: %s', command)
- logger.info(result.stdout)
- logger.info(result.stderr)
+ logger.debug(result.stdout)
+ logger.debug(result.stderr)
except Exception as e: # pylint: disable=broad-except
logger.error('Command failed: %s', command)
logger.error(e)
@@ -435,15 +435,15 @@ def write_csv_reports(per_sql: pd.DataFrame, per_app: pd.DataFrame, output_info:
try:
if per_sql is not None:
sql_predictions_path = output_info['perSql']['path']
- logger.info('Writing per-SQL predictions to: %s', sql_predictions_path)
+ logger.debug('Writing per-SQL predictions to: %s', sql_predictions_path)
per_sql.to_csv(sql_predictions_path)
except Exception as e: # pylint: disable=broad-except
logger.error('Error writing per-SQL predictions. Reason: %s', e)

try:
if per_app is not None:
app_predictions_path = output_info['perApp']['path']
- logger.info('Writing per-application predictions to: %s', app_predictions_path)
+ logger.debug('Writing per-application predictions to: %s', app_predictions_path)
per_app.to_csv(app_predictions_path)
except Exception as e: # pylint: disable=broad-except
logger.error('Error writing per-app predictions. Reason: %s', e)
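The util.py changes keep a completed command's stdout/stderr and the per-file prediction paths at debug level. A minimal sketch of that `run_command` pattern using only standard library calls (the real helper also runs the command through a future/executor and has error handling not shown here):

```python
import logging
import subprocess

logger = logging.getLogger(__name__)

def run_command(command: str) -> subprocess.CompletedProcess:
    """Run a shell command, keeping its output at debug level so normal runs stay quiet."""
    result = subprocess.run(command, shell=True, capture_output=True, text=True, check=False)
    logger.debug('Command completed: %s', command)
    logger.debug(result.stdout)
    logger.debug(result.stderr)
    return result
```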