Enable PTS for multi-GPU and MPI rocFFT #603

Open · wants to merge 6 commits into base: develop
Changes from 5 commits
7 changes: 4 additions & 3 deletions scripts/perf/perflib/bench.py
@@ -1,4 +1,4 @@
# Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2021 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -198,8 +198,9 @@ def run(bench,
match = line[len(matchTag):]

if proc.returncode == 0:
for m in re.finditer('Execution gpu time: ([ 0-9.]*) ms', cout,
re.MULTILINE):
for m in re.finditer(
r'(?:Max rank time|Execution time):\s*([0-9. ]+?)\s*ms', cout,
re.MULTILINE):
times.append(list(map(float, m.group(1).split(' '))))
else:
logging.info("PROCESS FAILED with return code " + str(proc.returncode))
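A minimal, self-contained sketch of what the widened timing regex is meant to capture. The sample lines in cout are hypothetical; real rocFFT bench output may be worded slightly differently, and this only illustrates the single-process and per-rank (MPI) cases.

import re

cout = """Execution time: 1.234 ms
Max rank time: 2.5 3.1 2.9 ms"""

pattern = r'(?:Max rank time|Execution time):\s*([0-9. ]+?)\s*ms'
times = []
for m in re.finditer(pattern, cout, re.MULTILINE):
    # One value for single-process runs, one value per rank for MPI runs.
    times.append([float(t) for t in m.group(1).split()])

print(times)  # [[1.234], [2.5, 3.1, 2.9]]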
94 changes: 91 additions & 3 deletions scripts/perf/perflib/utils.py
@@ -1,4 +1,4 @@
# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2021 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -27,6 +27,9 @@

import sys

import re
from collections import defaultdict

#
# Join shortcuts
#
@@ -266,6 +269,61 @@ class tokendata:
return slower, faster, new_significance


# Decomposition types for multi-process rocFFT
SPLIT_TYPES = {
(True, True, False): "SLOW_INOUT",
(True, False, False): "SLOW_IN",
(False, True, False): "SLOW_OUT",
(True, False, True): "SLOW_IN_FAST_OUT",
(True, True, True): "PENCIL_3D"
}


def get_split_dims(bricks):
coords = list(zip(*(b['lower'] for b in bricks)))
return [len(set(dim_vals)) > 1 for dim_vals in coords] # X, Y, Z


def get_decomposition_type(bricks, label="unknown"):
if not bricks:
return "UNKNOWN"

split_x, split_y, split_z = get_split_dims(bricks)

# Handle full 3D block case
if split_x and split_y and split_z:
return "PENCIL_3D"

if label == "ifield":
key = (split_x, False, False)
elif label == "ofield":
key = (False, split_z, split_x)
else:
return "UNKNOWN"

return SPLIT_TYPES.get(key, "UNKNOWN")


def get_proc_grid(bricks):
if not bricks:
return (0, 0, 0)
lowers = [b['lower'] for b in bricks]
grid_dims = []
for i in range(3): # X, Y, Z
unique_coords = sorted(set(coord[i] for coord in lowers))
grid_dims.append(len(unique_coords))
return tuple(grid_dims)


def print_mgpu_data_layout(bricks):
print("Layout per device/rank:")
for b in bricks:
rank_str = f"{b['rank']}" if b['rank'] is not None else "N/A"
print(
f" Rank {rank_str:<3} | Dev {b['dev']:<2} | Lower {b['lower']} Upper {b['upper']}"
)

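A small usage sketch for the helpers above, assuming they are in scope (e.g. imported from perflib.utils). The brick layout is hypothetical: a field split along X across 4 devices, with coordinates excluding the batch dimension, as parse_token stores them.

ifield_bricks = [{
    'dev': d,
    'rank': d,
    'lower': (d * 64, 0, 0),
    'upper': ((d + 1) * 64, 256, 256)
} for d in range(4)]

print(get_split_dims(ifield_bricks))  # [True, False, False]
print(get_decomposition_type(ifield_bricks, label="ifield"))  # SLOW_IN
print(get_proc_grid(ifield_bricks))  # (4, 1, 1)
print_mgpu_data_layout(ifield_bricks)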

#
# DAT files
#
@@ -346,11 +404,11 @@ def parse_token(token):
transform_type = ("forward" if words[1] == "forward" else
"backward") + "_" + words[0]

lenidx = -1
for idx in range(len(words)):
if words[idx] == "len":
lenidx = idx
break

for idx in range(lenidx + 1, len(words)):
if words[idx].isnumeric():
length.append(int(words[idx]))
@@ -373,7 +431,37 @@
else:
break

return transform_type, placeness, length, batch, precision
bricks = defaultdict(list)
ranks = set()
gpus = set()
current_field = None

# Regex to get brick indices
pattern = r"(?:(ifield|ofield)_)?brick_lower_([0-9_]+)_upper_([0-9_]+)_stride_[0-9_]+_(?:rank_(\d+)_)?dev_(\d+)"
matches = re.findall(pattern, token)

for field_type, lower_str, upper_str, rank_str, dev_str in matches:
if field_type:
current_field = field_type

lower = tuple(map(int, lower_str.split('_')))
upper = tuple(map(int, upper_str.split('_')))
dev = int(dev_str)
rank = int(rank_str) if rank_str else None

gpus.add(dev)
if rank is not None:
ranks.add(rank)

bricks[current_field].append({
'dev': dev,
'rank': rank,
'lower': lower[1:], # skip batch dimension
'upper': upper[1:]
})

return transform_type, placeness, length, batch, precision, bricks, sorted(
gpus), sorted(ranks)

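To illustrate the brick regex, here is a hypothetical token fragment with two ifield bricks; real tokens carry more fields, and the stride values below are invented for the example.

import re

pattern = (r"(?:(ifield|ofield)_)?brick_lower_([0-9_]+)_upper_([0-9_]+)"
           r"_stride_[0-9_]+_(?:rank_(\d+)_)?dev_(\d+)")

fragment = (
    "ifield_brick_lower_0_0_0_0_upper_1_128_256_256_stride_16777216_65536_256_1_rank_0_dev_0_"
    "ifield_brick_lower_0_128_0_0_upper_1_256_256_256_stride_16777216_65536_256_1_rank_1_dev_1")

for field, lower, upper, rank, dev in re.findall(pattern, fragment):
    print(field, lower, upper, rank, dev)
# ifield 0_0_0_0 1_128_256_256 0 0
# ifield 0_128_0_0 1_256_256_256 1 1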

def read_dat(fname):
106 changes: 89 additions & 17 deletions scripts/perf/rocfft-perf
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2021 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -273,9 +273,22 @@ def generate_pts_dat(dat):
rows = []
for row_idx, sample in enumerate(dat.get_samples()):
new_row = []

token = sample[0]
transform_type, placeness, length, batch, precision = perflib.utils.parse_token(
token)

# sanity check
if not token or token.count('_') < 3:
print(f"Skipping malformed token on row {row_idx}: '{token}'")
continue

try:
transform_type, placeness, length, batch, precision, bricks, gpus, ranks = perflib.utils.parse_token(
token)
except Exception as e:
print(
f"Error parsing token on row {row_idx}: '{token}'\nException: {e}"
)
continue

new_row.extend(input_params)
new_row.insert(1, token)
Expand All @@ -287,6 +300,52 @@ def generate_pts_dat(dat):
else:
print("multi-batch data format; exiting abnormally")
sys.exit(1)

# Multi-process and multi-gpu performance
ngpus_per_node = len(gpus)
nranks = max(1, len(ranks))

# ceil division; assumes 1 gpu per rank
nnodes = max(1, -(-nranks // max(1, ngpus_per_node)))

new_row.append(nranks)
new_row.append(ngpus_per_node)
new_row.append(nnodes)

print("Total GPUs:", ngpus_per_node, gpus)
print("Total ranks:", nranks, ranks)
print("First ifield brick:", bricks['ifield'][0])
print("First ofield brick:", bricks['ofield'][0])

# Input layout
input_split_type = perflib.utils.get_decomposition_type(
bricks['ifield'], label="ifield")
input_grid = perflib.utils.get_proc_grid(bricks['ifield'])
new_row.append(input_split_type)
new_row.extend(input_grid)

# Output layout
output_split_type = perflib.utils.get_decomposition_type(
bricks['ofield'], label="ofield")
output_grid = perflib.utils.get_proc_grid(bricks['ofield'])
new_row.append(output_split_type)
new_row.extend(output_grid)

# Print debug info
'''
print("Total GPUs used:", len(gpus))
print("Total ranks used:", len(ranks))
print("Input grid dimensions (X, Y, Z):", input_grid)
print("Output grid dimensions (X, Y, Z):", output_grid)
print("Input decomposition type:", input_split_type)
print("Output decomposition type:", output_split_type)

print("\nInput brick layout:")
perflib.utils.print_mgpu_data_layout(bricks['ifield'])
print("\nOutput brick layout:")
perflib.utils.print_mgpu_data_layout(bricks['ofield'])
'''

new_row.extend(
mdat_df.loc[row_idx,
['median_sample', 'median_low', 'median_high']].
@@ -314,11 +373,19 @@ def generate_pts_dat(dat):
header.append('ylength')
elif dimension == 3:
header.extend(['ylength', 'zlength'])

header.append('nbatch')

# Multi-process headers
header.extend([
'nbatch', 'median_sample', 'median_low', 'median_high', 'nsample',
'samples'
'nprocs', 'ngpus_per_node', 'nnodes', 'input_split_type', 'ingrid_x',
'ingrid_y', 'ingrid_z', 'output_split_type', 'outgrid_x', 'outgrid_y',
'outgrid_z'
])

header.extend(
['median_sample', 'median_low', 'median_high', 'nsample', 'samples'])

content = [header]
content.extend(rows)

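As a concrete (hypothetical) example of the new columns: a 2-rank, 2-GPU, single-node run whose input is split along X and whose output is split along Z would contribute values along these lines, given the decomposition helpers above.

new_cols = {
    'nprocs': 2,
    'ngpus_per_node': 2,
    'nnodes': 1,
    'input_split_type': 'SLOW_IN',
    'ingrid_x': 2, 'ingrid_y': 1, 'ingrid_z': 1,
    'output_split_type': 'SLOW_OUT',
    'outgrid_x': 1, 'outgrid_y': 1, 'outgrid_z': 2,
}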
@@ -366,22 +433,24 @@ def command_post(arguments):
docdir.mkdir(parents=True, exist_ok=True)

import scipy.stats, numpy

runs = perflib.utils.by_dat(all_runs)

group_outdirs = []
if arguments.ngroup == None:
group_outdirs.append(outdirs)
else:
totalgroups = (len(outdirs) + arguments.ngroup - 1) // arguments.ngroup
totalgroups = (len(outdirs) + arguments.ngroup -
1) // arguments.ngroup
for gidx in range(totalgroups):
group_outdirs.append(outdirs[gidx * arguments.ngroup :
(gidx + 1) * arguments.ngroup])
group_outdirs.append(
outdirs[gidx * arguments.ngroup:(gidx + 1) *
arguments.ngroup])
#print(group_runs)
#sys.exit(1)

for gidx, group in enumerate(group_outdirs):

refdir, *otherdirs = group

for dat_name, dat_runs in runs.items():
Expand All @@ -401,15 +470,16 @@ def command_post(arguments):
Avals = refdat.samples[token].times
Bvals = otherdat.samples[token].times
if arguments.measure == "median":
speedup = statistics.median(Avals) / statistics.median(
Bvals)
speedup = statistics.median(
Avals) / statistics.median(Bvals)
elif arguments.measure == "mean":
speedup = numpy.mean(Avals) / numpy.mean(Bvals)
low, high = perflib.analysis.ratio_confidence_interval(
Avals, Bvals)
pval = -1
if arguments.method == 'moods':
_, pval, _, _ = scipy.stats.median_test(Avals, Bvals)
_, pval, _, _ = scipy.stats.median_test(
Avals, Bvals)
elif arguments.method == 'ttest':
_, pval = scipy.stats.ttest_ind(Avals, Bvals)
elif arguments.method == 'mwu':
@@ -418,9 +488,11 @@
print("unsupported statistical method")
sys.exit(1)

speedups.append([sample.token, speedup, low, high, pval])
speedups.append(
[sample.token, speedup, low, high, pval])

path = docdir / ('group_' + str(gidx) + '-' + dat_name + '.sdat')
path = docdir / ('group_' + str(gidx) + '-' + dat_name +
'.sdat')
perflib.utils.write_tsv(path,
speedups,
meta=refdat.meta,
@@ -879,7 +951,7 @@ def main():
p.add_argument('--ngroup',
type=int,
help="size of comparison subgroup")

for p in [
post_parser, pdf_parser, test_parser, autoperf_parser, html_parser
]:
2 changes: 1 addition & 1 deletion shared/mpi_worker.h
@@ -1101,4 +1101,4 @@ int mpi_worker_main(const char* de

MPI_Finalize();
return 0;
}
}