Enable PTS for multi-GPU and MPI rocFFT #603

Open · wants to merge 6 commits into base: develop
Changes from 5 commits
7 changes: 4 additions & 3 deletions scripts/perf/perflib/bench.py
@@ -1,4 +1,4 @@
# Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2021 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -198,8 +198,9 @@ def run(bench,
match = line[len(matchTag):]

if proc.returncode == 0:
for m in re.finditer('Execution gpu time: ([ 0-9.]*) ms', cout,
re.MULTILINE):
for m in re.finditer(
r'(?:Max rank time|Execution time):\s*([0-9. ]+?)\s*ms', cout,
re.MULTILINE):
times.append(list(map(float, m.group(1).split(' '))))
else:
logging.info("PROCESS FAILED with return code " + str(proc.returncode))
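A minimal, self-contained sketch of what the widened timing regex is meant to capture. The sample lines in cout are hypothetical; real rocFFT bench output may be worded slightly differently, and this only illustrates the single-process and per-rank (MPI) cases.

import re

cout = """Execution time: 1.234 ms
Max rank time: 2.5 3.1 2.9 ms"""

pattern = r'(?:Max rank time|Execution time):\s*([0-9. ]+?)\s*ms'
times = []
for m in re.finditer(pattern, cout, re.MULTILINE):
    # One value for single-process runs, one value per rank for MPI runs.
    times.append([float(t) for t in m.group(1).split()])

print(times)  # [[1.234], [2.5, 3.1, 2.9]]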
94 changes: 91 additions & 3 deletions scripts/perf/perflib/utils.py
@@ -1,4 +1,4 @@
# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2021 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -27,6 +27,9 @@

import sys

import re
from collections import defaultdict

#
# Join shortcuts
#
@@ -266,6 +269,61 @@ class tokendata:
return slower, faster, new_significance


# Decomposition types for multi-process rocFFT
SPLIT_TYPES = {
(True, True, False): "SLOW_INOUT",
(True, False, False): "SLOW_IN",
(False, True, False): "SLOW_OUT",
(True, False, True): "SLOW_IN_FAST_OUT",
(True, True, True): "PENCIL_3D"
}


def get_split_dims(bricks):
coords = list(zip(*(b['lower'] for b in bricks)))
return [len(set(dim_vals)) > 1 for dim_vals in coords] # X, Y, Z


def get_decomposition_type(bricks, label="unknown"):
if not bricks:
return "UNKNOWN"

split_x, split_y, split_z = get_split_dims(bricks)

# Handle full 3D block case
if split_x and split_y and split_z:
return "PENCIL_3D"

if label == "ifield":
key = (split_x, False, False)
elif label == "ofield":
key = (False, split_z, split_x)
else:
return "UNKNOWN"

return SPLIT_TYPES.get(key, "UNKNOWN")


def get_proc_grid(bricks):
if not bricks:
return (0, 0, 0)
lowers = [b['lower'] for b in bricks]
grid_dims = []
for i in range(3): # X, Y, Z
unique_coords = sorted(set(coord[i] for coord in lowers))
grid_dims.append(len(unique_coords))
return tuple(grid_dims)


def print_mgpu_data_layout(bricks):
print("Layout per device/rank:")
for b in bricks:
rank_str = f"{b['rank']}" if b['rank'] is not None else "N/A"
print(
f" Rank {rank_str:<3} | Dev {b['dev']:<2} | Lower {b['lower']} Upper {b['upper']}"
)

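A small usage sketch for the helpers above, assuming they are in scope (e.g. imported from perflib.utils). The brick layout is hypothetical: a field split along X across 4 devices, with coordinates excluding the batch dimension, as parse_token stores them.

ifield_bricks = [{
    'dev': d,
    'rank': d,
    'lower': (d * 64, 0, 0),
    'upper': ((d + 1) * 64, 256, 256)
} for d in range(4)]

print(get_split_dims(ifield_bricks))  # [True, False, False]
print(get_decomposition_type(ifield_bricks, label="ifield"))  # SLOW_IN
print(get_proc_grid(ifield_bricks))  # (4, 1, 1)
print_mgpu_data_layout(ifield_bricks)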

#
# DAT files
#
@@ -346,11 +404,11 @@ def parse_token(token):
transform_type = ("forward" if words[1] == "forward" else
"backward") + "_" + words[0]

lenidx = -1
for idx in range(len(words)):
if words[idx] == "len":
lenidx = idx
break

for idx in range(lenidx + 1, len(words)):
if words[idx].isnumeric():
length.append(int(words[idx]))
@@ -373,7 +431,37 @@
else:
break

return transform_type, placeness, length, batch, precision
bricks = defaultdict(list)
ranks = set()
gpus = set()
current_field = None

# Regex to get brick indices
pattern = r"(?:(ifield|ofield)_)?brick_lower_([0-9_]+)_upper_([0-9_]+)_stride_[0-9_]+_(?:rank_(\d+)_)?dev_(\d+)"
matches = re.findall(pattern, token)

for field_type, lower_str, upper_str, rank_str, dev_str in matches:
if field_type:
current_field = field_type

lower = tuple(map(int, lower_str.split('_')))
upper = tuple(map(int, upper_str.split('_')))
dev = int(dev_str)
rank = int(rank_str) if rank_str else None

gpus.add(dev)
if rank is not None:
ranks.add(rank)

bricks[current_field].append({
'dev': dev,
'rank': rank,
'lower': lower[1:], # skip batch dimension
'upper': upper[1:]
})

return transform_type, placeness, length, batch, precision, bricks, sorted(
gpus), sorted(ranks)

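To illustrate the brick regex, here is a hypothetical token fragment with two ifield bricks; real tokens carry more fields, and the stride values below are invented for the example.

import re

pattern = (r"(?:(ifield|ofield)_)?brick_lower_([0-9_]+)_upper_([0-9_]+)"
           r"_stride_[0-9_]+_(?:rank_(\d+)_)?dev_(\d+)")

fragment = (
    "ifield_brick_lower_0_0_0_0_upper_1_128_256_256_stride_16777216_65536_256_1_rank_0_dev_0_"
    "ifield_brick_lower_0_128_0_0_upper_1_256_256_256_stride_16777216_65536_256_1_rank_1_dev_1")

for field, lower, upper, rank, dev in re.findall(pattern, fragment):
    print(field, lower, upper, rank, dev)
# ifield 0_0_0_0 1_128_256_256 0 0
# ifield 0_128_0_0 1_256_256_256 1 1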

def read_dat(fname):
106 changes: 89 additions & 17 deletions scripts/perf/rocfft-perf
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (C) 2021 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -273,9 +273,22 @@ def generate_pts_dat(dat):
rows = []
for row_idx, sample in enumerate(dat.get_samples()):
new_row = []

token = sample[0]
transform_type, placeness, length, batch, precision = perflib.utils.parse_token(
token)

# sanity check
if not token or token.count('_') < 3:
print(f"Skipping malformed token on row {row_idx}: '{token}'")
continue

try:
transform_type, placeness, length, batch, precision, bricks, gpus, ranks = perflib.utils.parse_token(
token)
except Exception as e:
print(
f"Error parsing token on row {row_idx}: '{token}'\nException: {e}"
)
continue

new_row.extend(input_params)
new_row.insert(1, token)
Expand All @@ -287,6 +300,52 @@ def generate_pts_dat(dat):
else:
print("multi-batch data format; exiting abnormally")
sys.exit(1)

# Multi-process and multi-gpu performance
ngpus_per_node = len(gpus)
nranks = max(1, len(ranks))

# ceil division; assumes 1 gpu per rank
nnodes = max(1, -(-nranks // max(1, ngpus_per_node)))

new_row.append(nranks)
new_row.append(ngpus_per_node)
new_row.append(nnodes)

print("Total GPUs:", ngpus_per_node, gpus)
print("Total ranks:", nranks, ranks)
print("First ifield brick:", bricks['ifield'][0])
print("First ofield brick:", bricks['ofield'][0])

# Input layout
input_split_type = perflib.utils.get_decomposition_type(
bricks['ifield'], label="ifield")
input_grid = perflib.utils.get_proc_grid(bricks['ifield'])
new_row.append(input_split_type)
new_row.extend(input_grid)

# Output layout
output_split_type = perflib.utils.get_decomposition_type(
bricks['ofield'], label="ofield")
output_grid = perflib.utils.get_proc_grid(bricks['ofield'])
new_row.append(output_split_type)
new_row.extend(output_grid)

# Print debug info
'''
print("Total GPUs used:", len(gpus))
print("Total ranks used:", len(ranks))
print("Input grid dimensions (X, Y, Z):", input_grid)
print("Output grid dimensions (X, Y, Z):", output_grid)
print("Input decomposition type:", input_split_type)
print("Output decomposition type:", output_split_type)

print("\nInput brick layout:")
perflib.utils.print_mgpu_data_layout(bricks['ifield'])
print("\nOutput brick layout:")
perflib.utils.print_mgpu_data_layout(bricks['ofield'])
'''

new_row.extend(
mdat_df.loc[row_idx,
['median_sample', 'median_low', 'median_high']].
@@ -314,11 +373,19 @@ def generate_pts_dat(dat):
header.append('ylength')
elif dimension == 3:
header.extend(['ylength', 'zlength'])

header.append('nbatch')

# Multi-process headers
header.extend([
'nbatch', 'median_sample', 'median_low', 'median_high', 'nsample',
'samples'
'nprocs', 'ngpus_per_node', 'nnodes', 'input_split_type', 'ingrid_x',
'ingrid_y', 'ingrid_z', 'output_split_type', 'outgrid_x', 'outgrid_y',
'outgrid_z'
])

header.extend(
['median_sample', 'median_low', 'median_high', 'nsample', 'samples'])

content = [header]
content.extend(rows)

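As a concrete (hypothetical) example of the new columns: a 2-rank, 2-GPU, single-node run whose input is split along X and whose output is split along Z would contribute values along these lines, given the decomposition helpers above.

new_cols = {
    'nprocs': 2,
    'ngpus_per_node': 2,
    'nnodes': 1,
    'input_split_type': 'SLOW_IN',
    'ingrid_x': 2, 'ingrid_y': 1, 'ingrid_z': 1,
    'output_split_type': 'SLOW_OUT',
    'outgrid_x': 1, 'outgrid_y': 1, 'outgrid_z': 2,
}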
@@ -366,22 +433,24 @@ def command_post(arguments):
docdir.mkdir(parents=True, exist_ok=True)

import scipy.stats, numpy

runs = perflib.utils.by_dat(all_runs)

group_outdirs = []
if arguments.ngroup == None:
group_outdirs.append(outdirs)
else:
totalgroups = (len(outdirs) + arguments.ngroup - 1) // arguments.ngroup
totalgroups = (len(outdirs) + arguments.ngroup -
1) // arguments.ngroup
for gidx in range(totalgroups):
group_outdirs.append(outdirs[gidx * arguments.ngroup :
(gidx + 1) * arguments.ngroup])
group_outdirs.append(
outdirs[gidx * arguments.ngroup:(gidx + 1) *
arguments.ngroup])
#print(group_runs)
#sys.exit(1)

for gidx, group in enumerate(group_outdirs):

refdir, *otherdirs = group

for dat_name, dat_runs in runs.items():
Expand All @@ -401,15 +470,16 @@ def command_post(arguments):
Avals = refdat.samples[token].times
Bvals = otherdat.samples[token].times
if arguments.measure == "median":
speedup = statistics.median(Avals) / statistics.median(
Bvals)
speedup = statistics.median(
Avals) / statistics.median(Bvals)
elif arguments.measure == "mean":
speedup = numpy.mean(Avals) / numpy.mean(Bvals)
low, high = perflib.analysis.ratio_confidence_interval(
Avals, Bvals)
pval = -1
if arguments.method == 'moods':
_, pval, _, _ = scipy.stats.median_test(Avals, Bvals)
_, pval, _, _ = scipy.stats.median_test(
Avals, Bvals)
elif arguments.method == 'ttest':
_, pval = scipy.stats.ttest_ind(Avals, Bvals)
elif arguments.method == 'mwu':
@@ -418,9 +488,11 @@
print("unsupported statistical method")
sys.exit(1)

speedups.append([sample.token, speedup, low, high, pval])
speedups.append(
[sample.token, speedup, low, high, pval])

path = docdir / ('group_' + str(gidx) + '-' + dat_name + '.sdat')
path = docdir / ('group_' + str(gidx) + '-' + dat_name +
'.sdat')
perflib.utils.write_tsv(path,
speedups,
meta=refdat.meta,
@@ -879,7 +951,7 @@ def main():
p.add_argument('--ngroup',
type=int,
help="size of comparison subgroup")

for p in [
post_parser, pdf_parser, test_parser, autoperf_parser, html_parser
]:
2 changes: 1 addition & 1 deletion shared/mpi_worker.h
@@ -1101,4 +1101,4 @@ int mpi_worker_main(const char* de

MPI_Finalize();
return 0;
}
}