Commit 9be8e3f

Merge pull request #818 from douglasjacobsen/fix-hpl-foms

Update FOM definitions and split out HPL-MxP

2 parents: cb82e32 + a599a93

File tree

5 files changed: +292 -90 lines

lib/ramble/ramble/appkit.py (+1 line)

@@ -28,6 +28,7 @@
 from ramble.util.logger import logger as tty
 
 from ramble.util.file_util import get_file_path
+from ramble.util.foms import FomType
 
 from ramble.util.output_capture import OUTPUT_CAPTURE
 
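
For context: ramble.appkit is the star-import surface for application definitions (they open with "from ramble.appkit import *"), so re-exporting FomType here is what lets the application files below write fom_type=FomType.THROUGHPUT without importing it themselves. A minimal stand-in sketch of the role the enum plays (the real class lives in ramble.util.foms; the member list here is an assumption for illustration, limited to the members this commit uses):

from enum import Enum, auto


class FomType(Enum):
    # Simplified stand-in for ramble.util.foms.FomType; only the members
    # exercised by this commit are listed.
    THROUGHPUT = auto()  # rate-style metrics, e.g. GFLOP/s (higher is better)
    TIME = auto()  # duration-style metrics, e.g. execution time in seconds


# Application definitions tag each figure of merit with one of these values:
print(FomType.THROUGHPUT.name)  # -> THROUGHPUT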
var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py (new file: +227 lines)

# Copyright 2022-2025 The Ramble Authors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from ramble.appkit import *

from ramble.base_app.builtin.hpl import Hpl as HplBase


class NvidiaHplMxp(HplBase):
    """This application defines how to run NVIDIA's optimized version of HPL,
    which is contained in NVIDIA's HPC-Benchmarks collection.

    The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
    HPL-MxP, HPCG, and STREAM) widely used in the HPC community, optimized
    for performance on NVIDIA accelerated HPC systems.

    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
    (random) dense linear system in double precision (64-bit) arithmetic and
    in mixed precision arithmetic using Tensor Cores, respectively, on
    distributed-memory computers equipped with NVIDIA GPUs, based on the
    Netlib HPL benchmark and HPL-MxP benchmark.

    https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks
    """

    name = "nvidia-hpl-mxp"

    maintainers("douglasjacobsen")

    tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")

    executable(
        "execute",
        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
        use_mpi=True,
    )

    workload("standard", executables=["execute"])
    workload("calculator", executables=["execute"])

    workload_group("standard", workloads=["standard"], mode="append")
    workload_group("calculator", workloads=["calculator"], mode="append")
    workload_group(
        "all_workloads",
        workloads=["standard", "calculator"],
    )

    workload_variable(
        "nvshmem_disable_cuda_vmm",
        default="1",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "NVSHMEM_DISABLE_CUDA_VMM",
        "{nvshmem_disable_cuda_vmm}",
        description="",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_fct_comm_policy",
        default="1",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_FCT_COMM_POLICY",
        "{hpl_fct_comm_policy}",
        description="",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_use_nvshmem",
        default="0",
        description="Whether to use NVSHMEM or not",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_USE_NVSHMEM",
        "{hpl_use_nvshmem}",
        description="Whether or not to use NVSHMEM",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_p2p_as_bcast",
        default="0",
        description="0 = ncclBcast, 1 = ncclSend/Recv",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_P2P_AS_BCAST",
        "{hpl_p2p_as_bcast}",
        description="Whether or not to use P2P for BCAST",
        workload_group="all_workloads",
    )

    workload_variable(
        "pmix_mca_gds",
        default="^ds12",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "PMIX_MCA_gds",
        "{pmix_mca_gds}",
        description="PMIX MCA gds",
        workload_group="all_workloads",
    )

    workload_variable(
        "ompi_mca_btl",
        default="^vader,tcp,openib,uct",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "OMPI_MCA_btl",
        "{ompi_mca_btl}",
        description="OpenMPI MCA btl",
        workload_group="all_workloads",
    )

    workload_variable(
        "ompi_mca_pml",
        default="ucx",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "OMPI_MCA_pml",
        "{ompi_mca_pml}",
        description="OpenMPI MCA pml",
        workload_group="all_workloads",
    )

    workload_variable(
        "ucx_net_devices",
        default="enp6s0,enp12s0,enp134s0,enp140s0",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "UCX_NET_DEVICES",
        "{ucx_net_devices}",
        description="UCX Net Devices",
        workload_group="all_workloads",
    )

    workload_variable(
        "ucx_max_rndv_rails",
        default="4",
        description="",
        workload_group="all_workloads",
    )
    environment_variable(
        "UCX_MAX_RNDV_RAILS",
        "{ucx_max_rndv_rails}",
        description="UCX Maximum RNDV Rails",
        workload_group="all_workloads",
    )

    workload_variable(
        "block_size",
        default="1024",
        description="Size of each block",
        workload_group="calculator",
    )

    workload_variable(
        "nporder",
        default="row",
        description="Major order to use for matrix",
        values=["row", "column"],
        workload_group="all_workloads",
    )

    workload_variable(
        "gpu_affinity",
        default="0:1:2:3:4:5:6:7",
        description="Colon delimited list of GPU IDs",
        workload_group="all_workloads",
    )

    # MxP FOMs
    gflops_regex = (
        r"\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
    )
    lu_gflops_regex = (
        r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
    )
    figure_of_merit(
        "Total GFlops",
        fom_regex=gflops_regex,
        group_name="gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
    figure_of_merit(
        "Per GPU GFlops",
        fom_regex=gflops_regex,
        group_name="per_gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )

    figure_of_merit(
        "Total LU GFlops",
        fom_regex=lu_gflops_regex,
        group_name="gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
    figure_of_merit(
        "Per GPU LU GFlops",
        fom_regex=lu_gflops_regex,
        group_name="per_gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
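
A note on the two MxP regexes above: gflops_regex is anchored only by leading whitespace, so it also matches the "LU GFLOPS" line (the space between "LU" and "GFLOPS" satisfies \s+); the four FOM definitions are distinguished by their names, not by mutually exclusive patterns. A runnable sketch against hypothetical output lines (shaped to fit the regexes; not verbatim HPL-MxP output):

import re

# The two FOM regexes from the application definition above.
gflops_regex = r"\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
lu_gflops_regex = r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"

# Hypothetical log lines, shaped to match the regexes.
lines = [
    "   LU GFLOPS = 61234.5, per GPU =  7654.3",
    "   GFLOPS = 54321.0, per GPU =  6790.1",
]

for line in lines:
    lu = re.search(lu_gflops_regex, line)
    total = re.search(gflops_regex, line)
    # gflops_regex matches both lines; lu_gflops_regex only the first.
    print(bool(lu), bool(total), (lu or total).group("per_gflops"))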
var/ramble/repos/builtin/applications/nvidia-hpl/application.py (+11 lines)

@@ -200,3 +200,14 @@ class NvidiaHpl(HplBase):
         description="Colon delimited list of GPU IDs",
         workload_group="mxp",
     )
+
+    figure_of_merit(
+        "Per GPU GFlops",
+        fom_regex=r".*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)"
+        + r"\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+"
+        + r"(?P<gflops>\S+)\s+\(\s+(?P<per_gpu_gflops>\S+)\)",
+        group_name="per_gpu_gflops",
+        units="GFLOP/s",
+        contexts=["problem-name"],
+        fom_type=FomType.THROUGHPUT,
+    )
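
The FOM added here reuses the HPL results-row pattern and additionally captures the parenthesized per-GPU figure at the end of the line. A quick sketch against a hypothetical results row (shaped to fit the regex; not verbatim nvidia-hpl output):

import re

# The FOM regex added above, with its pieces concatenated.
fom_regex = (
    r".*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)"
    r"\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+"
    r"(?P<gflops>\S+)\s+\(\s+(?P<per_gpu_gflops>\S+)\)"
)

# Hypothetical results row, shaped to match the regex.
row = "WR01C2C4   180224   1024   4   2   91.23  4.321e+04 (  5.401e+03)"

m = re.search(fom_regex, row)
print(m.group("N"), m.group("time"), m.group("gflops"), m.group("per_gpu_gflops"))
# -> 180224 91.23 4.321e+04 5.401e+03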

var/ramble/repos/builtin/base_applications/hpcg/base_application.py (+8 -5 lines)

@@ -57,23 +57,25 @@ class Hpcg(ExecutableApplication):
 
     figure_of_merit(
         "Status",
-        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
+        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
         group_name="status",
         units="",
     )
 
     figure_of_merit(
-        "Gflops",
-        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
+        "GFlops",
+        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
         group_name="gflops",
         units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
     )
 
     figure_of_merit(
         "Time",
-        fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9]+\.[0-9]*)",
+        fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9\.]*)",
         group_name="exec_time",
         units="s",
+        fom_type=FomType.TIME,
     )
 
     figure_of_merit(
@@ -106,9 +108,10 @@ class Hpcg(ExecutableApplication):
 
     figure_of_merit(
         "HPCG 2.4 Rating",
-        fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9]+\.*[0-9]*)",
+        fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9\.]+)",
         group_name="rating",
         units="",
+        fom_type=FomType.THROUGHPUT,
     )
 
     register_template(
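
The regex edits in this file relax the numeric capture groups (for example, [0-9]+\.[0-9]+ becomes [0-9\.]+), so a rating that happens to be reported without a fractional part still matches; the character class is deliberately permissive and would also accept stray dots. A small sketch with hypothetical summary lines built in the format the regexes themselves expect:

import re

old = r"GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)"
new = r"GFLOP/s rating of=(?P<gflops>[0-9\.]+)"

# Hypothetical lines in the shape the FOM regexes expect.
lines = [
    "Final Summary::HPCG result is VALID with a GFLOP/s rating of=63.6",
    "Final Summary::HPCG result is VALID with a GFLOP/s rating of=64",
]

for line in lines:
    for label, pattern in (("old", old), ("new", new)):
        m = re.search(pattern, line)
        print(label, m.group("gflops") if m else "no match")
# old 63.6 / new 63.6 / old no match / new 64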
