Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update FOM definitions and split out HPL-MxP #818

Merged
merged 1 commit into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/ramble/ramble/appkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from ramble.util.logger import logger as tty

from ramble.util.file_util import get_file_path
from ramble.util.foms import FomType

from ramble.util.output_capture import OUTPUT_CAPTURE

Expand Down
227 changes: 227 additions & 0 deletions var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# Copyright 2022-2025 The Ramble Authors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from ramble.appkit import *

from ramble.base_app.builtin.hpl import Hpl as HplBase


class NvidiaHplMxp(HplBase):
    """This application defines how to run NVIDIA's optimized version of HPL,
    which is contained in NVIDIA's HPC-Benchmarks collection.

    The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
    HPL-MxP, HPCG, and STREAM) widely used in the HPC community optimized for
    performance on NVIDIA accelerated HPC systems.

    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
    (random) dense linear system in double precision (64-bit) arithmetic and in
    mixed precision arithmetic using Tensor Cores, respectively, on
    distributed-memory computers equipped with NVIDIA GPUs, based on the Netlib HPL
    benchmark and HPL-MxP benchmark.

    https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks
    """

    name = "nvidia-hpl-mxp"

    maintainers("douglasjacobsen")

    tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")

    executable(
        "execute",
        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
        use_mpi=True,
    )

    workload("standard", executables=["execute"])
    workload("calculator", executables=["execute"])

    workload_group("standard", workloads=["standard"], mode="append")
    workload_group("calculator", workloads=["calculator"], mode="append")
    workload_group(
        "all_workloads",
        workloads=["standard", "calculator"],
    )

    # Each workload_variable below feeds the environment_variable of the same
    # name, so the benchmark environment is configurable per-experiment.
    workload_variable(
        "nvshmem_disable_cuda_vmm",
        default="1",
        description="Value for the NVSHMEM_DISABLE_CUDA_VMM environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "NVSHMEM_DISABLE_CUDA_VMM",
        "{nvshmem_disable_cuda_vmm}",
        description="Whether to disable CUDA VMM in NVSHMEM",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_fct_comm_policy",
        default="1",
        description="Value for the HPL_FCT_COMM_POLICY environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_FCT_COMM_POLICY",
        "{hpl_fct_comm_policy}",
        description="Communication policy for HPL factorization",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_use_nvshmem",
        default="0",
        description="Whether to use NVSHMEM or not",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_USE_NVSHMEM",
        "{hpl_use_nvshmem}",
        description="Whether or not to use NVSHMEM",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_p2p_as_bcast",
        default="0",
        description="0 = ncclBcast, 1 = ncclSend/Recv",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_P2P_AS_BCAST",
        "{hpl_p2p_as_bcast}",
        description="Whether or not to use P2P for BCAST",
        workload_group="all_workloads",
    )

    workload_variable(
        "pmix_mca_gds",
        default="^ds12",
        description="Value for the PMIX_MCA_gds environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "PMIX_MCA_gds",
        "{pmix_mca_gds}",
        description="PMIX MCA gds",
        workload_group="all_workloads",
    )

    workload_variable(
        "ompi_mca_btl",
        default="^vader,tcp,openib,uct",
        description="Value for the OMPI_MCA_btl environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "OMPI_MCA_btl",
        "{ompi_mca_btl}",
        description="OpenMPI MCA btl",
        workload_group="all_workloads",
    )

    workload_variable(
        "ompi_mca_pml",
        default="ucx",
        description="Value for the OMPI_MCA_pml environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "OMPI_MCA_pml",
        "{ompi_mca_pml}",
        description="OpenMPI MCA pml",
        workload_group="all_workloads",
    )

    workload_variable(
        "ucx_net_devices",
        default="enp6s0,enp12s0,enp134s0,enp140s0",
        description="Value for the UCX_NET_DEVICES environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "UCX_NET_DEVICES",
        "{ucx_net_devices}",
        description="UCX Net Devices",
        workload_group="all_workloads",
    )

    workload_variable(
        "ucx_max_rndv_rails",
        default="4",
        description="Value for the UCX_MAX_RNDV_RAILS environment variable",
        workload_group="all_workloads",
    )
    environment_variable(
        "UCX_MAX_RNDV_RAILS",
        "{ucx_max_rndv_rails}",
        description="UCX Maximum RNDV Rails",
        workload_group="all_workloads",
    )

    workload_variable(
        "block_size",
        default="1024",
        description="Size of each block",
        workload_group="calculator",
    )

    workload_variable(
        "nporder",
        default="row",
        description="Major order to use for matrix",
        values=["row", "column"],
        workload_group="all_workloads",
    )

    workload_variable(
        "gpu_affinity",
        default="0:1:2:3:4:5:6:7",
        description="Colon delimited list of GPU IDs",
        workload_group="all_workloads",
    )

    # MxP FOMs.
    # HPL-MxP emits both " GFLOPS = ..." and " LU GFLOPS = ..." lines. The
    # negative lookbehind (?<!LU) keeps the overall-GFLOPS regex from also
    # matching the "LU GFLOPS" line (the space after "LU" satisfies \s+, so
    # without the lookbehind both FOMs would capture the LU line too).
    gflops_regex = (
        r"(?<!LU)\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
    )
    lu_gflops_regex = (
        r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
    )
    figure_of_merit(
        "Total GFlops",
        fom_regex=gflops_regex,
        group_name="gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
    figure_of_merit(
        "Per GPU GFlops",
        fom_regex=gflops_regex,
        group_name="per_gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )

    figure_of_merit(
        "Total LU GFlops",
        fom_regex=lu_gflops_regex,
        group_name="gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
    figure_of_merit(
        "Per GPU LU GFlops",
        fom_regex=lu_gflops_regex,
        group_name="per_gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
11 changes: 11 additions & 0 deletions var/ramble/repos/builtin/applications/nvidia-hpl/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,14 @@ class NvidiaHpl(HplBase):
description="Colon delimited list of GPU IDs",
workload_group="mxp",
)

figure_of_merit(
"Per GPU GFlops",
fom_regex=r".*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)"
+ r"\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+"
+ r"(?P<gflops>\S+)\s+\(\s+(?P<per_gpu_gflops>\S+)\)",
group_name="per_gpu_gflops",
units="GFLOP/s",
contexts=["problem-name"],
fom_type=FomType.THROUGHPUT,
)
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,25 @@ class Hpcg(ExecutableApplication):

figure_of_merit(
"Status",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
group_name="status",
units="",
)

figure_of_merit(
"Gflops",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
"GFlops",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
group_name="gflops",
units="GFLOP/s",
fom_type=FomType.THROUGHPUT,
)

figure_of_merit(
"Time",
fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9]+\.[0-9]*)",
fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9\.]*)",
group_name="exec_time",
units="s",
fom_type=FomType.TIME,
)

figure_of_merit(
Expand Down Expand Up @@ -106,9 +108,10 @@ class Hpcg(ExecutableApplication):

figure_of_merit(
"HPCG 2.4 Rating",
fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9]+\.*[0-9]*)",
fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9\.]+)",
group_name="rating",
units="",
fom_type=FomType.THROUGHPUT,
)

register_template(
Expand Down
Loading
Loading