Merge pull request #818 from douglasjacobsen/fix-hpl-foms

rfbgo · web-flow · commit 9be8e3fda6df · 2025-01-10T10:55:52.000-07:00
Update FOM definitions and split out HPL-MxP
diff --git a/lib/ramble/ramble/appkit.py b/lib/ramble/ramble/appkit.py
@@ -28,6 +28,7 @@
 from ramble.util.logger import logger as tty
 
 from ramble.util.file_util import get_file_path
+from ramble.util.foms import FomType
 
 from ramble.util.output_capture import OUTPUT_CAPTURE
 
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
@@ -0,0 +1,227 @@
+# Copyright 2022-2025 The Ramble Authors
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+from ramble.appkit import *
+
+from ramble.base_app.builtin.hpl import Hpl as HplBase
+
+
+class NvidiaHplMxp(HplBase):
+    """This application defines how to run NVIDIA's optimized version of HPL,
+    which is contained in NVIDIA's HPC-Benchmarks collection.
+
+    The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
+    HPL-MxP, HPCG, and STREAM) widely used in the HPC community optimized for
+    performance on NVIDIA accelerated HPC systems.
+
+    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
+    (random) dense linear system in double precision (64-bit) arithmetic and in
+    mixed precision arithmetic using Tensor Cores, respectively, on
+    distributed-memory computers equipped with NVIDIA GPUs, based on the Netlib HPL
+    benchmark and HPL-MxP benchmark.
+
+    https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks
+    """
+
+    name = "nvidia-hpl-mxp"
+
+    maintainers("douglasjacobsen")
+
+    tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")
+
+    executable(
+        "execute",
+        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
+        use_mpi=True,
+    )
+
+    workload("standard", executables=["execute"])
+    workload("calculator", executables=["execute"])
+
+    workload_group("standard", workloads=["standard"], mode="append")
+    workload_group("calculator", workloads=["calculator"], mode="append")
+    workload_group(
+        "all_workloads",
+        workloads=["standard", "calculator"],
+    )
+
+    workload_variable(
+        "nvshmem_disable_cuda_vmm",
+        default="1",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "NVSHMEM_DISABLE_CUDA_VMM",
+        "{nvshmem_disable_cuda_vmm}",
+        description="",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_fct_comm_policy",
+        default="1",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_FCT_COMM_POLICY",
+        "{hpl_fct_comm_policy}",
+        description="",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_use_nvshmem",
+        default="0",
+        description="Whether to use NVSHMEM or not",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_USE_NVSHMEM",
+        "{hpl_use_nvshmem}",
+        description="Whether or not to use NVSHMEM",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_p2p_as_bcast",
+        default="0",
+        description="0 = ncclBcast, 1 = ncclSend/Recv",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_P2P_AS_BCAST",
+        "{hpl_p2p_as_bcast}",
+        description="Whether or not to use P2P for BCAST",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "pmix_mca_gds",
+        default="^ds12",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "PMIX_MCA_gds",
+        "{pmix_mca_gds}",
+        description="PMIX MCA gds",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_btl",
+        default="^vader,tcp,openib,uct",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_btl",
+        "{ompi_mca_btl}",
+        description="OpenMPI MCA btl",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_pml",
+        default="ucx",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_pml",
+        "{ompi_mca_pml}",
+        description="OpenMPI MCA pml",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_net_devices",
+        default="enp6s0,enp12s0,enp134s0,enp140s0",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_NET_DEVICES",
+        "{ucx_net_devices}",
+        description="UCX Net Devices",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_max_rndv_rails",
+        default="4",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_MAX_RNDV_RAILS",
+        "{ucx_max_rndv_rails}",
+        description="UCX MAximum RNDV Rails",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "block_size",
+        default="1024",
+        description="Size of each block",
+        workload_group="calculator",
+    )
+
+    workload_variable(
+        "nporder",
+        default="row",
+        description="Major order to use for matrix",
+        values=["row", "column"],
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "gpu_affinity",
+        default="0:1:2:3:4:5:6:7",
+        description="Colon delimited list of GPU IDs",
+        workload_group="all_workloads",
+    )
+
+    # MxP FOMs
+    gflops_regex = (
+        r"\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
+    )
+    lu_gflops_regex = (
+        r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
+    )
+    figure_of_merit(
+        "Total GFlops",
+        fom_regex=gflops_regex,
+        group_name="gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
+    figure_of_merit(
+        "Per GPU GFlops",
+        fom_regex=gflops_regex,
+        group_name="per_gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
+
+    figure_of_merit(
+        "Total LU GFlops",
+        fom_regex=lu_gflops_regex,
+        group_name="gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
+    figure_of_merit(
+        "Per GPU LU GFlops",
+        fom_regex=lu_gflops_regex,
+        group_name="per_gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
@@ -200,3 +200,14 @@ class NvidiaHpl(HplBase):
         description="Colon delimited list of GPU IDs",
         workload_group="mxp",
     )
+
+    figure_of_merit(
+        "Per GPU GFlops",
+        fom_regex=r".*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)"
+        + r"\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+"
+        + r"(?P<gflops>\S+)\s+\(\s+(?P<per_gpu_gflops>\S+)\)",
+        group_name="per_gpu_gflops",
+        units="GFLOP/s",
+        contexts=["problem-name"],
+        fom_type=FomType.THROUGHPUT,
+    )
diff --git a/var/ramble/repos/builtin/base_applications/hpcg/base_application.py b/var/ramble/repos/builtin/base_applications/hpcg/base_application.py
@@ -57,23 +57,25 @@ class Hpcg(ExecutableApplication):
 
     figure_of_merit(
         "Status",
-        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
+        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
         group_name="status",
         units="",
     )
 
     figure_of_merit(
-        "Gflops",
-        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
+        "GFlops",
+        fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
         group_name="gflops",
         units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
     )
 
     figure_of_merit(
         "Time",
-        fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9]+\.[0-9]*)",
+        fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9\.]*)",
         group_name="exec_time",
         units="s",
+        fom_type=FomType.TIME,
     )
 
     figure_of_merit(
@@ -106,9 +108,10 @@ class Hpcg(ExecutableApplication):
 
     figure_of_merit(
         "HPCG 2.4 Rating",
-        fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9]+\.*[0-9]*)",
+        fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9\.]+)",
         group_name="rating",
         units="",
+        fom_type=FomType.THROUGHPUT,
     )
 
     register_template(
diff --git a/var/ramble/repos/builtin/base_applications/hpl/base_application.py b/var/ramble/repos/builtin/base_applications/hpl/base_application.py