Skip to content

Commit 1830c01

Browse files
committed
Move TORCH_NCCL_HIGH_PRIORITY to nemo/lightning/run/plugins.py
Signed-off-by: Guyue Huang <[email protected]>
1 parent e5f0eb5 commit 1830c01

File tree

2 files changed

+3
-1
lines changed

2 files changed

+3
-1
lines changed

nemo/lightning/run/plugins.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -405,6 +405,9 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
405405
assert isinstance(self.nccl_pp_comm_chunksize, int) and self.nccl_pp_comm_chunksize > 1
406406
executor.env_vars["NCCL_P2P_NET_CHUNKSIZE"] = str(self.nccl_pp_comm_chunksize)
407407

408+
# Enable high priority for NCCL communications
409+
executor.env_vars["TORCH_NCCL_HIGH_PRIORITY"] = "1"
410+
408411
# Improve perf by steering power to tensor cores, may not work on all systems
409412
if self.enable_vboost and isinstance(executor, run.SlurmExecutor):
410413
vboost_cmd = self.get_vboost_srun_cmd(executor.nodes, executor.tunnel.job_dir)

scripts/performance/executors.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,6 @@ def slurm_executor(
6666
"NVTE_FLASH_ATTN": "1", # Enable Flash Attention, which is needed to enable cuDNN fused attention
6767
"NVTE_FUSED_ATTN": "1", # Enable cuDNN fused attention
6868
"NEMO_LOG_MEMORY_USAGE": "1", # Print memory allocation
69-
"TORCH_NCCL_HIGH_PRIORITY": "1", # Enable high priority for NCCL communication in pytorch
7069
}
7170

7271
custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds

0 commit comments

Comments
 (0)