#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#SBATCH --nodes=1 # number of nodes to use
#SBATCH --job-name=full_ft # name of your job
#SBATCH --exclusive # job has exclusive use of the resource, no sharing
set -ex;
###########################
###### User Variables #####
###########################
GPUS_PER_NODE=4 # 4 for G5.12x, 8 for P4/P5
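## Optionally, the GPU count could be detected at runtime instead of hard-coded;
## a sketch, not part of the original recipe (assumes nvidia-smi is on PATH):
# GPUS_PER_NODE=$(nvidia-smi --list-gpus | wc -l)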
###########################
## Environment Variables ##
###########################
## EFA-related environment variables
## Comment these out for non-EFA instances (G4dn, P3)
## For G5.12xlarge, comment out the RDMA and fork-safe settings
## For G4dn and other G5 sizes, comment out all of them
# export FI_EFA_USE_DEVICE_RDMA=1 # use for P4d
# export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa
export NCCL_DEBUG=INFO
## Setting FI_EFA_SET_CUDA_SYNC_MEMOPS to zero can boost throughput with FSDP:
## it disables CU_POINTER_ATTRIBUTE_SYNC_MEMOPS and reduces memory synchronizations
## https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
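## Optional sanity check before launching (a sketch, assuming libfabric's fi_info
## utility is installed): confirm the EFA provider is actually visible.
# fi_info -p efa -t FI_EP_RDM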
###########################
####### Torch Dist ########
###########################
declare -a TORCHRUN_ARGS=(
    --nproc_per_node=$GPUS_PER_NODE
    --nnodes=$SLURM_JOB_NUM_NODES
    --rdzv_id=$SLURM_JOB_ID
    --rdzv_backend=c10d
    --rdzv_endpoint=$(hostname)
)
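## For multi-node jobs, the rendezvous endpoint could be pinned to the first node
## of the allocation instead of $(hostname); a sketch using standard Slurm tooling
## (HEAD_NODE is a hypothetical variable name, not part of the original script):
# HEAD_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# then pass --rdzv_endpoint=$HEAD_NODE:29500 in TORCHRUN_ARGS above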
export TORCHTUNE=./pt_torchtune/bin/tune   # tune CLI from the torchtune virtual environment
export TRAIN_CONFIG=./llama2_7B_full.yaml  # torchtune config for Llama 2 7B full finetuning
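## The YAML above can be seeded from torchtune's built-in configs, e.g. (assuming
## the installed torchtune ships the llama2/7B_full config):
# ./pt_torchtune/bin/tune cp llama2/7B_full ./llama2_7B_full.yaml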
srun -l ${TORCHTUNE} run "${TORCHRUN_ARGS[@]}" full_finetune_distributed --config ${TRAIN_CONFIG}
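## Typical usage (a sketch): submit with sbatch and follow the default Slurm output file.
# sbatch 2.full_finetune_distributed.sbatch
# tail -f slurm-<jobid>.out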