
Commit a3e2222

Merge branch 'ko3n1g/chore/release-benchmarks-dev' into 'main'

ci: Benchmark release tests suite with TE2.2 on H100

See merge request ADLR/megatron-lm!3458

2 parents: 0dea9a5 + 80d66ec

File tree: 34 files changed (+17, -12 lines)


tests/functional_tests/shell_test_utils/run_ci_test.sh

Lines changed: 10 additions & 7 deletions

@@ -2,6 +2,12 @@
 
 set -exo pipefail
 
+# Increase soft limit for number of open files to match hard limit
+ulimit -Sn $(ulimit -Hn)
+
+# Increase soft limit for number of processes to match hard limit
+ulimit -Su $(ulimit -Hu)
+
 echo "------ARGUMENTS LIST --------"
 # Use eval to properly handle quoted arguments
 eval "set -- $@"
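Raising the soft limits for open files and processes up to the hard limits is a common guard against "Too many open files" and process-limit failures in multi-worker training jobs. A quick, hypothetical check (not part of the commit), assuming a bash shell:

# Hypothetical verification: after the two ulimit calls above, soft limits should equal hard limits.
echo "open files: soft=$(ulimit -Sn) hard=$(ulimit -Hn)"
echo "processes:  soft=$(ulimit -Su) hard=$(ulimit -Hu)"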
@@ -17,7 +23,7 @@ for ARGUMENT in "$@"; do
     VALUE="${VALUE#\'}"
 
     # Properly quote the value to preserve spaces and special characters
-    export "$KEY"="$VALUE"
+    export "$KEY"="$(eval echo $VALUE)"
     echo "$KEY=$VALUE"
 done
 echo "---------------------------------"
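With export "$KEY"="$(eval echo $VALUE)", variable references embedded in a KEY=VALUE argument are expanded before the value is exported, so a value passed on the command line can itself reference environment variables (the same ${VAR} placeholder style used in the model configs below). A minimal sketch of the behavioural difference; the variable names here are hypothetical, not from the commit:

# Hypothetical illustration of the old vs. new export.
export TENSORBOARD_PATH=/results/tensorboard
VALUE='${TENSORBOARD_PATH}/run1'

export OLD_DIR="$VALUE"                # old: keeps the literal text '${TENSORBOARD_PATH}/run1'
export NEW_DIR="$(eval echo $VALUE)"   # new: expands the reference first
echo "$OLD_DIR"                        # prints ${TENSORBOARD_PATH}/run1
echo "$NEW_DIR"                        # prints /results/tensorboard/run1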
@@ -69,7 +75,7 @@ IS_NEMO_TEST=$([[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') =
 export IS_NEMO_TEST
 
 # Adjust model_config for lightweight mode
-if [[ "$MODE" == "pretraining" ]]; then
+if [[ "$MODE" == "pretraining" && "$TEST_TYPE" != "release" ]]; then
     if [[ "$ENABLE_LIGHTWEIGHT_MODE" == "true" && "$IS_NEMO_TEST" == "true" ]]; then
         yq -i '.MODEL_ARGS."trainer.max_steps" = 2' $TRAINING_PARAMS_PATH
         TRAIN_ITERS=$(cat $TRAINING_PARAMS_PATH |
@@ -132,12 +138,9 @@ for i in $(seq 1 $N_REPEAT); do
 
     bash $ROOT_DIR/tests/functional_tests/shell_test_utils/_run_training.sh
 
-    IS_FROZEN_RESUME_BUT_NO_CHECKPOINT=$([[ "$TEST_TYPE" = "frozen-resume" && -z "$(ls -A "$_CHECKPOINT_LOAD_PATH" 2>/dev/null)" ]] && echo "true" || echo "false")
+    if [[ "$TEST_TYPE" = "frozen-resume" && -z "$(ls -A "$_CHECKPOINT_LOAD_PATH" 2>/dev/null)" ]]; then
+        echo "No frozen checkpoint found. Will skip second run."
 
-    if [[ "$IS_FROZEN_RESUME_BUT_NO_CHECKPOINT" == "true" && ${RECORD_CHECKPOINTS} != "true" ]]; then
-        echo "No frozen checkpoint found, but test type is frozen-resume. Will abort."
-        exit 1
-    elif [[ "$IS_FROZEN_RESUME_BUT_NO_CHECKPOINT" == "true" && ${RECORD_CHECKPOINTS} == "true" ]]; then
         export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
         rm -rf "$CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"
         echo $((TRAIN_ITERS / 2)) >$CHECKPOINT_SAVE_PATH/latest_checkpointed_iteration.txt
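The rewritten frozen-resume block no longer aborts when no frozen checkpoint exists; it logs that the second run will be skipped, then keeps the first run's checkpoints, deletes the final iteration directory, and rewinds latest_checkpointed_iteration.txt to the midpoint. A hypothetical walk-through of that rewind, with numbers invented for illustration:

# Hypothetical values, not from the commit: what the rewind does for a 100-iteration run.
CHECKPOINT_SAVE_PATH=/results/checkpoints   # hypothetical path
TRAIN_ITERS=100                             # hypothetical iteration count
echo "deleted: $CHECKPOINT_SAVE_PATH/iter_0000$TRAIN_ITERS"   # /results/checkpoints/iter_0000100
echo "rewound to iteration: $((TRAIN_ITERS / 2))"             # 50, written to latest_checkpointed_iteration.txt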

tests/functional_tests/test_cases/bert/bert_release/golden_values_0.13.0_dgx_h100_dev.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/golden_values_0.13.0_dgx_h100_dev.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release/model_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@ MODEL_ARGS:
   --sequence-parallel: true
   --disable-bias-linear: true
   --micro-batch-size: 4
-  --rampup-batch-size: "384 384 97656250"
+  --rampup-batch-size: "[384 384 97656250]"
   --global-batch-size: 1152
   --train-samples: 19531250
   --manual-gc: true
@@ -81,7 +81,7 @@ MODEL_ARGS:
   --log-validation-ppl-to-tensorboard: true
   --log-throughput: true
   --log-interval: 100
-  --tensorboard-dir: ${OUTPUT_PATH}/tensorboard
+  --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   # Add mixed precision args
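The tensorboard destination now comes from a dedicated TENSORBOARD_PATH variable rather than a path derived from OUTPUT_PATH. As a hypothetical sketch only (not necessarily how the harness resolves it), a ${VAR}-style placeholder read from this YAML can be expanded against the environment like this, assuming mikefarah yq v4:

# Hypothetical resolution of a placeholder value from the config file.
export TENSORBOARD_PATH=/results/tensorboard
RAW=$(yq '.MODEL_ARGS."--tensorboard-dir"' model_config.yaml)   # ${TENSORBOARD_PATH}
RESOLVED=$(eval echo "$RAW")                                    # /results/tensorboard
echo "$RESOLVED"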

tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml

Lines changed: 3 additions & 3 deletions

@@ -68,8 +68,8 @@ MODEL_ARGS:
   --eval-iters: 32
   --eval-interval: 2000
   # Add checkpointing args
-  --load: ${OUTPUT_PATH}/checkpoints
-  --save: ${OUTPUT_PATH}/checkpoints
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
   --save-interval: 1000
   # Add initialization args
   --init-method-std: 0.0134
@@ -81,7 +81,7 @@ MODEL_ARGS:
   --log-validation-ppl-to-tensorboard: true
   --log-throughput: true
   --log-interval: 100
-  --tensorboard-dir: ${OUTPUT_PATH}/tensorboard
+  --tensorboard-dir: ${TENSORBOARD_PATH}
   --wandb-project: megatron-core-release-runs
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   # Add mixed precision args
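Splitting --save and --load onto CHECKPOINT_SAVE_PATH and CHECKPOINT_LOAD_PATH matches the variables the frozen-resume logic in run_ci_test.sh manipulates. A hypothetical check in the same style as that script, showing how an empty load directory would be detected (the path is invented for illustration):

# Hypothetical: detect a missing frozen checkpoint the same way run_ci_test.sh does.
CHECKPOINT_LOAD_PATH=/results/frozen-checkpoints
if [[ -z "$(ls -A "$CHECKPOINT_LOAD_PATH" 2>/dev/null)" ]]; then
    echo "No frozen checkpoint found."
fi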
