Commit b6d2186
fix main_train minor issue
1 parent 188f1b6

8 files changed: +24 −8 lines

README.md (+15 −4)

@@ -110,10 +110,21 @@ For APOLLO and APOLLO-Mini, we have the following arguments
   - **`channel`**: Applies gradient scaling at the channel level (APOLLO)
   - **`tensor`**: Applies gradient scaling at the tensor level (APOLLO-Mini).
 
-#### `scale`
-- Governs the scaling factor for gradient updates. Can be tuned for better performance.
-  - `1` for APOLLO by default (validated on A100).
-  - `128` for APOLLO-Mini by default. You can scale it larger, especially when the model is large.
+#### **`scale`**
+The `scale` parameter plays a crucial role in heuristically adjusting gradient updates to compensate for the scaling-factor approximation error that arises from using a lower rank. Proper tuning of this parameter can significantly improve performance:
+- **`1`**: Default value for APOLLO (validated on A100 GPUs).
+- **`128`**: Default value for APOLLO-Mini. For larger models, experimenting with higher values is recommended.
+
+#### `--scale_front`
+To stabilize training, we adopt the **Norm-Growth Limiter (NL)** from [Fira](https://github.com/xichen-fy/Fira), which has proven slightly more effective than traditional gradient clipping.
+
+There are two ways to apply the Norm-Growth Limiter, depending on when it runs relative to the heuristic scaling (`scale`):
+1. **After scaling**: NL is applied after the gradient is multiplied by `scale`.
+   - Recommended for smaller models or when training involves fewer warmup steps.
+   - Enable this by setting `--scale_front`.
+2. **Before scaling**: NL is applied before the gradient is scaled.
+   - With sufficient warmup steps, both methods yield similar performance for large models.
 
 ---
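The two orderings described in the README can be sketched as follows. This is a minimal illustration, not the actual `APOLLOAdamW` implementation: the names `norm_growth_limiter` and `apply_update` and the growth ratio `gamma=1.01` are hypothetical, chosen only to show how `--scale_front` reorders the limiter and the heuristic scaling.

```python
import numpy as np

def norm_growth_limiter(grad, prev_norm, gamma=1.01):
    # Cap how fast the gradient norm may grow between steps:
    # if the new norm exceeds gamma * prev_norm, shrink the
    # gradient back down to that bound (hedged sketch of Fira's NL).
    cur_norm = np.linalg.norm(grad)
    if prev_norm is not None and cur_norm > gamma * prev_norm:
        grad = grad * (gamma * prev_norm / cur_norm)
    return grad

def apply_update(grad, prev_norm, apollo_scale, scale_front):
    if scale_front:
        # --scale_front: multiply by `scale` first, then limit norm growth
        return norm_growth_limiter(grad * apollo_scale, prev_norm)
    # default: limit norm growth first, then multiply by `scale`
    return norm_growth_limiter(grad, prev_norm) * apollo_scale
```

With a large `scale` (e.g. APOLLO-Mini's 128), the two orderings differ sharply: limiting after scaling clamps the final update norm, while limiting before scaling still multiplies the clamped gradient by `scale`.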

main_pretrain.py (+1 −1)

@@ -289,7 +289,7 @@ def main(args):
     total_svd_count = 0
 
     for batch_idx, batch in enumerate(dataloader):
-        if update_step != 0 and batch_idx <= args.gradient_accumulation * update_step:
+        if update_step != 0 and batch_idx < args.gradient_accumulation * update_step:
             continue # skipping learned data when resuming from checkpointing
 
         global_step += 1
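The one-character fix above changes which batches are skipped on resume: after `update_step` optimizer updates with `gradient_accumulation`-way accumulation, exactly `gradient_accumulation * update_step` batches were consumed, so `<` resumes at the first unseen batch while the old `<=` also dropped that batch. A stand-alone sketch of the condition (the helper name is hypothetical, not code from the repo):

```python
def skipped_batches(num_batches, gradient_accumulation, update_step, strict):
    # Model the resume-skip condition: strict=True is the fixed `<`
    # comparison, strict=False the old off-by-one `<=` comparison.
    boundary = gradient_accumulation * update_step
    skipped = []
    for batch_idx in range(num_batches):
        if update_step != 0 and (batch_idx < boundary if strict else batch_idx <= boundary):
            skipped.append(batch_idx)
    return skipped

# After 2 updates with 4-way accumulation, batches 0..7 were consumed.
fixed = skipped_batches(12, gradient_accumulation=4, update_step=2, strict=True)
old = skipped_batches(12, gradient_accumulation=4, update_step=2, strict=False)
# `fixed` skips exactly batches 0..7; the old comparison also dropped batch 8
```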

scripts/pretrain_c4/llama_130m_apollo.sh (+2 −1)

@@ -1,5 +1,5 @@
 # LLaMA-130M, APOLLO, 4 A100, 1 Node
-num_rank=256
+num_rank=192 # exactly 1/4 of the LLaMA-130M model dimension
 scale_type=channel
 proj_type=random
 apollo_scale=1 # A6000 uses a smaller one to avoid loss spikes
@@ -14,6 +14,7 @@ torchrun --standalone --nproc_per_node 4 main_pretrain.py \
     --warmup_steps 2000 \
     --num_training_steps 20000 \
     --optimizer apollo_adamw \
+    --scale_front \
     --apollo_scale ${apollo_scale} \
     --rank ${num_rank} \
     --scale_type ${scale_type} \

scripts/pretrain_c4/llama_130m_apollo_mini.sh (+2 −1)

@@ -2,7 +2,7 @@
 num_rank=1
 scale_type=tensor
 proj_type=random
-apollo_scale=128
+apollo_scale=192.0 # exactly 1/4 of the LLaMA-130M model dimension
 
 torchrun --standalone --nproc_per_node 4 main_pretrain.py \
     --model_config configs/llama_130m.json \
@@ -14,6 +14,7 @@ torchrun --standalone --nproc_per_node 4 main_pretrain.py \
     --warmup_steps 2000 \
     --num_training_steps 20000 \
     --optimizer apollo_adamw \
+    --scale_front \
     --apollo_scale ${apollo_scale} \
     --rank ${num_rank} \
     --scale_type ${scale_type} \

scripts/pretrain_c4/llama_60m_apollo.sh (+1)

@@ -14,6 +14,7 @@ torchrun --standalone --nproc_per_node 1 main_pretrain.py \
     --warmup_steps 1000 \
     --num_training_steps 10000 \
     --optimizer apollo_adamw \
+    --scale_front \
     --apollo_scale ${apollo_scale} \
     --rank ${num_rank} \
     --scale_type ${scale_type} \

scripts/pretrain_c4/llama_60m_apollo_mini.sh (+1)

@@ -16,6 +16,7 @@ torchrun --standalone --nproc_per_node 1 main_pretrain.py \
     --warmup_steps 1000 \
     --num_training_steps 10000 \
     --optimizer apollo_adamw \
+    --scale_front \
     --apollo_scale ${apollo_scale} \
     --rank ${num_rank} \
     --scale_type ${scale_type} \

utils/argparse.py (+1)

@@ -77,6 +77,7 @@ def parse_args(args):
     parser.add_argument("--proj", type=str, default="random") # "random" or "svd"
     parser.add_argument("--scale_type", type=str, default="tensor") # "tensor" or "channel"
     parser.add_argument("--apollo_scale", type=float, default=1.0) # scale for gradient scaling factor
+    parser.add_argument("--scale_front", action='store_true') # apply the Norm-Growth Limiter after (rather than before) scaling the gradient with apollo_scale
 
     args = parser.parse_args(args)
     args = check_args_torchrun_main(args)
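A quick stand-alone mirror of the new flag, showing the `store_true` semantics. This re-creates only the two relevant arguments, not the project's full parser:

```python
import argparse

# Minimal mirror of the parser additions above.
parser = argparse.ArgumentParser()
parser.add_argument("--apollo_scale", type=float, default=1.0)
parser.add_argument("--scale_front", action="store_true")

# Present flag -> True; absent flag -> the default False.
with_flag = parser.parse_args(["--apollo_scale", "192.0", "--scale_front"])
without_flag = parser.parse_args([])
```

Because `store_true` defaults to `False`, existing scripts that omit `--scale_front` keep the old (limit-before-scale) behavior unchanged.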

utils/setup.py (+1 −1)

@@ -147,7 +147,7 @@ def setup_optimization(args, model, trainable_params, param_groups, id_lowrank_p
         optimizer = GaLoreAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay)
 
     elif args.optimizer.lower() == "apollo_adamw":
-        optimizer = APOLLOAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay)
+        optimizer = APOLLOAdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay, scale_front=args.scale_front)
 
     elif args.optimizer.lower() == "q_apollo":
         optimizer = QAPOLLOAdamW(
