Add CallbackGroup & Metadata factory function #13437

Draft — wants to merge 43 commits into base: main

Changes from 9 commits (43 commits total)

Commits
642e360
feat: add callback group definition & callback ABC
PytLab May 5, 2025
1badf29
Apply isort and black reformatting
PytLab May 5, 2025
3bf3367
feat: insert callback functions of CallbackGroup
PytLab May 6, 2025
2b51e12
Apply isort and black reformatting
PytLab May 6, 2025
249dad3
chore: PR test for jiashang
liquor233 May 7, 2025
db2b15d
feat: use __init_subclass__ to cover all ModelPT subclasses
PytLab May 12, 2025
d921d64
Apply isort and black reformatting
PytLab May 12, 2025
3e32f1a
feat: Adding metadata config manager poc
May 12, 2025
e1074f6
Apply isort and black reformatting
sajup-oss May 12, 2025
d79f4f1
feat: revert test changes.
liquor233 May 13, 2025
263f7e9
fix: Updating metadata attributes
sajup-oss May 21, 2025
81cd1d9
fix: Merging changes
sajup-oss May 21, 2025
4852936
Apply isort and black reformatting
sajup-oss May 21, 2025
48d6d87
fix: Adding OneloggerCallback
sajup-oss May 22, 2025
2ba6cc5
fix: Reverting changes in examples/multimodal/speech_llm/modular_audi…
sajup-oss May 22, 2025
c908b53
fix: Merge branch 'zshao/add_callback_group' of github.com:NVIDIA/NeM…
sajup-oss May 23, 2025
bd39d8f
Apply isort and black reformatting
sajup-oss May 23, 2025
ba4e4a6
fix: update modular models and megatron GPT models
liquor233 May 26, 2025
515136c
Apply isort and black reformatting
liquor233 May 26, 2025
bc030f7
feat: add on_app_start and on_app_end
liquor233 May 26, 2025
2ed58f4
Apply isort and black reformatting
liquor233 May 26, 2025
35d2f2c
fix: Adding small test example for testing
sajup-oss May 26, 2025
ddc99fb
Apply isort and black reformatting
sajup-oss May 26, 2025
ca6ff4d
fix: Fixing review comments as discussed with Jiashang
May 26, 2025
9f11d01
Apply isort and black reformatting
sajup-oss May 26, 2025
64e0e03
fix: updating nemo code to v2
sajup-oss Jun 13, 2025
181bb3e
fix: updating code to v2
sajup-oss Jun 13, 2025
61d631c
Apply isort and black reformatting
sajup-oss Jun 13, 2025
8eb4fc6
fix: updating wandb to get info from env
sajup-oss Jun 13, 2025
2900246
fix: updating wandb to get info from env
sajup-oss Jun 13, 2025
4acbc2c
Apply isort and black reformatting
sajup-oss Jun 13, 2025
dffccfa
fix: fix som impl issue
liquor233 Jul 4, 2025
60eb727
Apply isort and black reformatting
liquor233 Jul 4, 2025
b97fbda
fix: fix issue for exp manager.
liquor233 Jul 7, 2025
5c144ed
feat: Merge branch 'zshao/add_callback_group' of https://github.com/N…
liquor233 Jul 7, 2025
041a32b
Apply isort and black reformatting
liquor233 Jul 7, 2025
b70f85b
feat: remove callback_group
liquor233 Jul 10, 2025
f473d1b
feat: fix timingtracker issue
liquor233 Jul 10, 2025
1705b19
Apply isort and black reformatting
liquor233 Jul 10, 2025
e6b4e64
feat: fix for startup callbcaks
liquor233 Jul 14, 2025
5b7bd1c
Apply isort and black reformatting
liquor233 Jul 14, 2025
c687003
feat: change to adapter
liquor233 Jul 14, 2025
42181c5
Apply isort and black reformatting
liquor233 Jul 14, 2025
3 changes: 3 additions & 0 deletions README.md
Expand Up @@ -530,3 +530,6 @@ branch](https://github.com/NVIDIA/NeMo/tree/gh-pages-src#readme).
AGREEMENT](https://www.nvidia.com/en-us/data-center/products/nvidia-ai-enterprise/eula/).
By pulling and using the container, you accept the terms and
conditions of this license.


PR test
6 changes: 6 additions & 0 deletions examples/multimodal/speech_llm/modular_audio_gpt_train.py
Expand Up @@ -14,12 +14,15 @@

import torch.multiprocessing as mp
from omegaconf.omegaconf import OmegaConf, open_dict
from one_logger_utils.nemo import OneLoggerNeMoCallback

from nemo.collections.multimodal.speech_llm.models.modular_models import ModularAudioGPTModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronLMPPTrainerBuilder
from nemo.core.config import hydra_runner
from nemo.utils import logging, model_utils
from nemo.utils.callback_group import init_global_callback_group
from nemo.utils.exp_manager import exp_manager
from nemo.utils.meta_info_manager import MetaInfoManager

mp.set_start_method("spawn", force=True)

Expand Down Expand Up @@ -53,6 +56,9 @@ def main(cfg) -> None:
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    one_logger_cb = OneLoggerNeMoCallback(callback_config=MetaInfoManager(cfg).get_metadata())
    init_global_callback_group(callbacks=[one_logger_cb])

    precision = cfg.trainer.precision
    trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer()
    cfg.trainer.precision = precision
Expand Down
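
Reviewer note: init_global_callback_group and MetaInfoManager are imported from nemo.utils here, but their implementations are not part of this 9-commit view. A minimal sketch of what such a factory could look like, assuming it only registers the given callbacks with the CallbackGroup singleton added by this PR (the body below is illustrative, not the PR's actual implementation):

from typing import Iterable

from lightning.pytorch.callbacks import Callback

from nemo.lightning.pytorch.callbacks.callback_group import CallbackGroup


def init_global_callback_group(callbacks: Iterable[Callback]) -> CallbackGroup:
    """Register the given callbacks with the process-wide CallbackGroup singleton."""
    group = CallbackGroup.get_instance()
    for callback in callbacks:
        group.register(callback)
    return group
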
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
)
from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches


__all__ = ["ModularAudioGPTModel", "CrossAttendModularAudioGPTModel"]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,6 @@ def setup_mcore_distributed_parallel(self):
        # by calling model_module.broadcast_params() if the model is randomly initialized.

    def configure_optimizers(self):

        if self.with_distributed_adam and not self.use_mcore_dist_optim:

            # Special handling for embedding grads
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
get_num_microbatches,
)


__all__ = ['MegatronGPTSFTModel']


Expand Down
3 changes: 3 additions & 0 deletions nemo/collections/nlp/parts/megatron_trainer_builder.py
Expand Up @@ -32,6 +32,7 @@
NLPFSDPStrategy,
PipelineMixedPrecisionPlugin,
)
from nemo.lightning.pytorch.callbacks.callback_group import CallbackGroup
from nemo.utils import logging
from nemo.utils.callbacks.dist_ckpt_io import (
AsyncFinalizableCheckpointIO,
Expand Down Expand Up @@ -199,6 +200,7 @@ def create_trainer(self, callbacks=None) -> Trainer:
        precision = self.cfg.trainer.precision
        strategy = self._training_strategy()
        plugins = self._plugins()
        callbacks = (callbacks or []) + list(CallbackGroup.get_instance().callbacks)
        callbacks = self._callbacks(callbacks)
        trainer = Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=callbacks)
        # Restore the precision value after Trainer is built.
Expand Down Expand Up @@ -227,6 +229,7 @@ def _callbacks(self, callbacks: Optional[list]) -> list:
    def create_trainer(self, callbacks=None) -> Trainer:
        strategy = self._training_strategy()
        plugins = self._plugins()
        callbacks = (callbacks or []) + list(CallbackGroup.get_instance().callbacks)
        callbacks = self._callbacks(callbacks)
        return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=callbacks)

Expand Down
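
Because the NeMo Callback base added in this PR subclasses Lightning's Callback, everything registered with the group is also a valid Trainer callback, which is why create_trainer can simply extend its callback list with the group's callbacks. A short illustrative sketch of that flow (PrintingCallback is a made-up example, not part of the PR):

import lightning.pytorch as pl

from nemo.lightning.pytorch.callbacks.callback_group import Callback, CallbackGroup


class PrintingCallback(Callback):
    def on_model_init_start(self):
        # NeMo-specific hook, fired through CallbackGroup
        print("model init starting")

    def on_train_start(self, trainer, pl_module):
        # standard Lightning hook, fired by the Trainer
        print("training starting")


group = CallbackGroup.get_instance()
group.register(PrintingCallback())

# The same objects can be handed to a Lightning Trainer, which is effectively what
# create_trainer() does when it appends group.callbacks before building the Trainer.
trainer = pl.Trainer(callbacks=list(group.callbacks))
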
4 changes: 4 additions & 0 deletions nemo/core/classes/common.py
Expand Up @@ -15,6 +15,7 @@

"""Interfaces common to all Neural Modules and Models."""
from __future__ import annotations

import copy
import hashlib
import inspect
Expand Down Expand Up @@ -42,6 +43,7 @@
from nemo.core.config.templates.model_card import NEMO_DEFAULT_MODEL_CARD_TEMPLATE
from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
from nemo.core.neural_types import NeuralType, NeuralTypeComparisonResult
from nemo.lightning.pytorch.callbacks.callback_group import CallbackGroup
from nemo.utils import logging
from nemo.utils.cloud import maybe_download_from_cloud
from nemo.utils.data_utils import resolve_cache_dir
Expand Down Expand Up @@ -739,6 +741,7 @@ def from_pretrained(
        Returns:
            A model instance of a particular model class or its underlying config (if return_config is set).
        """
        CallbackGroup.get_instance().on_load_checkpoint_start()
        if save_restore_connector is None:
            save_restore_connector = SaveRestoreConnector()

Expand Down Expand Up @@ -772,6 +775,7 @@ def from_pretrained(
            trainer=trainer,
            save_restore_connector=save_restore_connector,
        )
        CallbackGroup.get_instance().on_load_checkpoint_end()
        return instance

@classmethod
Expand Down
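
The new hooks around from_pretrained make it possible to measure restore time without touching model code. A minimal sketch of such a callback, assuming the callback_group module path added in this PR (LoadTimer itself is illustrative, not part of the PR):

import time

from nemo.lightning.pytorch.callbacks.callback_group import Callback, CallbackGroup


class LoadTimer(Callback):
    """Records how long checkpoint/model restoration takes."""

    def on_load_checkpoint_start(self) -> None:
        self._start = time.perf_counter()

    def on_load_checkpoint_end(self) -> None:
        print(f"restore took {time.perf_counter() - self._start:.1f}s")


CallbackGroup.get_instance().register(LoadTimer())
# Any subsequent ModelPT.from_pretrained(...) call now reports its restore time.
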
5 changes: 5 additions & 0 deletions nemo/core/classes/modelPT.py
Expand Up @@ -47,6 +47,7 @@
from nemo.core.classes.common import Model
from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
from nemo.core.optim import McoreDistributedOptimizer, prepare_lr_scheduler
from nemo.lightning.pytorch.callbacks.callback_group import CallbackGroup, wrap_methods_with_callbacks

Check notice — Code scanning / CodeQL: Unused import
Import of 'CallbackGroup' is not used in this file; the suggested fix is to import only wrap_methods_with_callbacks from nemo.lightning.pytorch.callbacks.callback_group.
from nemo.utils import logging, model_utils
from nemo.utils.app_state import AppState
from nemo.utils.debug_hook import register_debug_hooks
Expand Down Expand Up @@ -224,6 +225,7 @@

    def __init_subclass__(cls) -> None:
        cls._save_restore_connector = SaveRestoreConnector()
        wrap_methods_with_callbacks(cls)

    def on_fit_start(self) -> None:
        if self.cfg.get("dump_debug_info", False):
Expand Down Expand Up @@ -2126,3 +2128,6 @@
            return copy.deepcopy(optim_config)
        else:
            return OmegaConf.create(optim_config)


ModelPT = wrap_setup_training_data(ModelPT)
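
Because __init_subclass__ runs when each ModelPT subclass is defined, the wrap rules are applied automatically at class-definition time. A standalone sketch of the mechanism using a plain class instead of a real ModelPT subclass (ToyModel and HookLogger are illustrative):

from nemo.lightning.pytorch.callbacks.callback_group import (
    Callback,
    CallbackGroup,
    wrap_methods_with_callbacks,
)


class HookLogger(Callback):
    def on_dataloader_init_start(self):
        print("dataloader init: start")

    def on_dataloader_init_end(self):
        print("dataloader init: end")


class ToyModel:
    """Stand-in for a ModelPT subclass; ModelPT.__init_subclass__ normally does the wrapping."""

    def setup_training_data(self, train_data_config=None):
        print("building training dataloader")


CallbackGroup.get_instance().register(HookLogger())
wrap_methods_with_callbacks(ToyModel)  # applied automatically for real ModelPT subclasses

ToyModel().setup_training_data()
# prints: dataloader init: start / building training dataloader / dataloader init: end
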
190 changes: 190 additions & 0 deletions nemo/lightning/pytorch/callbacks/callback_group.py
@@ -0,0 +1,190 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
from typing import Callable

from lightning.pytorch.callbacks import Callback


class CallbackGroup:
    """A class for hosting a collection of callback objects.

    It dispatches a single call to every registered callback: calling
    ``callback_group.func(*args, **kwargs)`` loops over ``self._callbacks`` and invokes
    ``func`` with the same arguments on each callback, in registration order. Every
    registered callback must therefore provide the method with a matching signature.

    Attributes:
        _callbacks (list[Callback]): List of callback objects.
    """

    _instance = None

    @classmethod
    def get_instance(cls) -> 'CallbackGroup':
        """Get the singleton instance of the CallbackGroup.

        Returns:
            CallbackGroup: The singleton instance of the CallbackGroup.
        """
        if cls._instance is None:
            cls._instance = CallbackGroup()
        return cls._instance

    def __init__(self) -> None:
        """Initializes the list of callback objects."""
        self._callbacks = []

    def register(self, callback: Callback) -> None:
        """Register a callback to the callback group.

        Args:
            callback (Callback): The callback to register.
        """
        self._callbacks.append(callback)

    def __getattr__(self, method_name: str) -> Callable:
        """Returns a wrapper that calls the named method on every registered callback.

        Args:
            method_name (str): Callback method name.
        """

        def multi_callback_wrapper(*args, **kwargs) -> None:
            for callback in self._callbacks:
                assert hasattr(callback, method_name)
                method = getattr(callback, method_name)
                assert callable(method)
                _ = method(*args, **kwargs)

        return multi_callback_wrapper

    @property
    def callbacks(self):
        """Return callbacks in registration order.

        Returns:
            list: callback objects
        """
        return self._callbacks


class Callback(Callback):
    """Base class for NeMo callbacks.

    It inherits from the PyTorch Lightning ``Callback``, so an instance can also be passed
    directly to a PTL trainer and reused there. The methods below are the extra
    NeMo-specific hooks.
    """

    def on_dataloader_init_start(self):
        """Called at the start of the data loading."""

    def on_dataloader_init_end(self):
        """Called at the end of the data loading."""

    def on_model_init_start(self):
        """Called at the start of the model initialization."""

    def on_model_init_end(self):
        """Called at the end of the model initialization."""

    def on_optimizer_init_start(self) -> None:
        """Called at the beginning of optimizer initialization."""

    def on_optimizer_init_end(self) -> None:
        """Called at the end of optimizer initialization."""

    def on_load_checkpoint_start(self) -> None:
        """Called at the beginning of loading a checkpoint."""

    def on_load_checkpoint_end(self) -> None:
        """Called at the end of loading a checkpoint."""

    def on_save_checkpoint_start(self, iteration: int = 0) -> None:
        """Called when saving a checkpoint starts."""

    def on_save_checkpoint_end(self, iteration: int = 0) -> None:
        """Called when the synchronous part of a checkpoint save ends."""

    def on_save_checkpoint_success(self, iteration: int = 0) -> None:
        """Called when a checkpoint is saved successfully."""


CB_WRAP_RULES = {
    # Maps the name of the method to wrap to the callback-group hooks that should fire
    # before ("start_hook") and after ("end_hook") the original method runs:
    #     method_name: {"start_hook": callback_method_name, "end_hook": callback_method_name}
    "setup_training_data": {"start_hook": "on_dataloader_init_start", "end_hook": "on_dataloader_init_end"},
    "setup_optimization": {"start_hook": "on_optimizer_init_start", "end_hook": "on_optimizer_init_end"},
    "restore_from_pretrained_models": {"start_hook": "on_load_checkpoint_start", "end_hook": "on_load_checkpoint_end"},
    "__init__": {"start_hook": "on_model_init_start", "end_hook": "on_model_init_end"},
    "configure_optimizers": {"start_hook": "on_optimizer_init_start", "end_hook": "on_optimizer_init_end"},
    "setup_training_dataloader": {"start_hook": "on_dataloader_init_start", "end_hook": "on_dataloader_init_end"},
}


def _make_callback_wrapped_method(original_method):
    """Wrap a method with the start and end hooks of the callback group.

    Args:
        original_method (Callable): The original method (or classmethod) to wrap.

    Returns:
        Callable: The wrapped method, or the original method if no wrap rule applies.
    """
    callback_group = CallbackGroup.get_instance()

    is_classmethod = isinstance(original_method, classmethod)
    # classmethod objects are not directly callable; unwrap to the underlying function.
    func = original_method.__func__ if is_classmethod else original_method

    hooks = CB_WRAP_RULES.get(func.__name__)
    if not hooks:
        return original_method

    @functools.wraps(func)
    def wrapped_method(*args, **kwargs):
        if hasattr(callback_group, hooks["start_hook"]):
            getattr(callback_group, hooks["start_hook"])()
        result = func(*args, **kwargs)
        if hasattr(callback_group, hooks["end_hook"]):
            getattr(callback_group, hooks["end_hook"])()
        return result

    return classmethod(wrapped_method) if is_classmethod else wrapped_method


def wrap_methods_with_callbacks(cls) -> None:
    """Wrap class/instance methods with the start and end hooks of the callback group.

    Args:
        cls (type): The class whose methods are wrapped in place.
    """
    for method_name in CB_WRAP_RULES:
        if method_name in cls.__dict__:
            original_method = cls.__dict__[method_name]
            # cls.__dict__ is a read-only mappingproxy, so assign through setattr.
            setattr(cls, method_name, _make_callback_wrapped_method(original_method))
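
A short usage sketch of the dispatch behavior described in the CallbackGroup docstring (the callback classes are illustrative): one call on the group fans out to every registered callback in registration order.

from nemo.lightning.pytorch.callbacks.callback_group import Callback, CallbackGroup


class ConsoleLogger(Callback):
    def on_model_init_start(self):
        print("[console] model init starting")


class MetricsLogger(Callback):
    def on_model_init_start(self):
        print("[metrics] model init starting")


group = CallbackGroup.get_instance()
group.register(ConsoleLogger())
group.register(MetricsLogger())

# CallbackGroup.__getattr__ returns a wrapper that calls the named method on each callback.
group.on_model_init_start()
# prints: [console] model init starting, then [metrics] model init starting

The singleton keeps the hooks reachable from deep call sites such as checkpointing and from_pretrained without threading a callback object through every API; the trade-off is process-global state, which tests may need to account for.
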
5 changes: 5 additions & 0 deletions nemo/lightning/pytorch/callbacks/model_checkpoint.py
Expand Up @@ -29,6 +29,7 @@

from nemo.lightning.ckpt_utils import ckpt_to_dir
from nemo.lightning.io.pl import TrainerContext
from nemo.lightning.pytorch.callbacks.callback_group import CallbackGroup
from nemo.utils import logging
from nemo.utils.app_state import AppState

Expand Down Expand Up @@ -567,6 +568,7 @@ def _save_checkpoint(self, trainer: 'lightning.pytorch.Trainer', filepath: str)
ValueError: (mcore) async_save with EMA not supported
ValueError: (mcore) Async save requires async compatible CheckpointIO
"""
CallbackGroup.get_instance().on_save_checkpoint_start()

from nemo.utils.get_rank import is_global_rank_zero

Expand Down Expand Up @@ -598,6 +600,7 @@ def _save_checkpoint(self, trainer: 'lightning.pytorch.Trainer', filepath: str)
rank_zero_info(f"Saving EMA weights to separate checkpoint {filepath}")
super()._save_checkpoint(trainer, filepath)
self.remove_checkpoint_unfinished_marker(filepath, barrier_before=True)
CallbackGroup.get_instance().on_save_checkpoint_success(iteration=trainer.global_step)
else:
# Determine whether to include optimizer states in the checkpoint
# optimizer states are included when
Expand Down Expand Up @@ -632,6 +635,7 @@ def _save_checkpoint(self, trainer: 'lightning.pytorch.Trainer', filepath: str)
logging.info(f'Scheduled async checkpoint save for {filepath}')
else:
finalize_fn()
CallbackGroup.get_instance().on_save_checkpoint_end(iteration=trainer.global_step)

def _get_finalize_save_checkpoint_callback(
self, trainer: 'lightning.pytorch.Trainer', filepath: str, global_step: int
Expand All @@ -655,6 +659,7 @@ def _cb():
return

logging.info(f'Async checkpoint save for step {global_step} ({filepath}) finalized successfully.')
CallbackGroup.get_instance().on_save_checkpoint_success(iteration=trainer.global_step)

if str(filepath) in self.ckpts_to_link:
self._link_checkpoint(trainer, filepath, self.ckpts_to_link.pop(filepath), override_async=True)
Expand Down
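
These hooks bracket both the synchronous and the async checkpoint paths, so a callback can measure end-to-end save latency per step. A minimal illustrative sketch (SaveTimer is not part of the PR; it follows the hook signatures declared in callback_group.py):

import time

from nemo.lightning.pytorch.callbacks.callback_group import Callback, CallbackGroup


class SaveTimer(Callback):
    """Logs how long each checkpoint save takes, including async finalization."""

    def on_save_checkpoint_start(self, iteration: int = 0) -> None:
        self._start = time.perf_counter()

    def on_save_checkpoint_success(self, iteration: int = 0) -> None:
        print(f"checkpoint for step {iteration} saved in {time.perf_counter() - self._start:.1f}s")


CallbackGroup.get_instance().register(SaveTimer())
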