-
Notifications
You must be signed in to change notification settings - Fork 3k
Add CallbackGroup & Metadata factory function #13437
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
642e360
1badf29
3bf3367
2b51e12
249dad3
db2b15d
d921d64
3e32f1a
e1074f6
d79f4f1
263f7e9
81cd1d9
4852936
48d6d87
2ba6cc5
c908b53
bd39d8f
ba4e4a6
515136c
bc030f7
2ed58f4
35d2f2c
ddc99fb
ca6ff4d
9f11d01
64e0e03
181bb3e
61d631c
8eb4fc6
2900246
4acbc2c
dffccfa
60eb727
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
from nemo.lightning.fabric.plugins import FabricMegatronMixedPrecision | ||
from nemo.lightning.fabric.strategies import FabricMegatronStrategy | ||
from nemo.lightning.nemo_logger import NeMoLogger | ||
from nemo.lightning.one_logger_callback import OneLoggerNeMoCallback | ||
from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint | ||
from nemo.lightning.pytorch.optim import ( | ||
LRSchedulerModule, | ||
|
@@ -72,6 +73,7 @@ def _is_slurm_interactive_mode(): | |
"lr_scheduler", | ||
"NeMoLogger", | ||
"ModelCheckpoint", | ||
"OneLoggerNeMoCallback", | ||
"OptimizerModule", | ||
"Trainer", | ||
"configure_no_restart_validation_training_loop", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -165,6 +165,31 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = | |
self._setup_trainer_loggers(trainer, _dir, version) | ||
self._setup_trainer_model_checkpoint(trainer, log_dir=log_dir, ckpt=self.ckpt) | ||
|
||
# Configure OneLogger callback | ||
try: | ||
from omegaconf import OmegaConf | ||
|
||
from nemo.utils.exp_manager import configure_onelogger | ||
|
||
# Create a minimal config for OneLogger | ||
cfg = OmegaConf.create( | ||
{ | ||
"exp_manager": { | ||
"wandb_logger_kwargs": { | ||
"project": "nemo_experiments", | ||
"name": self.name, | ||
"id": version or None, | ||
} | ||
} | ||
} | ||
) | ||
|
||
# Configure OneLogger | ||
configure_onelogger(cfg, trainer) | ||
logging.info("OneLogger configured successfully") | ||
except Exception as e: | ||
logging.warning(f"Failed to configure OneLogger: {e}") | ||
|
||
self._setup_files_to_move(log_dir, app_state) | ||
self._setup_file_logging(log_dir) | ||
|
||
|
Original file line number | Diff line number | Diff line change | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,150 @@ | |||||||||||||||||
""" | |||||||||||||||||
OneLogger callback for NeMo training. | |||||||||||||||||
|
|||||||||||||||||
This module provides a callback that integrates OneLogger telemetry with NeMo training. | |||||||||||||||||
""" | |||||||||||||||||
|
|||||||||||||||||
import functools | |||||||||||||||||
import logging | |||||||||||||||||
Check notice — Code scanning / CodeQL: Unused import (Note)
Import of 'logging' is not used.
Copilot Autofix (AI, 2 days ago): To fix the issue, the unused `logging` import should be removed.
Suggested changeset
1
nemo/lightning/one_logger_callback.py
Copilot is powered by AI and may make mistakes. Always verify output.
Unable to commit as this autofix suggestion is now outdated
Positive Feedback | Negative Feedback
Refresh and try again.
|
|||||||||||||||||
from typing import Any, Dict, List, Optional, Type | |||||||||||||||||
|
|||||||||||||||||
import nv_one_logger.training_telemetry.api.callbacks as CB | |||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Eric: it is better to publish to public index, if we have difficulties, make sure this won't fail with import guard, and maybe ref to the open sourced one logger repo. |
|||||||||||||||||
import pytorch_lightning as pl | |||||||||||||||||
|
@@ -11,3 +11,3 @@ | ||
import nv_one_logger.training_telemetry.api.callbacks as CB | ||
import pytorch_lightning as pl | ||
|
||
import torch |
Fixed
Show fixed
Hide fixed
Check notice
Code scanning / CodeQL
Unused import Note
Show autofix suggestion
Hide autofix suggestion
Copilot Autofix
AI 2 days ago
To fix the problem, the unused `torch` import on line 13 should be removed. This will eliminate the unnecessary dependency and make the code cleaner. No other changes are required, since removing this import does not affect the functionality of the code.
-
Copy modified line R13
@@ -12,3 +12,3 @@ | ||
import pytorch_lightning as pl | ||
import torch | ||
|
||
from pytorch_lightning import Trainer |
Check notice
Code scanning / CodeQL
Unused import Note
Show autofix suggestion
Hide autofix suggestion
Copilot Autofix
AI 2 days ago
To fix the problem, we will remove the unused import statement `from pytorch_lightning.plugins.io import AsyncCheckpointIO` on line 17. This will clean up the code and eliminate the unnecessary dependency without affecting the functionality of the program.
-
Copy modified line R17
@@ -16,3 +16,3 @@ | ||
from pytorch_lightning.core import LightningModule | ||
from pytorch_lightning.plugins.io import AsyncCheckpointIO | ||
|
||
from pytorch_lightning.utilities import rank_zero_only |
Uh oh!
There was an error while loading. Please reload this page.