
Commit b59dfd5

Merge pull request #57 from openspeech-team/checkpoint-n-step
Add CheckpointEveryNSteps class to save a checkpoint every N steps.
2 parents 5d24f71 + 38e9b06 commit b59dfd5

3 files changed (+78, -5 lines)


openspeech/callbacks.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
# MIT License
#
# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import os
import pytorch_lightning as pl


class CheckpointEveryNSteps(pl.Callback):
    """
    Save a checkpoint every N steps, instead of Lightning's default that checkpoints
    based on validation loss.

    Args:
        save_step_frequency: how often to save in steps
        use_modelcheckpoint_filename: just use the ModelCheckpoint callback's default filename, don't use ours.
    """

    def __init__(
            self,
            save_step_frequency,
            use_modelcheckpoint_filename=False,
    ) -> None:
        self.save_step_frequency = save_step_frequency
        self.use_modelcheckpoint_filename = use_modelcheckpoint_filename

    def on_batch_end(self, trainer: pl.Trainer, _):
        """ Check if we should save a checkpoint after every train batch """
        epoch = trainer.current_epoch
        global_step = trainer.global_step
        if global_step % self.save_step_frequency == 0:
            if self.use_modelcheckpoint_filename:
                filename = trainer.checkpoint_callback.filename
            else:
                filename = f"{epoch=}_{global_step=}.ckpt"
            ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filename)
            trainer.save_checkpoint(ckpt_path)
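For context, a minimal usage sketch (not part of the commit; the directory, frequency, and the commented-out model are illustrative assumptions) showing how this callback can be passed to a Trainer alongside a ModelCheckpoint, whose dirpath it reuses:

# Illustrative sketch only: attaching CheckpointEveryNSteps to a Trainer.
# The dirpath and save_step_frequency values below are made up for the example.
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from openspeech.callbacks import CheckpointEveryNSteps

checkpoint_callback = ModelCheckpoint(dirpath="checkpoints/")  # supplies the dirpath the callback reuses
trainer = pl.Trainer(
    max_epochs=20,
    callbacks=[
        checkpoint_callback,
        CheckpointEveryNSteps(save_step_frequency=10000),
    ],
)
# trainer.fit(model) would then also write files such as
# "epoch=0_global_step=10000.ckpt" into checkpoints/ every 10000 training steps.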

openspeech/dataclass/configurations.py

Lines changed: 3 additions & 0 deletions
@@ -202,6 +202,9 @@ class BaseTrainerConfigs(OpenspeechDataclass):
     max_epochs: int = field(
         default=20, metadata={"help": "Stop training once this number of epochs is reached."}
     )
+    save_checkpoint_n_steps: int = field(
+        default=10000, metadata={"help": "Save a checkpoint every N steps."}
+    )
     auto_scale_batch_size: str = field(
         default="binsearch", metadata={"help": "If set to True, will initially run a batch size finder trying to find "
                                                "the largest batch size that fits into memory."}

openspeech/utils.py

Lines changed: 22 additions & 5 deletions
@@ -29,6 +29,8 @@
 from omegaconf import DictConfig, OmegaConf
 from pytorch_lightning.callbacks import LearningRateMonitor
 
+from .callbacks import CheckpointEveryNSteps
+
 PYTORCH_IMPORT_ERROR = """
 Openspeech requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
 installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
@@ -228,7 +230,10 @@ def get_pl_trainer(
                              logger=logger,
                              auto_scale_batch_size=configs.trainer.auto_scale_batch_size,
                              max_epochs=configs.trainer.max_epochs,
-                             callbacks=[LearningRateMonitor(logging_interval='step')])
+                             callbacks=[
+                                 LearningRateMonitor(logging_interval='step'),
+                                 CheckpointEveryNSteps(configs.save_checkpoint_n_steps)
+                             ])
     elif configs.trainer.name == "gpu":
         trainer = pl.Trainer(accelerator=configs.trainer.accelerator,
                              gpus=num_devices,
@@ -239,7 +244,10 @@ def get_pl_trainer(
                              logger=logger,
                              auto_scale_batch_size=configs.trainer.auto_scale_batch_size,
                              max_epochs=configs.trainer.max_epochs,
-                             callbacks=[LearningRateMonitor(logging_interval='step')])
+                             callbacks=[
+                                 LearningRateMonitor(logging_interval='step'),
+                                 CheckpointEveryNSteps(configs.save_checkpoint_n_steps)
+                             ])
     elif configs.trainer.name == "tpu":
         trainer = pl.Trainer(accelerator=configs.trainer.accelerator,
                              tpu_cores=configs.trainer.tpu_cores,
@@ -250,7 +258,10 @@ def get_pl_trainer(
                              logger=logger,
                              auto_scale_batch_size=configs.trainer.auto_scale_batch_size,
                              max_epochs=configs.trainer.max_epochs,
-                             callbacks=[LearningRateMonitor(logging_interval='step')])
+                             callbacks=[
+                                 LearningRateMonitor(logging_interval='step'),
+                                 CheckpointEveryNSteps(configs.save_checkpoint_n_steps)
+                             ])
     elif configs.trainer.name == "gpu-fp16":
         trainer = pl.Trainer(precision=configs.trainer.precision,
                              accelerator=configs.trainer.accelerator,
@@ -275,7 +286,10 @@ def get_pl_trainer(
                              logger=logger,
                              auto_scale_batch_size=configs.trainer.auto_scale_batch_size,
                              max_epochs=configs.trainer.max_epochs,
-                             callbacks=[LearningRateMonitor(logging_interval='step')])
+                             callbacks=[
+                                 LearningRateMonitor(logging_interval='step'),
+                                 CheckpointEveryNSteps(configs.save_checkpoint_n_steps)
+                             ])
     elif configs.trainer.name == "cpu-fp64":
         trainer = pl.Trainer(precision=configs.trainer.precision,
                              accelerator=configs.trainer.accelerator,
@@ -286,7 +300,10 @@ def get_pl_trainer(
                              logger=logger,
                              auto_scale_batch_size=configs.trainer.auto_scale_batch_size,
                              max_epochs=configs.trainer.max_epochs,
-                             callbacks=[LearningRateMonitor(logging_interval='step')])
+                             callbacks=[
+                                 LearningRateMonitor(logging_interval='step'),
+                                 CheckpointEveryNSteps(configs.save_checkpoint_n_steps)
+                             ])
     else:
         raise ValueError(f"Unsupported trainer: {configs.trainer.name}")
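The change repeats the same callbacks list for each trainer variant; below is a condensed sketch of that pattern (not the repository's get_pl_trainer — the config values are made up, and the configs.trainer.save_checkpoint_n_steps path is an assumption based on where the field was added in BaseTrainerConfigs above):

# Illustrative sketch only: building the callback list from a config object.
import pytorch_lightning as pl
from omegaconf import OmegaConf
from pytorch_lightning.callbacks import LearningRateMonitor

from openspeech.callbacks import CheckpointEveryNSteps

configs = OmegaConf.create({"trainer": {"max_epochs": 20, "save_checkpoint_n_steps": 10000}})

trainer = pl.Trainer(
    max_epochs=configs.trainer.max_epochs,
    callbacks=[
        LearningRateMonitor(logging_interval="step"),
        CheckpointEveryNSteps(configs.trainer.save_checkpoint_n_steps),
    ],
)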
