
Commit 00bb6c5

dirkgr and epwalsh authored
Be sure to close the TensorBoard writer (#4731)
* Be sure to close the tensorboard writer
* Changelog
* unindent

Co-authored-by: Evan Pete Walsh <[email protected]>
1 parent 3f23938 commit 00bb6c5

File tree

2 files changed: +10 -4 lines changed


CHANGELOG.md

+1
@@ -81,6 +81,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed a bug in distributed training where the vocab would be saved from every worker, when it should have been saved by only the local master process.
 - Fixed a bug in the calculation of rouge metrics during distributed training where the total sequence count was not being aggregated across GPUs.
 - Fixed `allennlp.nn.util.add_sentence_boundary_token_ids()` to use `device` parameter of input tensor.
+- Be sure to close the TensorBoard writer even when training doesn't finish.
 - Fixed the docstring for `PyTorchSeq2VecWrapper`.

 ## [v1.1.0](https://github.com/allenai/allennlp/releases/tag/v1.1.0) - 2020-09-08

allennlp/training/trainer.py

+9 -4
@@ -965,6 +965,13 @@ def train(self) -> Dict[str, Any]:
         """
         Trains the supplied model with the supplied parameters.
         """
+        try:
+            return self._try_train()
+        finally:
+            # make sure pending events are flushed to disk and files are closed properly
+            self._tensorboard.close()
+
+    def _try_train(self) -> Dict[str, Any]:
         try:
             epoch_counter = self._restore_checkpoint()
         except RuntimeError:
@@ -1068,7 +1075,8 @@ def train(self) -> Dict[str, Any]:
 
             if self._serialization_dir and self._master:
                 common_util.dump_metrics(
-                    os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics
+                    os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"),
+                    metrics,
                 )
 
             # The Scheduler API is agnostic to whether your schedule requires a validation metric -
@@ -1106,9 +1114,6 @@ def train(self) -> Dict[str, Any]:
         for callback in self._end_callbacks:
             callback(self, metrics=metrics, epoch=epoch, is_master=self._master)
 
-        # make sure pending events are flushed to disk and files are closed properly
-        self._tensorboard.close()
-
         # Load the best model state before returning
         best_model_state = self._checkpointer.best_model_state()
         if best_model_state:
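
For context, the fix moves the body of `train()` into a new `_try_train()` method and wraps the call in `try`/`finally`, so the TensorBoard writer is flushed and closed whether training returns normally, raises, or is interrupted. Below is a minimal sketch of the same pattern, assuming `torch.utils.tensorboard.SummaryWriter` is used directly rather than AllenNLP's tensorboard wrapper; the `train`/`_try_train` functions, the loop body, and `log_dir` are illustrative stand-ins, not the library's code.

```python
from typing import Dict

from torch.utils.tensorboard import SummaryWriter


def train(num_epochs: int = 3, log_dir: str = "runs/demo") -> Dict[str, float]:
    # Hypothetical stand-in for Trainer.train(): the writer is created up front,
    # and the finally block guarantees it is closed even if _try_train() raises
    # or the process is interrupted.
    writer = SummaryWriter(log_dir=log_dir)
    try:
        return _try_train(writer, num_epochs)
    finally:
        # make sure pending events are flushed to disk and files are closed properly
        writer.close()


def _try_train(writer: SummaryWriter, num_epochs: int) -> Dict[str, float]:
    metrics: Dict[str, float] = {}
    for epoch in range(num_epochs):
        loss = 1.0 / (epoch + 1)  # placeholder for a real training step
        writer.add_scalar("loss", loss, epoch)
        metrics["loss"] = loss
    return metrics
```

Before this change, `close()` was only reached at the very end of `train()`, so an exception partway through training could leave buffered TensorBoard events unwritten; the `finally` clause closes the writer on every exit path.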
