This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Fix W&B callback for distributed training #5223

Merged · 4 commits · May 26, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -38,6 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- When `PretrainedTransformerIndexer` folds long sequences, it no longer loses the information from token type ids.
- Fixed documentation for `GradientDescentTrainer.cuda_device`.
+ - Fixed `wandb` callback to work in distributed training.


## [v2.4.0](https://github.com/allenai/allennlp/releases/tag/v2.4.0) - 2021-04-22
2 changes: 1 addition & 1 deletion allennlp/training/callbacks/log_writer.py
@@ -148,7 +148,7 @@ def on_batch(
batch_grad_norm: Optional[float] = None,
**kwargs,
) -> None:
- if not is_training and not is_primary:
+ if not is_training or not is_primary:
return None
assert self.trainer is not None

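A quick illustration of the guard change above (not AllenNLP code, just a hypothetical standalone sketch of the boolean logic): the old `and` only skipped logging when a batch was both a validation batch and on a non-primary worker, so non-primary workers still tried to log during distributed training; with `or`, logging is skipped whenever either condition holds.

```python
# Hedged sketch: batch-level logging should happen only for training batches
# on the primary worker. `should_log_batch` is a hypothetical helper, not part
# of the AllenNLP API.
def should_log_batch(is_training: bool, is_primary: bool) -> bool:
    # Old guard: `if not is_training and not is_primary: return` -> still logged
    # whenever either flag was True, e.g. on non-primary workers during training.
    # New guard: `if not is_training or not is_primary: return` -> logs only when
    # both flags are True (De Morgan: not A or not B == not (A and B)).
    return is_training and is_primary


assert should_log_batch(is_training=True, is_primary=True)
assert not should_log_batch(is_training=True, is_primary=False)   # distributed worker
assert not should_log_batch(is_training=False, is_primary=True)   # validation batch
```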
34 changes: 23 additions & 11 deletions allennlp/training/callbacks/wandb.py
@@ -88,11 +88,7 @@ def __init__(

self._watch_model = watch_model
self._files_to_save = files_to_save

- import wandb
- self.wandb = wandb
- self.wandb.init(
+ self._wandb_kwargs: Dict[str, Any] = dict(
Member: Nothing wrong with this, just `dict(...)` is kind of an unusual way of writing `{...}`.
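As a side note on the comment above, the two spellings build the same dictionary; a tiny sketch with illustrative values (not taken from this PR):

```python
# dict(...) and {...} produce equal dictionaries; dict(...) only accepts keys
# that are valid Python identifiers, while {...} allows arbitrary keys.
kwargs_a = dict(project="my-project", entity="my-team")
kwargs_b = {"project": "my-project", "entity": "my-team"}
assert kwargs_a == kwargs_b
```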

dir=os.path.abspath(serialization_dir),
project=project,
entity=entity,
@@ -105,9 +101,6 @@ def __init__(
**(wandb_kwargs or {}),
)

- for fpath in self._files_to_save:
-     self.wandb.save(os.path.join(serialization_dir, fpath), base_path=serialization_dir)

@overrides
def log_scalars(
self,
@@ -122,7 +115,7 @@ def log_tensors(
self, tensors: Dict[str, torch.Tensor], log_prefix: str = "", epoch: Optional[int] = None
) -> None:
self._log(
- {k: self.wandb.Histogram(v.cpu().data.numpy().flatten()) for k, v in tensors.items()},
+ {k: self.wandb.Histogram(v.cpu().data.numpy().flatten()) for k, v in tensors.items()},  # type: ignore
log_prefix=log_prefix,
epoch=epoch,
)
@@ -134,12 +127,31 @@ def _log(
dict_to_log = {f"{log_prefix}/{k}": v for k, v in dict_to_log.items()}
if epoch is not None:
dict_to_log["epoch"] = epoch
- self.wandb.log(dict_to_log, step=self.trainer._batch_num_total)  # type: ignore[union-attr]
+ self.wandb.log(dict_to_log, step=self.trainer._batch_num_total)  # type: ignore

@overrides
def on_start(
self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs
) -> None:
super().on_start(trainer, is_primary=is_primary, **kwargs)

if not is_primary:
return None

import wandb

self.wandb = wandb
Member: There is no wandb object? wandb is always global? What if two systems want to use it at the same time?

Member Author: Importing wandb may have side effects since at some point it spawns its own background worker(s).

Member: That is some unfortunate API design.
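A condensed sketch of the pattern this thread is discussing, with simplified, hypothetical names rather than the actual AllenNLP classes: keep `__init__` free of side effects by only recording the keyword arguments, then import and initialize `wandb` in `on_start`, and only on the primary worker.

```python
import os
from typing import Any, Dict, Optional


class WandBCallbackSketch:
    """Illustrative only: defer the `wandb` import and `wandb.init()` until
    training starts, and run them only on the primary (rank 0) worker, so
    distributed worker processes never spawn W&B background processes."""

    def __init__(self, serialization_dir: str, **wandb_kwargs: Any) -> None:
        # No side effects here: just remember what `wandb.init()` will need.
        self._wandb_kwargs: Dict[str, Any] = dict(
            dir=os.path.abspath(serialization_dir), **wandb_kwargs
        )
        self.wandb: Optional[Any] = None  # set lazily in `on_start`

    def on_start(self, trainer: Any, is_primary: bool = True) -> None:
        if not is_primary:
            return  # non-primary workers never touch wandb
        import wandb  # imported lazily because the import itself has side effects

        self.wandb = wandb
        self.wandb.init(**self._wandb_kwargs)

    def close(self) -> None:
        if self.wandb is not None:
            self.wandb.finish()
```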

self.wandb.init(**self._wandb_kwargs)

for fpath in self._files_to_save:
self.wandb.save( # type: ignore
os.path.join(self.serialization_dir, fpath), base_path=self.serialization_dir
)

if self._watch_model:
- self.wandb.watch(self.trainer.model)  # type: ignore[union-attr]
+ self.wandb.watch(self.trainer.model)  # type: ignore
Member: Is this about `Item "None" of "Optional[Something]" has no attribute "watch"`? I have been fixing that with `assert self.wandb is not None`.

Member Author: It's because MyPy sees it as undefined, not Optional.
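For context, a small sketch of the two approaches mentioned in this thread (hypothetical class, not the real callback): declaring the attribute as `Optional` up front lets `assert ... is not None` narrow the type, whereas the PR assigns it only inside `on_start`, which (per the author) MyPy treats as undefined rather than Optional, hence the blanket `# type: ignore` comments.

```python
from types import ModuleType
from typing import Optional


class CallbackSketch:
    def __init__(self) -> None:
        # Declared up front, so MyPy knows the attribute exists and is Optional.
        self.wandb: Optional[ModuleType] = None

    def on_start(self) -> None:
        import wandb  # deferred import, assigned only when training starts

        self.wandb = wandb

    def watch_model(self, model: object) -> None:
        assert self.wandb is not None  # narrows Optional[ModuleType] to ModuleType
        self.wandb.watch(model)
```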


@overrides
def close(self) -> None:
super().close()
self.wandb.finish() # type: ignore