
Commit c3b5ed7

zero grad optimization (#4673)
1 parent 9dabf3f commit c3b5ed7

File tree (5 files changed: +20 -4 lines):

  CHANGELOG.md
  allennlp/commands/find_learning_rate.py
  allennlp/common/testing/model_test_case.py
  allennlp/predictors/predictor.py
  allennlp/training/trainer.py

CHANGELOG.md

+2

@@ -49,6 +49,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed a bug where `cached_path()` would fail if passed a `cache_dir` with the user home shortcut `~/`.
 - Fixed a bug in our doc building script where markdown links did not render properly
   if the "href" part of the link (the part inside the `()`) was on a new line.
+- Changed how gradients are zeroed out with an optimization. See [this video from NVIDIA](https://www.youtube.com/watch?v=9mS1fIYj1So)
+  at around the 9 minute mark.


 ## [v1.1.0](https://github.com/allenai/allennlp/releases/tag/v1.1.0) - 2020-09-08
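
For reference, here is a minimal sketch of the two patterns side by side; it is not taken from this repository, and the toy `model`, `optimizer`, and random data are purely illustrative. `optimizer.zero_grad()` writes zeros into every existing `.grad` tensor, and the next `backward()` call then reads those zeros while accumulating into them; setting each `p.grad` to `None` instead lets autograd assign a fresh gradient tensor on the next backward pass, skipping both the memset and the extra read, which is the optimization this commit adopts.

import torch

model = torch.nn.Linear(10, 1)                            # toy model, illustration only
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
inputs, targets = torch.randn(8, 10), torch.randn(8, 1)

# Conventional pattern: zero_grad() memsets every existing .grad tensor, and the
# subsequent backward() reads those zeros while accumulating new gradients into them.
optimizer.zero_grad()
loss = torch.nn.functional.mse_loss(model(inputs), targets)
loss.backward()
optimizer.step()

# Pattern adopted in this commit: drop the .grad tensors entirely. On the next
# backward() pass autograd assigns brand-new gradient tensors instead of
# accumulating into zeroed buffers, avoiding the memset and the extra read.
for param_group in optimizer.param_groups:
    for p in param_group["params"]:
        p.grad = None
loss = torch.nn.functional.mse_loss(model(inputs), targets)
loss.backward()
optimizer.step()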

allennlp/commands/find_learning_rate.py

+5 -1

@@ -287,8 +287,12 @@ def search_learning_rate(

         for param_group in trainer.optimizer.param_groups:
             param_group["lr"] = current_lr
+            # Zero gradients.
+            # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
+            # because it avoids a read op when the gradients are first updated below.
+            for p in param_group["params"]:
+                p.grad = None

-        trainer.optimizer.zero_grad()
         loss = trainer.batch_outputs(batch, for_training=True)["loss"]
         loss.backward()
         loss = loss.detach().cpu().item()

allennlp/common/testing/model_test_case.py

+2 -1

@@ -277,7 +277,8 @@ def check_model_computes_gradients_correctly(
         disable_dropout: bool = True,
     ):
         print("Checking gradients")
-        model.zero_grad()
+        for p in model.parameters():
+            p.grad = None
         model.train()

         original_dropouts: Dict[str, float] = {}

allennlp/predictors/predictor.py

+5 -1

@@ -109,7 +109,11 @@ def get_gradients(self, instances: List[Instance]) -> Tuple[Dict[str, Any], Dict
         )

         loss = outputs["loss"]
-        self._model.zero_grad()
+        # Zero gradients.
+        # NOTE: this is actually more efficient than calling `self._model.zero_grad()`
+        # because it avoids a read op when the gradients are first updated below.
+        for p in self._model.parameters():
+            p.grad = None
         loss.backward()

         for hook in hooks:

allennlp/training/trainer.py

+6 -1

@@ -581,7 +581,12 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
             self._batch_num_total += 1
             batch_num_total = self._batch_num_total

-            self.optimizer.zero_grad()
+            # Zero gradients.
+            # NOTE: this is actually more efficient than calling `self.optimizer.zero_grad()`
+            # because it avoids a read op when the gradients are first updated below.
+            for param_group in self.optimizer.param_groups:
+                for p in param_group["params"]:
+                    p.grad = None

             batch_group_outputs = []
             for batch in batch_group:
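
As a side note on this design choice: the hand-rolled loops above predate PyTorch's built-in support for the same behavior. In later PyTorch releases (the flag was added after this commit), the optimizer can be asked to do this directly, assuming a version of PyTorch that provides it:

# Roughly equivalent in newer PyTorch releases: `set_to_none=True` makes
# zero_grad() set each p.grad to None instead of zeroing the buffers in place.
optimizer.zero_grad(set_to_none=True)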
