
Commit c09833c

Remove allennlp sparse_clip_grad and replace with torch clip_grad_norm_. (#4159)
* Remove allennlp sparse_clip_grad and replace with torch clip_grad_norm_.
* Put test_sparse_clip_grad() back in trainer_test.
* Upgrade torch to at least 1.5.0.

Co-authored-by: Evan Pete Walsh <[email protected]>
1 parent 42a4e63 commit c09833c
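
At each call site this is a one-for-one swap: code that previously called AllenNLP's own sparse_clip_norm now calls torch.nn.utils.clip_grad_norm_ with the same arguments and uses the return value (the total gradient norm before clipping) the same way. A minimal sketch of the new call pattern, with a toy dense model and an illustrative max-norm value rather than anything taken from this diff:

    import torch
    from torch.nn.utils import clip_grad_norm_  # replaces allennlp.training.util.sparse_clip_norm

    # Toy model and clipping threshold, purely for illustration.
    model = torch.nn.Linear(4, 2)
    grad_norm = 1.0

    model(torch.randn(3, 4)).sum().backward()

    # Same pattern the trainer uses after this commit: clip only parameters
    # that actually have gradients, and keep the pre-clipping total norm.
    parameters_to_clip = [p for p in model.parameters() if p.grad is not None]
    total_norm = clip_grad_norm_(parameters_to_clip, grad_norm)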

4 files changed: +7 -52 lines changed

allennlp/tests/training/trainer_test.py (+2 -2)
@@ -15,6 +15,7 @@
 amp = None
 import torch
 from torch.utils.data import DataLoader
+from torch.nn.utils import clip_grad_norm_
 from allennlp.data.dataloader import DataLoader as AllennlpDataLoader

 from allennlp.common.checks import ConfigurationError
@@ -36,7 +37,6 @@
 from allennlp.training.learning_rate_schedulers import ExponentialLearningRateScheduler
 from allennlp.training.momentum_schedulers import MomentumScheduler
 from allennlp.training.moving_average import ExponentialMovingAverage
-from allennlp.training.util import sparse_clip_norm
 from allennlp.data import allennlp_collate


@@ -1019,7 +1019,7 @@ def test_sparse_clip_grad(self):
         assert embedding.weight.grad.is_sparse

         # Now try to clip the gradients.
-        _ = sparse_clip_norm([embedding.weight], 1.5)
+        _ = clip_grad_norm_([embedding.weight], 1.5)
         # Final norm should be 1.5
         grad = embedding.weight.grad.coalesce()
         self.assertAlmostEqual(grad._values().norm(2.0).item(), 1.5, places=5)
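
The retained test_sparse_clip_grad checks that torch's helper still clips sparse gradients, which is what the removed custom helper existed to support. A self-contained sketch of the same check outside the test class; the embedding size and token ids are made up, and it assumes a torch version in the range this commit pins in setup.py (>=1.5.0,<1.6.0):

    import torch
    from torch.nn.utils import clip_grad_norm_

    # sparse=True makes the embedding produce a sparse weight gradient.
    embedding = torch.nn.Embedding(100, 16, sparse=True)
    tokens = torch.LongTensor([[1, 2, 3], [4, 5, 6]])

    embedding(tokens).sum().backward()
    assert embedding.weight.grad.is_sparse

    # Clip the sparse gradient with torch's built-in helper.
    clip_grad_norm_([embedding.weight], 1.5)

    # As in the test, the 2-norm of the coalesced gradient values is now ~1.5.
    grad = embedding.weight.grad.coalesce()
    assert abs(grad._values().norm(2.0).item() - 1.5) < 1e-4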

allennlp/training/trainer.py (+2 -2)
@@ -16,7 +16,7 @@
 import torch.distributed as dist
 import torch.optim.lr_scheduler
 from torch.nn.parallel import DistributedDataParallel
-
+from torch.nn.utils import clip_grad_norm_

 from allennlp.common import Lazy, Registrable, Tqdm
 from allennlp.common import util as common_util
@@ -393,7 +393,7 @@ def rescale_gradients(self) -> Optional[float]:
                 ]
             else:
                 parameters_to_clip = [p for p in self.model.parameters() if p.grad is not None]
-            return training_util.sparse_clip_norm(parameters_to_clip, self._grad_norm)
+            return clip_grad_norm_(parameters_to_clip, self._grad_norm)
         else:
             return None

allennlp/training/util.py (+2 -47)
@@ -10,6 +10,7 @@
 import torch
 import torch.distributed as dist
 from torch.utils.data import DataLoader, Dataset
+from torch.nn.utils import clip_grad_norm_

 from allennlp.common.checks import check_for_gpu, ConfigurationError
 from allennlp.common.params import Params
@@ -30,52 +31,6 @@ class HasBeenWarned:
     tqdm_ignores_underscores = False


-def sparse_clip_norm(parameters, max_norm, norm_type=2) -> float:
-    """Clips gradient norm of an iterable of parameters.
-
-    The norm is computed over all gradients together, as if they were
-    concatenated into a single vector. Gradients are modified in-place.
-    Supports sparse gradients.
-
-    # Parameters
-
-    parameters : `(Iterable[torch.Tensor])`
-        An iterable of Tensors that will have gradients normalized.
-    max_norm : `float`
-        The max norm of the gradients.
-    norm_type : `float`
-        The type of the used p-norm. Can be `'inf'` for infinity norm.
-
-    # Returns
-
-    Total norm of the parameters (viewed as a single vector).
-    """
-    parameters = list(filter(lambda p: p.grad is not None, parameters))
-    max_norm = float(max_norm)
-    norm_type = float(norm_type)
-    if norm_type == float("inf"):
-        total_norm = max(p.grad.data.abs().max() for p in parameters)
-    else:
-        total_norm = 0
-        for p in parameters:
-            if p.grad.is_sparse:
-                # need to coalesce the repeated indices before finding norm
-                grad = p.grad.data.coalesce()
-                param_norm = grad._values().norm(norm_type)
-            else:
-                param_norm = p.grad.data.norm(norm_type)
-            total_norm += param_norm ** norm_type
-        total_norm = total_norm ** (1.0 / norm_type)
-    clip_coef = max_norm / (total_norm + nn_util.tiny_value_of_dtype(total_norm.dtype))
-    if clip_coef < 1:
-        for p in parameters:
-            if p.grad.is_sparse:
-                p.grad.data._values().mul_(clip_coef)
-            else:
-                p.grad.data.mul_(clip_coef)
-    return total_norm
-
-
 def move_optimizer_to_cuda(optimizer):
     """
     Move the optimizer state to GPU, if necessary.
@@ -318,7 +273,7 @@ def rescale_gradients(model: Model, grad_norm: Optional[float] = None) -> Optional[float]:
     """
     if grad_norm:
         parameters_to_clip = [p for p in model.parameters() if p.grad is not None]
-        return sparse_clip_norm(parameters_to_clip, grad_norm)
+        return clip_grad_norm_(parameters_to_clip, grad_norm)
     return None
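
Both surviving call sites (Trainer.rescale_gradients in trainer.py and the module-level rescale_gradients here) keep the same contract: when a grad_norm is configured, clip in place and return the total norm from before clipping; otherwise return None. A rough standalone sketch of that helper, with a plain torch.nn.Module standing in for an AllenNLP Model so it runs on its own:

    from typing import Optional

    import torch
    from torch.nn.utils import clip_grad_norm_


    def rescale_gradients(model: torch.nn.Module, grad_norm: Optional[float] = None) -> Optional[float]:
        # Clip in place and return the pre-clipping total norm, or None when
        # no grad_norm is set, mirroring the helper after this commit.
        if grad_norm:
            parameters_to_clip = [p for p in model.parameters() if p.grad is not None]
            return clip_grad_norm_(parameters_to_clip, grad_norm)
        return None


    # Illustrative usage; the model and the 1.0 threshold are made up.
    model = torch.nn.Linear(8, 1)
    model(torch.randn(2, 8)).sum().backward()
    batch_grad_norm = rescale_gradients(model, grad_norm=1.0)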

setup.py (+1 -1)
@@ -50,7 +50,7 @@
     license="Apache",
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     install_requires=[
-        "torch>1.3.1,<1.6.0",
+        "torch>=1.5.0,<1.6.0",
         "jsonnet>=0.10.0 ; sys.platform != 'win32'",
         "overrides==2.8.0",
         "nltk",
