
Commit c09833c

Remove allennlp sparse_clip_grad and replace with torch clip_grad_norm_. (#4159)
* Remove allennlp sparse_clip_grad and replace with torch clip_grad_norm_.
* Put test_sparse_clip_grad() back in trainer_test.
* Upgrade torch to at least 1.5.0.

Co-authored-by: Evan Pete Walsh <[email protected]>
1 parent 42a4e63 commit c09833c
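
At each call site this is a one-for-one swap: code that previously called AllenNLP's own sparse_clip_norm now calls torch.nn.utils.clip_grad_norm_ with the same arguments and uses the return value (the total gradient norm before clipping) the same way. A minimal sketch of the new call pattern, with a toy dense model and an illustrative max-norm value rather than anything taken from this diff:

    import torch
    from torch.nn.utils import clip_grad_norm_  # replaces allennlp.training.util.sparse_clip_norm

    # Toy model and clipping threshold, purely for illustration.
    model = torch.nn.Linear(4, 2)
    grad_norm = 1.0

    model(torch.randn(3, 4)).sum().backward()

    # Same pattern the trainer uses after this commit: clip only parameters
    # that actually have gradients, and keep the pre-clipping total norm.
    parameters_to_clip = [p for p in model.parameters() if p.grad is not None]
    total_norm = clip_grad_norm_(parameters_to_clip, grad_norm)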

4 files changed: +7 -52 lines changed

allennlp/tests/training/trainer_test.py (+2 -2)
@@ -15,6 +15,7 @@
 amp = None
 import torch
 from torch.utils.data import DataLoader
+from torch.nn.utils import clip_grad_norm_
 from allennlp.data.dataloader import DataLoader as AllennlpDataLoader

 from allennlp.common.checks import ConfigurationError
@@ -36,7 +37,6 @@
 from allennlp.training.learning_rate_schedulers import ExponentialLearningRateScheduler
 from allennlp.training.momentum_schedulers import MomentumScheduler
 from allennlp.training.moving_average import ExponentialMovingAverage
-from allennlp.training.util import sparse_clip_norm
 from allennlp.data import allennlp_collate


@@ -1019,7 +1019,7 @@ def test_sparse_clip_grad(self):
         assert embedding.weight.grad.is_sparse

         # Now try to clip the gradients.
-        _ = sparse_clip_norm([embedding.weight], 1.5)
+        _ = clip_grad_norm_([embedding.weight], 1.5)
         # Final norm should be 1.5
         grad = embedding.weight.grad.coalesce()
         self.assertAlmostEqual(grad._values().norm(2.0).item(), 1.5, places=5)
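
The retained test_sparse_clip_grad checks that torch's helper still clips sparse gradients, which is what the removed custom helper existed to support. A self-contained sketch of the same check outside the test class; the embedding size and token ids are made up, and it assumes a torch version in the range this commit pins in setup.py (>=1.5.0,<1.6.0):

    import torch
    from torch.nn.utils import clip_grad_norm_

    # sparse=True makes the embedding produce a sparse weight gradient.
    embedding = torch.nn.Embedding(100, 16, sparse=True)
    tokens = torch.LongTensor([[1, 2, 3], [4, 5, 6]])

    embedding(tokens).sum().backward()
    assert embedding.weight.grad.is_sparse

    # Clip the sparse gradient with torch's built-in helper.
    clip_grad_norm_([embedding.weight], 1.5)

    # As in the test, the 2-norm of the coalesced gradient values is now ~1.5.
    grad = embedding.weight.grad.coalesce()
    assert abs(grad._values().norm(2.0).item() - 1.5) < 1e-4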

allennlp/training/trainer.py (+2 -2)
@@ -16,7 +16,7 @@
 import torch.distributed as dist
 import torch.optim.lr_scheduler
 from torch.nn.parallel import DistributedDataParallel
-
+from torch.nn.utils import clip_grad_norm_

 from allennlp.common import Lazy, Registrable, Tqdm
 from allennlp.common import util as common_util
@@ -393,7 +393,7 @@ def rescale_gradients(self) -> Optional[float]:
                 ]
             else:
                 parameters_to_clip = [p for p in self.model.parameters() if p.grad is not None]
-            return training_util.sparse_clip_norm(parameters_to_clip, self._grad_norm)
+            return clip_grad_norm_(parameters_to_clip, self._grad_norm)
         else:
             return None

allennlp/training/util.py (+2 -47)
@@ -10,6 +10,7 @@
 import torch
 import torch.distributed as dist
 from torch.utils.data import DataLoader, Dataset
+from torch.nn.utils import clip_grad_norm_

 from allennlp.common.checks import check_for_gpu, ConfigurationError
 from allennlp.common.params import Params
@@ -30,52 +31,6 @@ class HasBeenWarned:
     tqdm_ignores_underscores = False


-def sparse_clip_norm(parameters, max_norm, norm_type=2) -> float:
-    """Clips gradient norm of an iterable of parameters.
-
-    The norm is computed over all gradients together, as if they were
-    concatenated into a single vector. Gradients are modified in-place.
-    Supports sparse gradients.
-
-    # Parameters
-
-    parameters : `(Iterable[torch.Tensor])`
-        An iterable of Tensors that will have gradients normalized.
-    max_norm : `float`
-        The max norm of the gradients.
-    norm_type : `float`
-        The type of the used p-norm. Can be `'inf'` for infinity norm.
-
-    # Returns
-
-    Total norm of the parameters (viewed as a single vector).
-    """
-    parameters = list(filter(lambda p: p.grad is not None, parameters))
-    max_norm = float(max_norm)
-    norm_type = float(norm_type)
-    if norm_type == float("inf"):
-        total_norm = max(p.grad.data.abs().max() for p in parameters)
-    else:
-        total_norm = 0
-        for p in parameters:
-            if p.grad.is_sparse:
-                # need to coalesce the repeated indices before finding norm
-                grad = p.grad.data.coalesce()
-                param_norm = grad._values().norm(norm_type)
-            else:
-                param_norm = p.grad.data.norm(norm_type)
-            total_norm += param_norm ** norm_type
-        total_norm = total_norm ** (1.0 / norm_type)
-    clip_coef = max_norm / (total_norm + nn_util.tiny_value_of_dtype(total_norm.dtype))
-    if clip_coef < 1:
-        for p in parameters:
-            if p.grad.is_sparse:
-                p.grad.data._values().mul_(clip_coef)
-            else:
-                p.grad.data.mul_(clip_coef)
-    return total_norm
-
-
 def move_optimizer_to_cuda(optimizer):
     """
     Move the optimizer state to GPU, if necessary.
@@ -318,7 +273,7 @@ def rescale_gradients(model: Model, grad_norm: Optional[float] = None) -> Optional[float]:
     """
     if grad_norm:
         parameters_to_clip = [p for p in model.parameters() if p.grad is not None]
-        return sparse_clip_norm(parameters_to_clip, grad_norm)
+        return clip_grad_norm_(parameters_to_clip, grad_norm)
     return None
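
Both surviving call sites (Trainer.rescale_gradients in trainer.py and the module-level rescale_gradients here) keep the same contract: when a grad_norm is configured, clip in place and return the total norm from before clipping; otherwise return None. A rough standalone sketch of that helper, with a plain torch.nn.Module standing in for an AllenNLP Model so it runs on its own:

    from typing import Optional

    import torch
    from torch.nn.utils import clip_grad_norm_


    def rescale_gradients(model: torch.nn.Module, grad_norm: Optional[float] = None) -> Optional[float]:
        # Clip in place and return the pre-clipping total norm, or None when
        # no grad_norm is set, mirroring the helper after this commit.
        if grad_norm:
            parameters_to_clip = [p for p in model.parameters() if p.grad is not None]
            return clip_grad_norm_(parameters_to_clip, grad_norm)
        return None


    # Illustrative usage; the model and the 1.0 threshold are made up.
    model = torch.nn.Linear(8, 1)
    model(torch.randn(2, 8)).sum().backward()
    batch_grad_norm = rescale_gradients(model, grad_norm=1.0)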

setup.py (+1 -1)
@@ -50,7 +50,7 @@
     license="Apache",
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     install_requires=[
-        "torch>1.3.1,<1.6.0",
+        "torch>=1.5.0,<1.6.0",
         "jsonnet>=0.10.0 ; sys.platform != 'win32'",
         "overrides==2.8.0",
         "nltk",
