Skip to content
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit cec9209

Browse files
authored
Several micro optimizations (#4833)
* benchmark transfers * create tensors directly on device when possible * fix
1 parent 48a4865 commit cec9209

20 files changed

+89
-39
lines changed

.dockerignore

+3
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@
33
**/__pycache__
44
.gitignore
55
.git
6+
.coverage
7+
.benchmarks
8+
.mypy_cache

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ __pycache__
4444

4545
.coverage
4646
.pytest_cache/
47+
.benchmarks
4748

4849
# documentation build artifacts
4950

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1515

1616
### Fixed
1717

18+
- Fixed a lot of instances where tensors were first created and then sent to a device
19+
with `.to(device)`. Instead, these tensors are now created directly on the target device.
1820
- Fixed issue with `GradientDescentTrainer` when constructed with `validation_data_loader=None` and `learning_rate_scheduler!=None`.
1921
- Fixed a bug when removing all handlers in root logger.
2022
- `ShardedDatasetReader` now inherits parameters from `base_reader` when required.

allennlp/interpret/saliency_interpreters/smooth_gradient.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def _register_forward_hook(self, stdev: float):
5858
def forward_hook(module, inputs, output):
5959
# Random noise = N(0, stdev * (max-min))
6060
scale = output.detach().max() - output.detach().min()
61-
noise = torch.randn(output.shape).to(output.device) * stdev * scale
61+
noise = torch.randn(output.shape, device=output.device) * stdev * scale
6262

6363
# Add the random noise
6464
output.add_(noise)

allennlp/modules/sampled_softmax_loss.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def forward(
155155

156156
if embeddings.shape[0] == 0:
157157
# empty batch
158-
return torch.tensor(0.0).to(embeddings.device)
158+
return torch.tensor(0.0, device=embeddings.device)
159159

160160
if not self.training:
161161
return self._forward_eval(embeddings, targets)

allennlp/nn/util.py

-2
Original file line numberDiff line numberDiff line change
@@ -1548,7 +1548,6 @@ def add_sentence_boundary_token_ids(
15481548
The new mask for the tensor, taking into account the appended tokens
15491549
marking the beginning and end of the sentence.
15501550
"""
1551-
# TODO: matthewp, profile this transfer
15521551
sequence_lengths = mask.sum(dim=1).detach().cpu().numpy()
15531552
tensor_shape = list(tensor.data.shape)
15541553
new_shape = list(tensor_shape)
@@ -1603,7 +1602,6 @@ def remove_sentence_boundaries(
16031602
new_mask : `torch.BoolTensor`
16041603
The new mask for the tensor of shape `(batch_size, timesteps - 2)`.
16051604
"""
1606-
# TODO: matthewp, profile this transfer
16071605
sequence_lengths = mask.sum(dim=1).detach().cpu().numpy()
16081606
tensor_shape = list(tensor.data.shape)
16091607
new_shape = list(tensor_shape)

allennlp/training/metrics/attachment_scores.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ def __call__( # type: ignore
8888
dist.all_reduce(unlabeled_exact_match, op=dist.ReduceOp.SUM)
8989
dist.all_reduce(correct_labels_and_indices, op=dist.ReduceOp.SUM)
9090
dist.all_reduce(labeled_exact_match, op=dist.ReduceOp.SUM)
91-
total_sentences = torch.tensor(total_sentences).to(device)
92-
total_words = torch.tensor(total_words).to(device)
91+
total_sentences = torch.tensor(total_sentences, device=device)
92+
total_words = torch.tensor(total_words, device=device)
9393
dist.all_reduce(total_sentences, op=dist.ReduceOp.SUM)
9494
dist.all_reduce(total_words, op=dist.ReduceOp.SUM)
9595
total_sentences = total_sentences.item()

allennlp/training/metrics/average.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ def __call__(self, value):
3232
_count = 1
3333
if is_distributed():
3434
device = torch.device("cuda" if dist.get_backend() == "nccl" else "cpu")
35-
count = torch.tensor(_count).to(device)
36-
total_value = torch.tensor(_total_value).to(device)
35+
count = torch.tensor(_count, device=device)
36+
total_value = torch.tensor(_total_value, device=device)
3737
dist.all_reduce(count, op=dist.ReduceOp.SUM)
3838
dist.all_reduce(total_value, op=dist.ReduceOp.SUM)
3939
_count = count.item()

allennlp/training/metrics/bleu.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ def __call__(
127127
predictions, gold_targets, ngram_size
128128
)
129129
if is_distributed():
130-
_precision_matches = torch.tensor(precision_matches).to(device)
131-
_precision_totals = torch.tensor(precision_totals).to(device)
130+
_precision_matches = torch.tensor(precision_matches, device=device)
131+
_precision_totals = torch.tensor(precision_totals, device=device)
132132
dist.all_reduce(_precision_matches, op=dist.ReduceOp.SUM)
133133
dist.all_reduce(_precision_totals, op=dist.ReduceOp.SUM)
134134
precision_matches = _precision_matches.item() / world_size
@@ -150,8 +150,8 @@ def __call__(
150150
_reference_lengths = valid_gold_targets_mask.sum().item()
151151

152152
if is_distributed():
153-
prediction_lengths = torch.tensor(_prediction_lengths).to(device)
154-
reference_lengths = torch.tensor(_reference_lengths).to(device)
153+
prediction_lengths = torch.tensor(_prediction_lengths, device=device)
154+
reference_lengths = torch.tensor(_reference_lengths, device=device)
155155
dist.all_reduce(prediction_lengths, op=dist.ReduceOp.SUM)
156156
dist.all_reduce(reference_lengths, op=dist.ReduceOp.SUM)
157157
_prediction_lengths = prediction_lengths.item()

allennlp/training/metrics/covariance.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,10 @@ def __call__(
111111

112112
# # Note: this gives an approximate aggregation of the covariance.
113113
# device = gold_labels.device
114-
# delta_mean_prediction = torch.tensor(delta_mean_prediction).to(device)
115-
# delta_mean_label = torch.tensor(delta_mean_label).to(device)
116-
# delta_co_moment = torch.tensor(delta_co_moment).to(device)
117-
# _total_count = torch.tensor(updated_count).to(device)
114+
# delta_mean_prediction = torch.tensor(delta_mean_prediction, device=device)
115+
# delta_mean_label = torch.tensor(delta_mean_label, device=device)
116+
# delta_co_moment = torch.tensor(delta_co_moment, device=device)
117+
# _total_count = torch.tensor(updated_count, device=device)
118118
# dist.all_reduce(delta_mean_prediction, op=dist.ReduceOp.SUM)
119119
# dist.all_reduce(delta_mean_label, op=dist.ReduceOp.SUM)
120120
# dist.all_reduce(delta_co_moment, op=dist.ReduceOp.SUM)

allennlp/training/metrics/entropy.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __call__(
4343
_count = 1
4444

4545
if is_distributed():
46-
count = torch.tensor(_count).to(device)
46+
count = torch.tensor(_count, device=device)
4747
dist.all_reduce(_entropy, op=dist.ReduceOp.SUM)
4848
dist.all_reduce(count, op=dist.ReduceOp.SUM)
4949
_count = count.item()

allennlp/training/metrics/evalb_bracketing_scorer.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,9 @@ def __call__(self, predicted_trees: List[Tree], gold_trees: List[Tree]) -> None:
155155

156156
if is_distributed():
157157
device = torch.device("cuda" if dist.get_backend() == "nccl" else "cpu")
158-
correct_predicted_brackets = torch.tensor(_correct_predicted_brackets).to(device)
159-
predicted_brackets = torch.tensor(_predicted_brackets).to(device)
160-
gold_brackets = torch.tensor(_gold_brackets).to(device)
158+
correct_predicted_brackets = torch.tensor(_correct_predicted_brackets, device=device)
159+
predicted_brackets = torch.tensor(_predicted_brackets, device=device)
160+
gold_brackets = torch.tensor(_gold_brackets, device=device)
161161
dist.all_reduce(correct_predicted_brackets, op=dist.ReduceOp.SUM)
162162
dist.all_reduce(predicted_brackets, op=dist.ReduceOp.SUM)
163163
dist.all_reduce(gold_brackets, op=dist.ReduceOp.SUM)

allennlp/training/metrics/fbeta_measure.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def __call__(
142142
# Watch it:
143143
# The total numbers of true positives under all _predicted_ classes are zeros.
144144
if true_positives_bins.shape[0] == 0:
145-
true_positive_sum = torch.zeros(num_classes, device=predictions.device)
145+
true_positive_sum = torch.zeros(num_classes, device=device)
146146
else:
147147
true_positive_sum = torch.bincount(
148148
true_positives_bins.long(), minlength=num_classes
@@ -154,7 +154,7 @@ def __call__(
154154
if pred_bins.shape[0] != 0:
155155
pred_sum = torch.bincount(pred_bins, minlength=num_classes).float()
156156
else:
157-
pred_sum = torch.zeros(num_classes, device=predictions.device)
157+
pred_sum = torch.zeros(num_classes, device=device)
158158

159159
gold_labels_bins = gold_labels[mask].long()
160160
if gold_labels.shape[0] != 0:
@@ -165,9 +165,7 @@ def __call__(
165165
self._total_sum += mask.sum().to(torch.float)
166166

167167
if is_distributed():
168-
true_positive_sum = torch.tensor(true_positive_sum).to(device)
169-
pred_sum = torch.tensor(pred_sum).to(device)
170-
true_sum = torch.tensor(true_sum).to(device)
168+
true_positive_sum = torch.tensor(true_positive_sum, device=device)
171169
dist.all_reduce(true_positive_sum, op=dist.ReduceOp.SUM)
172170
dist.all_reduce(pred_sum, op=dist.ReduceOp.SUM)
173171
dist.all_reduce(true_sum, op=dist.ReduceOp.SUM)

allennlp/training/metrics/fbeta_multi_label_measure.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,9 @@ def __call__(
156156
self._total_sum += mask.expand_as(gold_labels).sum().to(torch.float)
157157

158158
if is_distributed():
159-
true_positive_sum = torch.tensor(true_positive_sum).to(device)
160-
pred_sum = torch.tensor(pred_sum).to(device)
161-
true_sum = torch.tensor(true_sum).to(device)
159+
true_positive_sum = torch.tensor(true_positive_sum, device=device)
160+
pred_sum = torch.tensor(pred_sum, device=device)
161+
true_sum = torch.tensor(true_sum, device=device)
162162
dist.all_reduce(true_positive_sum, op=dist.ReduceOp.SUM)
163163
dist.all_reduce(pred_sum, op=dist.ReduceOp.SUM)
164164
dist.all_reduce(true_sum, op=dist.ReduceOp.SUM)

allennlp/training/metrics/mean_absolute_error.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def __call__(
4747
_absolute_error = torch.sum(absolute_errors)
4848

4949
if is_distributed():
50-
absolute_error = torch.tensor(_absolute_error).to(device)
51-
total_count = torch.tensor(_total_count).to(device)
50+
absolute_error = torch.tensor(_absolute_error, device=device)
51+
total_count = torch.tensor(_total_count, device=device)
5252
dist.all_reduce(absolute_error, op=dist.ReduceOp.SUM)
5353
dist.all_reduce(total_count, op=dist.ReduceOp.SUM)
5454
_absolute_error = absolute_error.item()

allennlp/training/metrics/rouge.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def _get_rouge_l_score(
113113

114114
if is_distributed():
115115
device = predicted_tokens.device
116-
_total_f1 = torch.tensor(total_f1).to(device)
116+
_total_f1 = torch.tensor(total_f1, device=device)
117117
dist.all_reduce(_total_f1, op=dist.ReduceOp.SUM)
118118
total_f1 = _total_f1.item()
119119

@@ -162,9 +162,9 @@ def _get_rouge_n_stats(
162162

163163
if is_distributed():
164164
device = predicted_tokens.device
165-
_total_recall = torch.tensor(total_recall).to(device)
166-
_total_precision = torch.tensor(total_precision).to(device)
167-
_total_f1 = torch.tensor(total_f1).to(device)
165+
_total_recall = torch.tensor(total_recall, device=device)
166+
_total_precision = torch.tensor(total_precision, device=device)
167+
_total_f1 = torch.tensor(total_f1, device=device)
168168
dist.all_reduce(_total_recall, op=dist.ReduceOp.SUM)
169169
dist.all_reduce(_total_precision, op=dist.ReduceOp.SUM)
170170
dist.all_reduce(_total_f1, op=dist.ReduceOp.SUM)
@@ -209,7 +209,7 @@ def __call__(
209209
sequence_count = len(predictions)
210210
if is_distributed():
211211
device = predictions.device
212-
_sequence_count = torch.tensor(sequence_count).to(device)
212+
_sequence_count = torch.tensor(sequence_count, device=device)
213213
dist.all_reduce(_sequence_count, op=dist.ReduceOp.SUM)
214214
sequence_count = _sequence_count.item()
215215
self._total_sequence_count += sequence_count

allennlp/training/metrics/sequence_accuracy.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ def __call__(
7373
_correct_count = correct
7474

7575
if is_distributed():
76-
correct_count = torch.tensor(_correct_count).to(device)
77-
total_count = torch.tensor(_total_count).to(device)
76+
correct_count = torch.tensor(_correct_count, device=device)
77+
total_count = torch.tensor(_total_count, device=device)
7878
dist.all_reduce(correct_count, op=dist.ReduceOp.SUM)
7979
dist.all_reduce(total_count, op=dist.ReduceOp.SUM)
8080
_correct_count = correct_count.item()

allennlp/training/metrics/unigram_recall.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ def __call__(
8383
_total_count = predictions.size()[0]
8484

8585
if is_distributed():
86-
correct_count = torch.tensor(_correct_count).to(device)
87-
total_count = torch.tensor(_total_count).to(device)
86+
correct_count = torch.tensor(_correct_count, device=device)
87+
total_count = torch.tensor(_total_count, device=device)
8888
dist.all_reduce(correct_count, op=dist.ReduceOp.SUM)
8989
dist.all_reduce(total_count, op=dist.ReduceOp.SUM)
9090
_correct_count = correct_count.item()

benchmarks/nn/util_bench.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import torch
2+
3+
from allennlp.nn import util
4+
from allennlp.common.testing import requires_gpu
5+
6+
7+
@requires_gpu
8+
def bench_add_sentence_boundary_token_ids(benchmark):
9+
device = torch.device("cuda")
10+
# shape: (32, 50)
11+
tensor = torch.tensor([[3] * 50] * 32, device=device)
12+
# shape: (32, 50)
13+
mask = torch.tensor([[True] * 50, [True] * 30 + [False] * 20] * 16, device=device)
14+
begin_token = 1
15+
end_token = 2
16+
benchmark(util.add_sentence_boundary_token_ids, tensor, mask, begin_token, end_token)
17+
18+
19+
@requires_gpu
20+
def bench_remove_sentence_boundaries(benchmark):
21+
device = torch.device("cuda")
22+
# shape: (32, 50, 1)
23+
tensor = torch.tensor([[3] * 50] * 32, device=device).unsqueeze(-1)
24+
# shape: (32, 50)
25+
mask = torch.tensor([[True] * 50, [True] * 30 + [False] * 20] * 16, device=device)
26+
benchmark(util.remove_sentence_boundaries, tensor, mask)
27+
28+
29+
@requires_gpu
30+
def bench_create_tensor_then_send_to_device(benchmark):
31+
device = torch.device("cuda:0")
32+
33+
def create_tensor():
34+
return torch.rand((32, 50)).to(device)
35+
36+
benchmark(create_tensor)
37+
38+
39+
@requires_gpu
40+
def bench_create_tensor_directly_on_device(benchmark):
41+
device = torch.device("cuda:0")
42+
43+
def create_tensor():
44+
return torch.rand((32, 50), device=device)
45+
46+
benchmark(create_tensor)

benchmarks/pytest.ini

+2
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@
66
python_files = *_bench.py
77
python_functions = bench_* *_bench
88
python_classes =
9+
markers =
10+
gpu: marks tests that need at least one GPU

0 commit comments

Comments
 (0)