allenai
diff --git a/‎CHANGELOG.md
+1 b/‎CHANGELOG.md
+1
diff --git a/‎allennlp/common/testing/__init__.py
+55 b/‎allennlp/common/testing/__init__.py
+55
diff --git a/‎allennlp/common/testing/distributed_test.py
+70 b/‎allennlp/common/testing/distributed_test.py
+70
diff --git a/‎allennlp/models/simple_tagger.py
+1-1 b/‎allennlp/models/simple_tagger.py
+1-1
diff --git a/‎allennlp/training/metrics/attachment_scores.py
+24-3 b/‎allennlp/training/metrics/attachment_scores.py
+24-3
diff --git a/‎allennlp/training/metrics/auc.py
+34 b/‎allennlp/training/metrics/auc.py
+34
diff --git a/‎allennlp/training/metrics/average.py
+13 b/‎allennlp/training/metrics/average.py
+13
diff --git a/‎allennlp/training/metrics/bleu.py
+23 b/‎allennlp/training/metrics/bleu.py
+23
@@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed testing models that only return a loss when they are in training mode.
 - Fixed a bug in `FromParams` that caused silent failure in case of the parameter type being `Optional[Union[...]]`.
 - Fixed a bug where the program crashes if `evaluation_data_loader` is an `AllennlpLazyDataset`.
+- Fixed evaluation of all metrics when using distributed training.
 
 ### Added
 
 
@@ -1,11 +1,16 @@
 """
 Utilities and helpers for writing tests.
 """
+from typing import Dict, Any, Optional, Union, Tuple, List
 import torch
+from torch.testing import assert_allclose
 import pytest
 
 from allennlp.common.testing.test_case import AllenNlpTestCase
 from allennlp.common.testing.model_test_case import ModelTestCase
+from allennlp.common.testing.distributed_test import run_distributed_test
+
+from allennlp.training.metrics import Metric
 
 
 _available_devices = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
@@ -45,3 +50,53 @@ def cpu_or_gpu(test_method):
     Decorator to indicate that a test should run on both CPU and GPU
     """
     return pytest.mark.gpu(test_method)
+
+
+# Helpers for testing distributed metrics
+
+
+def assert_metrics_values(
+    metrics: Dict[str, Any],
+    desired_values: Dict[str, Any],
+    rtol: float = 0.0001,
+    atol: float = 1e-05,
+):
+    """
+    Assert that every entry of `metrics` is close to the entry of `desired_values`
+    stored under the same key.
+
+    # Parameters
+
+    metrics : `Dict[str, Any]`
+        Computed metric values, keyed by metric name.
+    desired_values : `Dict[str, Any]`
+        Expected values. Only the keys that appear in `metrics` are looked up.
+    rtol : `float`, optional (default = `0.0001`)
+        Relative tolerance forwarded to `assert_allclose`.
+    atol : `float`, optional (default = `1e-05`)
+        Absolute tolerance forwarded to `assert_allclose`.
+    """
+    for name, actual in metrics.items():
+        assert_allclose(actual, desired_values[name], rtol=rtol, atol=atol)
+
+
+def global_distributed_metric(
+    global_rank: int,
+    world_size: int,
+    gpu_id: Union[int, torch.device],
+    metric: Metric,
+    metric_kwargs: Dict[str, List[Any]],
+    desired_values: Dict[str, Any],
+    exact: Union[bool, Tuple[float, float]] = True,
+):
+    """
+    Feed this process's share of the inputs to `metric` and check the value it reports.
+
+    # Parameters
+
+    global_rank : `int`
+        Rank of the calling process; used to index into each list in `metric_kwargs`.
+    world_size : `int`
+        Accepted as part of the call signature; not used in the body.
+    gpu_id : `Union[int, torch.device]`
+        Accepted as part of the call signature; not used in the body.
+    metric : `Metric`
+        The metric under test. It is called once and then queried with
+        `get_metric(False)`.
+    metric_kwargs : `Dict[str, List[Any]]`
+        Maps each keyword argument name of the metric call to a list holding one
+        value per process.
+    desired_values : `Dict[str, Any]`
+        Expected result. If neither this nor the metric's result is a dictionary,
+        both are wrapped under the key `"metric_value"` before comparison.
+    exact : `Union[bool, Tuple[float, float]]`, optional (default = `True`)
+        `True` compares with zero tolerance, `False` uses `rtol=0.0001` and
+        `atol=1e-05`, and a tuple is read as `(rtol, atol)`.
+    """
+    # Each keyword argument carries one value per process; pick the one for this rank.
+    rank_kwargs = {name: values[global_rank] for name, values in metric_kwargs.items()}
+    metric(**rank_kwargs)
+
+    computed = metric.get_metric(False)
+    if not (isinstance(computed, Dict) or isinstance(desired_values, Dict)):
+        computed = {"metric_value": computed}
+        desired_values = {"metric_value": desired_values}
+
+    # Resolve the tolerances handed to `assert_metrics_values`.
+    if isinstance(exact, bool):
+        rtol, atol = (0.0, 0.0) if exact else (0.0001, 1e-05)
+    else:
+        rtol, atol = exact[0], exact[1]
+
+    assert_metrics_values(computed, desired_values, rtol, atol)  # type: ignore
@@ -0,0 +1,70 @@
+import datetime
+from typing import List, Dict, Any, Tuple, Callable
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from allennlp.common.checks import check_for_gpu
+
+
+def init_process(
+    process_rank: int,
+    distributed_device_ids: List[int] = None,
+    world_size: int = 1,
+    func: Callable = None,
+    func_args: Tuple = None,
+    func_kwargs: Dict[str, Any] = None,
+    master_addr: str = "127.0.0.1",
+    master_port: int = 29500,
+):
+    """
+    Entry point of one worker process: join the process group, run `func`, then
+    wait at a barrier.
+
+    # Parameters
+
+    process_rank : `int`
+        Index of this process; also used as its global rank.
+    distributed_device_ids : `List[int]`
+        Device id per process. A non-negative id selects that CUDA device and the
+        `nccl` backend; a negative id selects the `gloo` backend.
+    world_size : `int`
+        Total number of processes. Must be greater than 1.
+    func : `Callable`
+        Called as `func(global_rank, world_size, gpu_id, *func_args, **func_kwargs)`.
+    func_args : `Tuple`, optional
+        Extra positional arguments for `func`. `None` is treated as no arguments.
+    func_kwargs : `Dict[str, Any]`, optional
+        Extra keyword arguments for `func`. `None` is treated as no arguments.
+    master_addr : `str`, optional (default = `"127.0.0.1"`)
+        Host part of the `tcp://` init method.
+    master_port : `int`, optional (default = `29500`)
+        Port part of the `tcp://` init method.
+    """
+    assert world_size > 1
+
+    global_rank = process_rank
+
+    gpu_id = distributed_device_ids[process_rank]  # type: ignore
+
+    # Arguments shared by both backends.
+    group_kwargs: Dict[str, Any] = {
+        "init_method": f"tcp://{master_addr}:{master_port}",
+        "world_size": world_size,
+        "rank": global_rank,
+    }
+
+    if gpu_id >= 0:
+        torch.cuda.set_device(int(gpu_id))
+        dist.init_process_group(backend="nccl", **group_kwargs)
+    else:
+        dist.init_process_group(
+            backend="gloo", timeout=datetime.timedelta(seconds=120), **group_kwargs
+        )
+
+    # The defaults for `func_args` / `func_kwargs` are `None`, which cannot be
+    # unpacked; fall back to empty containers so the defaults are usable.
+    extra_args = func_args if func_args is not None else ()
+    extra_kwargs = func_kwargs if func_kwargs is not None else {}
+
+    func(global_rank, world_size, gpu_id, *extra_args, **extra_kwargs)
+
+    # Keep every process alive until all of them have finished running `func`.
+    dist.barrier()
+
+
+def run_distributed_test(
+    device_ids: List[int] = None, func: Callable = None, *args, **kwargs
+):
+    """
+    This runs the `func` in a simulated distributed environment.
+
+    # Parameters
+
+    device_ids: `List[int]`
+        List of devices. There need to be at least 2 devices. Default is [-1, -1].
+
+    func: `Callable`
+        `func` needs to be global for spawning the processes, so that it can be pickled.
+
+    *args, **kwargs
+        Forwarded to `init_process` as the extra positional and keyword arguments
+        for `func`.
+    """
+
+    # Build the default per call instead of sharing one list object between calls.
+    if device_ids is None:
+        device_ids = [-1, -1]
+
+    check_for_gpu(device_ids)
+    # One process is started per listed device.
+    nprocs = world_size = len(device_ids)
+    mp.start_processes(
+        init_process,
+        args=(device_ids, world_size, func, args, kwargs),
+        nprocs=nprocs,
+        start_method="fork",
+    )
@@ -213,7 +213,7 @@ def get_metrics(self, reset: bool = False) -> Dict[str, float]:
         }
 
         if self.calculate_span_f1:
-            f1_dict = self._f1_metric.get_metric(reset=reset)
+            f1_dict = self._f1_metric.get_metric(reset)
             if self._verbose_metrics:
                 metrics_to_return.update(f1_dict)
             else:
 
@@ -1,8 +1,10 @@
-from typing import Optional, List
+from typing import Optional, List, Union
 
 from overrides import overrides
 import torch
+import torch.distributed as dist
 
+from allennlp.common.util import is_distributed
 from allennlp.training.metrics.metric import Metric
 
 
@@ -57,6 +59,7 @@ def __call__(  # type: ignore
             predicted_indices, predicted_labels, gold_indices, gold_labels, mask
         )
         predicted_indices, predicted_labels, gold_indices, gold_labels, mask = detached
+        device = predicted_indices.device
 
         if mask is None:
             mask = torch.ones_like(predicted_indices).bool()
@@ -78,14 +81,30 @@ def __call__(  # type: ignore
         correct_labels_and_indices = correct_indices * correct_labels
         labeled_exact_match = (correct_labels_and_indices + ~mask).prod(dim=-1)
 
+        if is_distributed():
+            dist.all_reduce(correct_indices, op=dist.ReduceOp.SUM)
+            dist.all_reduce(unlabeled_exact_match, op=dist.ReduceOp.SUM)
+            dist.all_reduce(correct_labels_and_indices, op=dist.ReduceOp.SUM)
+            dist.all_reduce(labeled_exact_match, op=dist.ReduceOp.SUM)
+
         self._unlabeled_correct += correct_indices.sum()
         self._exact_unlabeled_correct += unlabeled_exact_match.sum()
         self._labeled_correct += correct_labels_and_indices.sum()
         self._exact_labeled_correct += labeled_exact_match.sum()
         self._total_sentences += correct_indices.size(0)
         self._total_words += correct_indices.numel() - (~mask).sum()
 
-    def get_metric(self, reset: bool = False):
+        if is_distributed():
+            _total_sentences = torch.tensor(self._total_sentences).to(device)
+            _total_words = torch.tensor(self._total_words).to(device)
+            dist.all_reduce(_total_sentences, op=dist.ReduceOp.SUM)
+            dist.all_reduce(_total_words, op=dist.ReduceOp.SUM)
+            self._total_sentences = _total_sentences.item()
+            self._total_words = _total_words.item()
+
+    def get_metric(
+        self, reset: bool = False, cuda_device: Union[int, torch.device] = torch.device("cpu"),
+    ):
         """
         # Returns
 
@@ -95,6 +114,7 @@ def get_metric(self, reset: bool = False):
         labeled_attachment_score = 0.0
         unlabeled_exact_match = 0.0
         labeled_exact_match = 0.0
+
         if self._total_words > 0.0:
             unlabeled_attachment_score = float(self._unlabeled_correct) / float(self._total_words)
             labeled_attachment_score = float(self._labeled_correct) / float(self._total_words)
@@ -105,12 +125,13 @@ def get_metric(self, reset: bool = False):
             labeled_exact_match = float(self._exact_labeled_correct) / float(self._total_sentences)
         if reset:
             self.reset()
-        return {
+        metrics = {
             "UAS": unlabeled_attachment_score,
             "LAS": labeled_attachment_score,
             "UEM": unlabeled_exact_match,
             "LEM": labeled_exact_match,
         }
+        return metrics
 
     @overrides
     def reset(self):
 
@@ -2,8 +2,10 @@
 
 from overrides import overrides
 import torch
+import torch.distributed as dist
 from sklearn import metrics
 
+from allennlp.common.util import is_distributed
 from allennlp.common.checks import ConfigurationError
 from allennlp.training.metrics.metric import Metric
 
@@ -82,7 +84,38 @@ def __call__(
             [self._all_gold_labels, torch.masked_select(gold_labels, mask).long()], dim=0
         )
 
+        if is_distributed():
+            world_size = dist.get_world_size()
+            device = gold_labels.device
+
+            # Check if batch lengths are equal.
+            _all_batch_lengths = [torch.tensor(0) for i in range(world_size)]
+            dist.all_gather(
+                _all_batch_lengths, torch.tensor(len(self._all_predictions), device=device)
+            )
+            _all_batch_lengths = [batch_length.item() for batch_length in _all_batch_lengths]
+
+            if len(set(_all_batch_lengths)) > 1:
+                # Subsequent dist.all_gather() calls currently do not handle tensors of different length.
+                raise RuntimeError(
+                    "Distributed aggregation for AUC is currently not supported for batches of unequal length."
+                )
+
+            _all_predictions = [
+                torch.zeros(self._all_predictions.shape, device=device) for i in range(world_size)
+            ]
+
+            _all_gold_labels = [
+                torch.zeros(self._all_gold_labels.shape, device=device, dtype=torch.long)
+                for i in range(world_size)
+            ]
+            dist.all_gather(_all_predictions, self._all_predictions)
+            dist.all_gather(_all_gold_labels, self._all_gold_labels)
+            self._all_predictions = torch.cat(_all_predictions, dim=0)
+            self._all_gold_labels = torch.cat(_all_gold_labels, dim=0)
+
     def get_metric(self, reset: bool = False):
+
         if self._all_gold_labels.shape[0] == 0:
             return 0.5
         false_positive_rates, true_positive_rates, _ = metrics.roc_curve(
@@ -91,6 +124,7 @@ def get_metric(self, reset: bool = False):
             pos_label=self._positive_label,
         )
         auc = metrics.auc(false_positive_rates, true_positive_rates)
+
         if reset:
             self.reset()
         return auc
 
@@ -1,5 +1,9 @@
 from overrides import overrides
 
+import torch
+import torch.distributed as dist
+
+from allennlp.common.util import is_distributed
 from allennlp.training.metrics.metric import Metric
 
 
@@ -26,6 +30,14 @@ def __call__(self, value):
         """
         self._total_value += list(self.detach_tensors(value))[0]
         self._count += 1
+        if is_distributed():
+            device = torch.device("cpu")
+            _count = torch.tensor(self._count).to(device)
+            _total_value = torch.tensor(self._total_value).to(device)
+            dist.all_reduce(_count, op=dist.ReduceOp.SUM)
+            dist.all_reduce(_total_value, op=dist.ReduceOp.SUM)
+            self._count = _count.item()
+            self._total_value = _total_value.item()
 
     @overrides
     def get_metric(self, reset: bool = False):
@@ -34,6 +46,7 @@ def get_metric(self, reset: bool = False):
 
         The average of all values that were passed to `__call__`.
         """
+
         average_value = self._total_value / self._count if self._count > 0 else 0
         if reset:
             self.reset()
 
@@ -4,7 +4,9 @@
 
 from overrides import overrides
 import torch
+import torch.distributed as dist
 
+from allennlp.common.util import is_distributed
 from allennlp.training.metrics.metric import Metric
 
 
@@ -116,10 +118,21 @@ def __call__(
         None
         """
         predictions, gold_targets = self.detach_tensors(predictions, gold_targets)
+        device = gold_targets.device
+        if is_distributed():
+            world_size = dist.get_world_size()
+
         for ngram_size, _ in enumerate(self._ngram_weights, start=1):
             precision_matches, precision_totals = self._get_modified_precision_counts(
                 predictions, gold_targets, ngram_size
             )
+            if is_distributed():
+                _precision_matches = torch.tensor(precision_matches).to(device)
+                _precision_totals = torch.tensor(precision_totals).to(device)
+                dist.all_reduce(_precision_matches, op=dist.ReduceOp.SUM)
+                dist.all_reduce(_precision_totals, op=dist.ReduceOp.SUM)
+                precision_matches = _precision_matches.item() / world_size
+                precision_totals = _precision_totals.item() / world_size
             self._precision_matches[ngram_size] += precision_matches
             self._precision_totals[ngram_size] += precision_totals
         if not self._exclude_indices:
@@ -133,8 +146,17 @@ def __call__(
             valid_gold_targets_mask = get_valid_tokens_mask(gold_targets, self._exclude_indices)
             self._reference_lengths += valid_gold_targets_mask.sum().item()
 
+        if is_distributed():
+            _prediction_lengths = torch.tensor(self._prediction_lengths).to(device)
+            _reference_lengths = torch.tensor(self._reference_lengths).to(device)
+            dist.all_reduce(_prediction_lengths, op=dist.ReduceOp.SUM)
+            dist.all_reduce(_reference_lengths, op=dist.ReduceOp.SUM)
+            self._prediction_lengths = _prediction_lengths.item()
+            self._reference_lengths = _reference_lengths.item()
+
     @overrides
     def get_metric(self, reset: bool = False) -> Dict[str, float]:
+
         brevity_penalty = self._get_brevity_penalty()
         ngram_scores = (
             weight
@@ -145,6 +167,7 @@ def get_metric(self, reset: bool = False) -> Dict[str, float]:
             for n, weight in enumerate(self._ngram_weights, start=1)
         )
         bleu = brevity_penalty * math.exp(sum(ngram_scores))
+
         if reset:
             self.reset()
         return {"BLEU": bleu}
Original file line number	Diff line number	Diff line change
`@@ -213,7 +213,7 @@ def get_metrics(self, reset: bool = False) -> Dict[str, float]:`
`213`	`213`	`}`
`214`	`214`
`215`	`215`	`if self.calculate_span_f1:`
`216`		`- f1_dict = self._f1_metric.get_metric(reset=reset)`
	`216`	`+ f1_dict = self._f1_metric.get_metric(reset)`
`217`	`217`	`if self._verbose_metrics:`
`218`	`218`	`metrics_to_return.update(f1_dict)`
`219`	`219`	`else:`