Evaluate on a token-weighted basis. (#2183)

brendan-ai2 · web-flow · commit e6ad6e9a90b5 · 2019-01-10T12:50:45.000-08:00
- Allow models to return a `"batch_weight"` key that will be used to weight each batch's loss.
- Per @matt-peters' suggestion.
- As performed in Calypso: https://github.com/allenai/calypso/blob/master/calypso/train.py#L699
- Remove the unused "loss_scale" parameter. It was never set in the training config, so removing it should be fairly safe.
diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py
@@ -81,6 +81,11 @@ def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argpar
                                default="",
                                help='a JSON structure used to override the experiment configuration')
 
+        subparser.add_argument('--batch-weight-key',
+                               type=str,
+                               default="",
+                               help='If non-empty, name of metric used to weight the loss on a per-batch basis.')
+
         subparser.set_defaults(func=evaluate_from_args)
 
         return subparser
@@ -89,7 +94,8 @@ def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argpar
 def evaluate(model: Model,
              instances: Iterable[Instance],
              data_iterator: DataIterator,
-             cuda_device: int) -> Dict[str, Any]:
+             cuda_device: int,
+             batch_weight_key: str) -> Dict[str, Any]:
     _warned_tqdm_ignores_underscores = False
     check_for_gpu(cuda_device)
     with torch.no_grad():
@@ -101,21 +107,34 @@ def evaluate(model: Model,
         logger.info("Iterating over dataset")
         generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
 
+        # Number of batches in instances.
         batch_count = 0
+        # Number of batches where the model produces a loss.
         loss_count = 0
+        # Cumulative weighted loss
         total_loss = 0.0
+        # Cumulative weight across all batches.
+        total_weight = 0.0
 
         for batch in generator_tqdm:
             batch_count += 1
             batch = util.move_to_device(batch, cuda_device)
-            loss = model(**batch).get("loss")
+            output_dict = model(**batch)
+            loss = output_dict.get("loss")
 
             metrics = model.get_metrics()
 
             if loss is not None:
                 loss_count += 1
-                metrics["loss"] = loss.item()
-                total_loss += loss.item()
+                if batch_weight_key:
+                    weight = output_dict[batch_weight_key].item()
+                else:
+                    weight = 1.0
+
+                total_weight += weight
+                total_loss += loss.item() * weight
+                # Report the average loss so far.
+                metrics["loss"] = total_loss / total_weight
 
             if (not _warned_tqdm_ignores_underscores and
                         any(metric_name.startswith("_") for metric_name in metrics)):
@@ -128,10 +147,11 @@ def evaluate(model: Model,
 
         final_metrics = model.get_metrics(reset=True)
         if loss_count > 0:
+            # Sanity check
             if loss_count != batch_count:
                 raise RuntimeError("The model you are trying to evaluate only sometimes " +
                                    "produced a loss!")
-            final_metrics["loss"] = total_loss/batch_count
+            final_metrics["loss"] = total_loss / total_weight
 
         return final_metrics
 
@@ -168,7 +188,7 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
     iterator = DataIterator.from_params(iterator_params)
     iterator.index_with(model.vocab)
 
-    metrics = evaluate(model, instances, iterator, args.cuda_device)
+    metrics = evaluate(model, instances, iterator, args.cuda_device, args.batch_weight_key)
 
     logger.info("Finished evaluating.")
     logger.info("Metrics:")
diff --git a/allennlp/commands/fine_tune.py b/allennlp/commands/fine_tune.py
@@ -70,6 +70,11 @@ def add_subparser(self, name: str, parser: argparse._SubParsersAction) -> argpar
                                default=False,
                                help='outputs tqdm status on separate lines and slows tqdm refresh rate')
 
+        subparser.add_argument('--batch-weight-key',
+                               type=str,
+                               default="",
+                               help='If non-empty, name of metric used to weight the loss on a per-batch basis.')
+
         subparser.set_defaults(func=fine_tune_model_from_args)
 
         return subparser
@@ -84,15 +89,17 @@ def fine_tune_model_from_args(args: argparse.Namespace):
                                     serialization_dir=args.serialization_dir,
                                     overrides=args.overrides,
                                     extend_vocab=args.extend_vocab,
-                                    file_friendly_logging=args.file_friendly_logging)
+                                    file_friendly_logging=args.file_friendly_logging,
+                                    batch_weight_key=args.batch_weight_key)
 
 
 def fine_tune_model_from_file_paths(model_archive_path: str,
                                     config_file: str,
                                     serialization_dir: str,
                                     overrides: str = "",
                                     extend_vocab: bool = False,
-                                    file_friendly_logging: bool = False) -> Model:
+                                    file_friendly_logging: bool = False,
+                                    batch_weight_key: str = "") -> Model:
     """
     A wrapper around :func:`fine_tune_model` which loads the model archive from a file.
 
@@ -121,14 +128,16 @@ def fine_tune_model_from_file_paths(model_archive_path: str,
                            params=params,
                            serialization_dir=serialization_dir,
                            extend_vocab=extend_vocab,
-                           file_friendly_logging=file_friendly_logging)
+                           file_friendly_logging=file_friendly_logging,
+                           batch_weight_key=batch_weight_key)
 
 
 def fine_tune_model(model: Model,
                     params: Params,
                     serialization_dir: str,
                     extend_vocab: bool = False,
-                    file_friendly_logging: bool = False) -> Model:
+                    file_friendly_logging: bool = False,
+                    batch_weight_key: str = "") -> Model:
     """
     Fine tunes the given model, using a set of parameters that is largely identical to those used
     for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
@@ -248,7 +257,13 @@ def fine_tune_model(model: Model,
     archive_model(serialization_dir, files_to_archive=params.files_to_archive)
 
     if test_data and evaluate_on_test:
-        test_metrics = evaluate(model, test_data, iterator, cuda_device=trainer._cuda_devices[0])  # pylint: disable=protected-access
+        test_metrics = evaluate(
+                model,
+                test_data,
+                iterator,
+                cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access
+                batch_weight_key=batch_weight_key
+        )
         for key, value in test_metrics.items():
             metrics["test_" + key] = value
 
diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py
@@ -385,7 +385,9 @@ def train_model(params: Params,
         logger.info("The model will be evaluated using the best epoch weights.")
         test_metrics = evaluate(
                 best_model, test_data, validation_iterator or iterator,
-                cuda_device=trainer._cuda_devices[0] # pylint: disable=protected-access
+                cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access,
+                # TODO(brendanr): Pass in an arg following Joel's trainer refactor.
+                batch_weight_key=""
         )
         for key, value in test_metrics.items():
             metrics["test_" + key] = value
diff --git a/allennlp/common/testing/model_test_case.py b/allennlp/common/testing/model_test_case.py
@@ -1,5 +1,5 @@
 import copy
-from typing import Any, Dict, Set, Union
+from typing import Any, Dict, Set, Union, Iterable
 
 from numpy.testing import assert_allclose
 import torch
@@ -197,7 +197,18 @@ def check_model_computes_gradients_correctly(model: Model,
                 print(f"Parameter: {name} had incorrect gradient: {grad}")
             raise Exception("Incorrect gradients found. See stdout for more info.")
 
-    def ensure_batch_predictions_are_consistent(self):
+    def ensure_batch_predictions_are_consistent(
+            self,
+            keys_to_ignore: Iterable[str] = ()):
+        """
+        Ensures that the model performs the same on a batch of instances as on individual instances.
+        Ignores metrics matching the regexp .*loss.* and those specified explicitly.
+
+        Parameters
+        ----------
+        keys_to_ignore : ``Iterable[str]``, optional (default=())
+            Names of metrics that should not be taken into account, e.g. "batch_weight".
+        """
         self.model.eval()
         single_predictions = []
         for i, instance in enumerate(self.instances):
@@ -215,6 +226,8 @@ def ensure_batch_predictions_are_consistent(self):
                     # Loss is particularly unstable; we'll just be satisfied if everything else is
                     # close.
                     continue
+                if key in keys_to_ignore:
+                    continue
                 single_predicted = single_predicted[0]
                 batch_predicted = batch_predictions[key][i]
                 if isinstance(single_predicted, torch.Tensor):
diff --git a/allennlp/models/bidirectional_lm.py b/allennlp/models/bidirectional_lm.py
@@ -1,5 +1,3 @@
-from typing import Union
-
 from allennlp.data.vocabulary import Vocabulary
 from allennlp.models.language_model import LanguageModel
 from allennlp.models.model import Model
@@ -33,10 +31,6 @@ class BidirectionalLanguageModel(LanguageModel):
     dropout: ``float``, optional (default: None)
         If specified, dropout is applied to the contextualized embeddings before computation of
         the softmax. The contextualized embeddings themselves are returned without dropout.
-    loss_scale: ``Union[float, str]``, optional (default: 1.0)
-        This scaling factor is applied to the average language model loss.
-        You can also specify ``"n_samples"`` in which case we compute total
-        loss across all predictions.
     num_samples: ``int``, optional (default: None)
         If provided, the model will use ``SampledSoftmaxLoss``
         with the specified number of samples. Otherwise, it will use
@@ -49,15 +43,13 @@ def __init__(self,
                  text_field_embedder: TextFieldEmbedder,
                  contextualizer: Seq2SeqEncoder,
                  dropout: float = None,
-                 loss_scale: Union[float, str] = 1.0,
                  num_samples: int = None,
                  sparse_embeddings: bool = False,
                  initializer: InitializerApplicator = None) -> None:
         super().__init__(vocab=vocab,
                          text_field_embedder=text_field_embedder,
                          contextualizer=contextualizer,
                          dropout=dropout,
-                         loss_scale=loss_scale,
                          num_samples=num_samples,
                          sparse_embeddings=sparse_embeddings,
                          bidirectional=True,
diff --git a/allennlp/models/language_model.py b/allennlp/models/language_model.py
@@ -77,10 +77,6 @@ class LanguageModel(Model):
     dropout: ``float``, optional (default: None)
         If specified, dropout is applied to the contextualized embeddings before computation of
         the softmax. The contextualized embeddings themselves are returned without dropout.
-    loss_scale: ``Union[float, str]``, optional (default: 1.0)
-        This scaling factor is applied to the average language model loss.
-        You can also specify ``"n_samples"`` in which case we compute total
-        loss across all predictions.
     num_samples: ``int``, optional (default: None)
         If provided, the model will use ``SampledSoftmaxLoss``
         with the specified number of samples. Otherwise, it will use
@@ -97,7 +93,6 @@ def __init__(self,
                  text_field_embedder: TextFieldEmbedder,
                  contextualizer: Seq2SeqEncoder,
                  dropout: float = None,
-                 loss_scale: Union[float, str] = 1.0,
                  num_samples: int = None,
                  sparse_embeddings: bool = False,
                  bidirectional: bool = False,
@@ -140,7 +135,6 @@ def __init__(self,
         else:
             self._dropout = lambda x: x
 
-        self._loss_scale = loss_scale
         if initializer is not None:
             initializer(self)
 
@@ -312,17 +306,12 @@ def forward(self,  # type: ignore
             self._last_average_loss[0] = average_loss.detach().item()
 
             if num_targets > 0:
-                # loss is directly minimized
-                if self._loss_scale == 'n_samples':
-                    scale_factor = num_targets.float()
-                else:
-                    scale_factor = self._loss_scale
-
                 return_dict.update({
-                        'loss': average_loss * scale_factor,
-                        'forward_loss': forward_loss * scale_factor / num_targets.float(),
-                        'backward_loss': (backward_loss * scale_factor / num_targets.float()
-                                          if backward_loss is not None else None)
+                        'loss': average_loss,
+                        'forward_loss': forward_loss / num_targets.float(),
+                        'backward_loss': (backward_loss / num_targets.float()
+                                          if backward_loss is not None else None),
+                        'batch_weight': num_targets.float()
                 })
             else:
                 # average_loss zero tensor, return it for all
diff --git a/allennlp/tests/commands/evaluate_test.py b/allennlp/tests/commands/evaluate_test.py
@@ -1,11 +1,40 @@
 # pylint: disable=invalid-name,no-self-use
 import argparse
 import json
+from typing import Iterator, List, Dict, Iterable
 
+import torch
 from flaky import flaky
 
-from allennlp.commands.evaluate import evaluate_from_args, Evaluate
+from allennlp.commands.evaluate import evaluate_from_args, Evaluate, evaluate
 from allennlp.common.testing import AllenNlpTestCase
+from allennlp.data import DataIterator, Instance
+from allennlp.data.dataset import Batch
+from allennlp.data.iterators.data_iterator import TensorDict
+from allennlp.models import Model
+
+
+class DummyIterator(DataIterator):
+    def __init__(self, outputs: List[TensorDict]) -> None:
+        super().__init__()
+        self._outputs = outputs
+
+    def __call__(self,
+                 instances: Iterable[Instance],
+                 num_epochs: int = None,
+                 shuffle: bool = True) -> Iterator[TensorDict]:
+        yield from self._outputs
+
+    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
+        raise NotImplementedError
+
+
+class DummyModel(Model):
+    def __init__(self) -> None:
+        super().__init__(None) # type: ignore
+
+    def forward(self, **kwargs) -> Dict[str, torch.Tensor]:  # type: ignore # pylint: disable=arguments-differ
+        return kwargs
 
 
 class TestEvaluate(AllenNlpTestCase):
@@ -16,6 +45,23 @@ def setUp(self):
         subparsers = self.parser.add_subparsers(title='Commands', metavar='')
         Evaluate().add_subparser('evaluate', subparsers)
 
+    def test_evaluate_calculates_average_loss(self):
+        losses = [7.0, 9.0, 8.0]
+        outputs = [{"loss": torch.Tensor([loss])} for loss in losses]
+        iterator = DummyIterator(outputs)
+        metrics = evaluate(DummyModel(), None, iterator, -1, "")
+        self.assertAlmostEqual(metrics["loss"], 8.0)
+
+    def test_evaluate_calculates_average_loss_with_weights(self):
+        losses = [7.0, 9.0, 8.0]
+        weights = [10, 2, 1.5]
+        inputs = zip(losses, weights)
+        outputs = [{"loss": torch.Tensor([loss]), "batch_weight": torch.Tensor([weight])}
+                   for loss, weight in inputs]
+        iterator = DummyIterator(outputs)
+        metrics = evaluate(DummyModel(), None, iterator, -1, "batch_weight")
+        self.assertAlmostEqual(metrics["loss"], (70 + 18 + 12)/13.5)
+
     @flaky
     def test_evaluate_from_args(self):
         kebab_args = ["evaluate", str(self.FIXTURES_ROOT / "bidaf" / "serialization" / "model.tar.gz"),
diff --git a/allennlp/tests/models/bidirectional_lm_test.py b/allennlp/tests/models/bidirectional_lm_test.py
@@ -16,14 +16,14 @@ def test_bidirectional_lm_can_train_save_load(self):
         self.ensure_model_can_train_save_and_load(self.param_file)
 
     def test_batch_predictions_are_consistent(self):
-        self.ensure_batch_predictions_are_consistent()
+        self.ensure_batch_predictions_are_consistent(keys_to_ignore=["batch_weight"])
 
     def test_forward_pass_runs_correctly(self):
         training_tensors = self.dataset.as_tensor_dict()
         result = self.model(**training_tensors)
 
-        assert set(result) == {"loss", "forward_loss", "backward_loss",
-                               "lm_embeddings", "noncontextual_token_embeddings", "mask"}
+        assert set(result) == {"loss", "forward_loss", "backward_loss", "lm_embeddings",
+                               "noncontextual_token_embeddings", "mask", "batch_weight"}
 
         # The model should preserve the BOS / EOS tokens.
         embeddings = result["lm_embeddings"]
diff --git a/allennlp/tests/models/language_model_test.py b/allennlp/tests/models/language_model_test.py
@@ -23,14 +23,14 @@ def test_unidirectional_language_model_can_train_save_and_load(self):
         self.ensure_model_can_train_save_and_load(self.param_file)
 
     def test_batch_predictions_are_consistent(self):
-        self.ensure_batch_predictions_are_consistent()
+        self.ensure_batch_predictions_are_consistent(keys_to_ignore=["batch_weight"])
 
     def test_forward_pass_runs_correctly(self):
         training_tensors = self.dataset.as_tensor_dict()
         result = self.model(**training_tensors)
 
-        assert set(result) == {"loss", "forward_loss", "backward_loss",
-                               "lm_embeddings", "noncontextual_token_embeddings", "mask"}
+        assert set(result) == {"loss", "forward_loss", "backward_loss", "lm_embeddings",
+                               "noncontextual_token_embeddings", "mask", "batch_weight"}
 
         # The model should preserve the BOS / EOS tokens.
         embeddings = result["lm_embeddings"]