
Remove scattering for multi-GPU training. #2200

Merged Jan 18, 2019 (87 commits). Changes shown are from 80 commits.

Commits
5d179a4
Transformer ELMo
brendan-ai2 Nov 21, 2018
2db75b4
wip
brendan-ai2 Dec 1, 2018
f7deed3
Add bidirectional transformer token embedder
brendan-ai2 Dec 4, 2018
c9de1ec
transformer elmo config template
brendan-ai2 Dec 4, 2018
634b4a2
MORE
brendan-ai2 Dec 4, 2018
e4a7b51
Works
brendan-ai2 Dec 5, 2018
9eb6e46
Add broken layer norm.
brendan-ai2 Dec 5, 2018
ac425a4
Address some more comments
brendan-ai2 Dec 5, 2018
f203cde
Merge branch 'lm_without_dataset_modifications_2' into lm_without_dat…
brendan-ai2 Dec 5, 2018
bde39fe
Fix for vidurj
brendan-ai2 Dec 5, 2018
4b3a81c
easy feedback
brendan-ai2 Dec 10, 2018
595b668
Fix norm issue
brendan-ai2 Dec 10, 2018
731e69c
Rename
brendan-ai2 Dec 10, 2018
4522f1c
Start and end tokens in reader
brendan-ai2 Dec 10, 2018
d091cc8
comment fix
brendan-ai2 Dec 10, 2018
971e600
fixes
brendan-ai2 Dec 10, 2018
24b763b
style
brendan-ai2 Dec 10, 2018
71e2cce
fix docs
brendan-ai2 Dec 10, 2018
01a111a
Merge branch 'master' into lm_without_dataset_modifications_2
brendan-ai2 Dec 10, 2018
975060a
Merge branch 'master' into lm_without_dataset_modifications_2
brendan-ai2 Dec 13, 2018
100f07f
Merge branch 'lm_without_dataset_modifications_2' into lm_without_dat…
brendan-ai2 Dec 13, 2018
5dcd700
cleanup
brendan-ai2 Dec 14, 2018
87e6241
Merge branch 'master' into lm_without_dataset_modifications_3
brendan-ai2 Dec 14, 2018
4b3ce38
Bidirectional fixture
brendan-ai2 Dec 14, 2018
1338afb
Test
brendan-ai2 Dec 14, 2018
fa86367
cleanup
brendan-ai2 Dec 14, 2018
7cd29aa
Merge branch 'master' into lm_without_dataset_modifications_3
brendan-ai2 Dec 14, 2018
f6e57d1
Merge branch 'lm_without_dataset_modifications_3' of github.com:brend…
brendan-ai2 Dec 14, 2018
6ec4a6e
works
brendan-ai2 Dec 16, 2018
d7c0208
Model file
brendan-ai2 Dec 16, 2018
c54534d
update parser config
brendan-ai2 Dec 16, 2018
16ca024
Merge branch 'lm_without_dataset_modifications_3' of github.com:brend…
brendan-ai2 Dec 16, 2018
a8e8eb6
fixes
brendan-ai2 Dec 16, 2018
53a283c
formatting
brendan-ai2 Dec 16, 2018
75f03fd
Type fixes
brendan-ai2 Dec 16, 2018
e8ad0c6
Renames
brendan-ai2 Dec 16, 2018
8cfe033
Merge branch 'master' into lm_without_dataset_modifications_3
brendan-ai2 Dec 16, 2018
13f6d83
Merge branch 'lm_without_dataset_modifications_3' of github.com:brend…
brendan-ai2 Dec 17, 2018
726cf13
Merge branch 'master' into lm_without_dataset_modifications_3
brendan-ai2 Dec 17, 2018
8dacca9
another test, jsonnet improvements
brendan-ai2 Dec 17, 2018
c6694b9
Merge branch 'lm_without_dataset_modifications_3' of github.com:brend…
brendan-ai2 Dec 17, 2018
9e617b2
Merge branch 'lm_without_dataset_modifications_3' of github.com:brend…
brendan-ai2 Dec 17, 2018
468300f
docs
brendan-ai2 Dec 17, 2018
30894d0
Merge branch 'lm_without_dataset_modifications_3' into lm_train_fixes
brendan-ai2 Dec 17, 2018
e79119f
Drop scatter
brendan-ai2 Dec 17, 2018
724cf89
Potentially works? On one shard at least.
brendan-ai2 Dec 17, 2018
219d026
Merge branch 'lm_train_fixes' of github.com:brendan-ai2/allennlp into…
brendan-ai2 Dec 17, 2018
8d81d3f
Fix
brendan-ai2 Dec 17, 2018
ae8be54
Merge branch 'lm_train_fixes' of github.com:brendan-ai2/allennlp into…
brendan-ai2 Dec 17, 2018
2cf180e
Added failing test case
matt-gardner Dec 18, 2018
f68e647
Merge branch 'master' into lm_train_fixes
brendan-ai2 Dec 18, 2018
e0d71c4
Merge branch 'master' into lm_train_fixes
brendan-ai2 Dec 18, 2018
2d06b4b
Merge branch 'lm_train_fixes' of github.com:brendan-ai2/allennlp into…
brendan-ai2 Dec 18, 2018
7662c0f
hacks
brendan-ai2 Dec 18, 2018
a48e494
more hacks
brendan-ai2 Dec 18, 2018
560f99f
respond to feedback
brendan-ai2 Dec 19, 2018
4da2ff3
Add todo
brendan-ai2 Dec 19, 2018
50c1e15
Merge branch 'master' into lm_without_dataset_modifications_3
brendan-ai2 Dec 19, 2018
f93b6dc
lint
brendan-ai2 Dec 19, 2018
4f667ab
Add todos
brendan-ai2 Dec 20, 2018
0cbb57a
Merge branch 'master' into lm_without_dataset_modifications_3
brendan-ai2 Dec 20, 2018
64bbfbd
Merge branch 'lm_without_dataset_modifications_3' into lm_train_fixes
brendan-ai2 Dec 21, 2018
255045a
Merge branch 'master' into lm_train_fixes
brendan-ai2 Dec 21, 2018
862b87b
cleanups
brendan-ai2 Dec 21, 2018
cc51bbc
Merge branch 'master' into lm_train_fixes
brendan-ai2 Dec 21, 2018
eb8419c
Fix batch size
brendan-ai2 Dec 21, 2018
4856e77
Try for more
brendan-ai2 Dec 22, 2018
e329e72
3k samples
brendan-ai2 Dec 22, 2018
9104044
2k
brendan-ai2 Dec 22, 2018
4a133b1
log grad stats and learning rate
brendan-ai2 Jan 11, 2019
86c76fb
Merge branch 'pr-2199' into lm_train_fixes
brendan-ai2 Jan 15, 2019
8844663
merge
brendan-ai2 Jan 17, 2019
f4726a6
fix
brendan-ai2 Jan 17, 2019
2323bbf
Fix
brendan-ai2 Jan 17, 2019
98629f1
merge
brendan-ai2 Jan 17, 2019
a68db07
drop some logging
brendan-ai2 Jan 17, 2019
6eda737
stash pop
brendan-ai2 Jan 17, 2019
63644f1
fixes
brendan-ai2 Jan 17, 2019
79ed01a
cleanup
brendan-ai2 Jan 17, 2019
d3e4921
Add todos
brendan-ai2 Jan 17, 2019
34b2adf
fixes
brendan-ai2 Jan 18, 2019
c7a5a96
merge
brendan-ai2 Jan 18, 2019
7c19f04
fixes, delete ScatterableList, scatter_kwargs, etc.
brendan-ai2 Jan 18, 2019
95f5804
More cleanup
brendan-ai2 Jan 18, 2019
ee5df46
cleanup
brendan-ai2 Jan 18, 2019
2e6f990
drop no-op changes
brendan-ai2 Jan 18, 2019
12e62d4
Merge branch 'master' into lm_train_fixes
brendan-ai2 Jan 18, 2019
3 changes: 3 additions & 0 deletions allennlp/data/iterators/bucket_iterator.py
@@ -124,6 +124,9 @@ def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Itera
if excess:
batches.append(Batch(excess))

# TODO(brendanr): Add multi-GPU friendly grouping, i.e. group
# num_gpu batches together, shuffle and then expand the groups.
# This guards against imbalanced batches across GPUs.
move_to_front = self._biggest_batch_first and len(batches) > 1
if move_to_front:
# We'll actually pop the last _two_ batches, because the last one might not be full.
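The TODO in bucket_iterator.py above sketches multi-GPU friendly grouping: collect num_gpu batches into a group, shuffle at the group level, then expand the groups back out, so the similarly sized batches produced by bucketing stay together within a single multi-GPU step. A minimal illustration of that idea (not part of this PR; the helper name and signature are hypothetical):

```python
import random
from typing import List, TypeVar

A = TypeVar("A")

def shuffle_in_groups(batches: List[A], num_gpus: int) -> List[A]:
    """Shuffle batches while keeping each run of `num_gpus` consecutive
    batches together, so one training step sees batches of similar size."""
    # Chunk the (bucketed, size-sorted) batch list into groups, one group per step.
    groups = [batches[i:i + num_gpus] for i in range(0, len(batches), num_gpus)]
    # Shuffle whole groups rather than individual batches.
    random.shuffle(groups)
    # Expand the groups back into a flat list of batches.
    return [batch for group in groups for batch in group]
```

Shuffling whole groups is what guards against one GPU in a step receiving a much larger batch than its peers.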
2 changes: 2 additions & 0 deletions allennlp/data/iterators/data_iterator.py
@@ -125,6 +125,8 @@ def __call__(self,
tensor_dicts = self._cache[key]

if shuffle:
# TODO(brendanr): How can we handle this shuffle in a way
# that respects multi-GPU friendly grouping?
random.shuffle(tensor_dicts)
for tensor_dict in tensor_dicts:
if self._track_epoch:
25 changes: 24 additions & 1 deletion allennlp/tests/training/trainer_test.py
@@ -20,7 +20,8 @@
from allennlp.common.params import Params
from allennlp.models.simple_tagger import SimpleTagger
from allennlp.data.iterators import BasicIterator
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader, WikiTablesDatasetReader
from allennlp.models.archival import load_archive
from allennlp.models.model import Model


@@ -91,6 +92,9 @@ def test_trainer_can_run(self):

@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device registered.")
def test_trainer_can_run_cuda(self):
# Trainer expects the model to already be on the correct device.
self.model.cuda(0)

trainer = Trainer(self.model, self.optimizer,
self.iterator, self.instances, num_epochs=2,
cuda_device=0)
@@ -99,6 +103,8 @@ def test_trainer_can_run_cuda(self):
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need multiple GPUs.")
def test_trainer_can_run_multiple_gpu(self):
# Trainer expects the model to already be on some GPU in the multi-GPU setting.
self.model.cuda(0)

class MetaDataCheckWrapper(Model):
"""
@@ -132,6 +138,23 @@ def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # type: ignore # pylint
assert 'peak_gpu_1_memory_MB' in metrics
assert isinstance(metrics['peak_gpu_1_memory_MB'], int)

@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need multiple GPUs.")
def test_production_rule_field_with_multiple_gpus(self):
wikitables_dir = 'allennlp/tests/fixtures/data/wikitables/'
wikitables_reader = WikiTablesDatasetReader(tables_directory=wikitables_dir,
dpd_output_directory=wikitables_dir + 'dpd_output/')
instances = wikitables_reader.read(wikitables_dir + 'sample_data.examples')
archive_path = self.FIXTURES_ROOT / 'semantic_parsing' / 'wikitables' / 'serialization' / 'model.tar.gz'
model = load_archive(archive_path).model
# Trainer expects the model to already be on some GPU in the multi-GPU setting.
model.cuda(0)

multigpu_iterator = BasicIterator(batch_size=4)
multigpu_iterator.index_with(model.vocab)
trainer = Trainer(model, self.optimizer, multigpu_iterator, instances, num_epochs=2, cuda_device=[0, 1])
trainer.train()

def test_trainer_can_resume_training(self):
trainer = Trainer(self.model, self.optimizer,
self.iterator, self.instances,
47 changes: 29 additions & 18 deletions allennlp/training/trainer.py
@@ -1,5 +1,6 @@

import logging
import math
import os
import time
import re
@@ -13,10 +14,10 @@
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.common.util import (dump_metrics, gpu_memory_mb, parse_cuda_device, peak_memory_mb,
get_frozen_and_tunable_parameter_names)
get_frozen_and_tunable_parameter_names, lazy_groups_of)
from allennlp.common.tqdm import Tqdm
from allennlp.data.instance import Instance
from allennlp.data.iterators.data_iterator import DataIterator
from allennlp.data.iterators.data_iterator import DataIterator, TensorDict
from allennlp.data.vocabulary import Vocabulary
from allennlp.models.model import Model
from allennlp.nn import util as nn_util
@@ -212,14 +213,16 @@ def __init__(self,
def rescale_gradients(self) -> Optional[float]:
return training_util.rescale_gradients(self.model, self._grad_norm)

def batch_loss(self, batch: torch.Tensor, for_training: bool) -> torch.Tensor:
def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch.Tensor:
"""
Does a forward pass on the given batch and returns the ``loss`` value in the result.
Does a forward pass on the given batches and returns the ``loss`` value in the result.
If ``for_training`` is `True` also applies regularization penalty.
"""
if self._multiple_gpu:
output_dict = training_util.data_parallel(batch, self.model, self._cuda_devices)
output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
else:
assert len(batch_group) == 1
batch = batch_group[0]
batch = nn_util.move_to_device(batch, self._cuda_devices[0])
output_dict = self.model(**batch)

@@ -251,11 +254,14 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
# Set the model to "train" mode.
self.model.train()

num_gpus = len(self._cuda_devices)

# Get tqdm for the training batches
train_generator = self.iterator(self.train_data,
num_epochs=1,
shuffle=self.shuffle)
num_training_batches = self.iterator.get_num_batches(self.train_data)
raw_train_generator = self.iterator(self.train_data,
num_epochs=1,
shuffle=self.shuffle)
train_generator = lazy_groups_of(raw_train_generator, num_gpus)
num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data)/num_gpus)
self._last_log = time.time()
last_save_time = time.time()

@@ -265,18 +271,20 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:

histogram_parameters = set(self.model.get_parameters_for_histogram_tensorboard_logging())


logger.info("Training")
train_generator_tqdm = Tqdm.tqdm(train_generator,
total=num_training_batches)
cumulative_batch_size = 0
for batch in train_generator_tqdm:
for batch_group in train_generator_tqdm:
batches_this_epoch += 1
self._batch_num_total += 1
batch_num_total = self._batch_num_total

self.optimizer.zero_grad()

loss = self.batch_loss(batch, for_training=True)
loss = self.batch_loss(batch_group, for_training=True)

if torch.isnan(loss):
raise ValueError("nan loss encountered")

@@ -325,7 +333,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
self._tensorboard.log_histograms(self.model, histogram_parameters)

if self._log_batch_size_period:
cur_batch = training_util.get_batch_size(batch)
cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group])
cumulative_batch_size += cur_batch
if (batches_this_epoch - 1) % self._log_batch_size_period == 0:
average = cumulative_batch_size/batches_this_epoch
@@ -361,17 +369,20 @@ def _validation_loss(self) -> Tuple[float, int]:
else:
val_iterator = self.iterator

val_generator = val_iterator(self._validation_data,
num_epochs=1,
shuffle=False)
num_validation_batches = val_iterator.get_num_batches(self._validation_data)
num_gpus = len(self._cuda_devices)

raw_val_generator = val_iterator(self._validation_data,
num_epochs=1,
shuffle=False)
val_generator = lazy_groups_of(raw_val_generator, num_gpus)
num_validation_batches = math.ceil(val_iterator.get_num_batches(self._validation_data)/num_gpus)
val_generator_tqdm = Tqdm.tqdm(val_generator,
total=num_validation_batches)
batches_this_epoch = 0
val_loss = 0
for batch in val_generator_tqdm:
for batch_group in val_generator_tqdm:

loss = self.batch_loss(batch, for_training=False)
loss = self.batch_loss(batch_group, for_training=False)
if loss is not None:
# You shouldn't necessarily have to compute a loss for validation, so we allow for
# `loss` to be None. We need to be careful, though - `batches_this_epoch` is
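A note on the grouping above: `lazy_groups_of` (imported from `allennlp.common.util`) lazily pulls `num_gpus` batches at a time from the raw iterator, so each training or validation step consumes one batch per GPU. A functionally equivalent sketch, for illustration only (not the library's exact code):

```python
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

A = TypeVar("A")

def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]:
    """Yield lists of up to `group_size` items; the final group may be short."""
    iterator = iter(iterable)
    while True:
        group = list(islice(iterator, group_size))
        if not group:
            return
        yield group

# With 2 GPUs, 5 batches become 3 groups: [[b0, b1], [b2, b3], [b4]].
# This is why the step counts above use math.ceil(num_batches / num_gpus),
# and why data_parallel below only asserts len(batch_group) <= len(cuda_devices).
```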
16 changes: 11 additions & 5 deletions allennlp/training/util.py
@@ -18,6 +18,7 @@
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data import Instance
from allennlp.data.iterators import DataIterator
from allennlp.data.iterators.data_iterator import TensorDict
from allennlp.models.model import Model
from allennlp.models.archival import CONFIG_NAME
from allennlp.nn import util as nn_util
@@ -228,24 +229,29 @@ def create_serialization_dir(
"does not exist. There is nothing to recover from.")
os.makedirs(serialization_dir, exist_ok=True)

def data_parallel(batch, model: Model, cuda_devices: List) -> Dict[str, torch.Tensor]:
def data_parallel(batch_group: List[TensorDict],
model: Model,
cuda_devices: List) -> Dict[str, torch.Tensor]:
"""
Performs a forward pass using multiple GPUs. This is a simplification
of torch.nn.parallel.data_parallel to support the allennlp model
interface.
"""
inputs, module_kwargs = scatter_kwargs((), batch, cuda_devices, 0)
assert len(batch_group) <= len(cuda_devices)

used_device_ids = cuda_devices[:len(inputs)]
inputs = [()] * len(batch_group)
Contributor: inputs is supposed to be a list of empty tuples? This never gets updated before getting passed to parallel_apply.

Contributor Author: Added a comment to clarify. You can see that () was passed to the old scatter_kwargs as well.

# We pass all our arguments as kwargs. Create a list of empty tuples of the
# correct shape to serve as (non-existent) positional arguments.

moved = [nn_util.move_to_device(batch, device)
for batch, device in zip(batch_group, cuda_devices)]

used_device_ids = cuda_devices[:len(moved)]
replicas = replicate(model, used_device_ids)
outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)
outputs = parallel_apply(replicas, inputs, moved, used_device_ids)

# Only the 'loss' is needed.
# a (num_gpu, ) tensor with loss on each GPU
losses = gather([output['loss'].unsqueeze(0) for output in outputs], used_device_ids[0], 0)
return {'loss': losses.mean()}


def enable_gradient_clipping(model: Model, grad_clipping: Optional[float]) -> None:
if grad_clipping is not None:
for parameter in model.parameters():
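For readers less familiar with the `torch.nn.parallel` primitives used in `data_parallel` above: `parallel_apply` takes one tuple of positional arguments and one dict of keyword arguments per replica, so the list of empty tuples plus the list of moved batch dicts means each replica is called as `model(**batch)` on its own device. A small self-contained sketch of the same pattern (the toy model, tensors, and `toy_data_parallel` name are invented for illustration; running it requires at least two GPUs):

```python
import torch
from torch.nn.parallel import gather, parallel_apply, replicate

class ToyModel(torch.nn.Module):
    """Stand-in for an AllenNLP model: forward takes kwargs, returns a dict with 'loss'."""
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(4, 1)

    def forward(self, tokens: torch.Tensor) -> dict:
        return {"loss": self.linear(tokens).mean()}

def toy_data_parallel(batch_group, model, cuda_devices):
    assert len(batch_group) <= len(cuda_devices)
    # One batch per device, moved to that device up front (no scattering).
    moved = [{key: tensor.cuda(device) for key, tensor in batch.items()}
             for batch, device in zip(batch_group, cuda_devices)]
    used_device_ids = cuda_devices[:len(moved)]
    replicas = replicate(model, used_device_ids)
    # Empty positional args; each replica receives its batch as keyword arguments.
    inputs = [()] * len(moved)
    outputs = parallel_apply(replicas, inputs, moved, used_device_ids)
    # Gather the per-GPU losses onto the first device and average them.
    losses = gather([output["loss"].unsqueeze(0) for output in outputs], used_device_ids[0], 0)
    return {"loss": losses.mean()}

# Example usage (>= 2 GPUs):
#   model = ToyModel().cuda(0)
#   batches = [{"tokens": torch.randn(8, 4)}, {"tokens": torch.randn(8, 4)}]
#   print(toy_data_parallel(batches, model, [0, 1]))
```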
9 changes: 5 additions & 4 deletions training_config/bidirectional_language_model.jsonnet
@@ -21,7 +21,7 @@ local BASE_READER = {
"type": "elmo_characters"
}
},
"max_sequence_length": 500,
"max_sequence_length": 400,
"start_tokens": ["<S>"],
"end_tokens": ["</S>"]
};
@@ -34,7 +34,7 @@
// samples in every batch.
"batch_size": 512 * NUM_GPUS,
"sorting_keys": [["source", "num_tokens"]],
"maximum_samples_per_batch": ["num_tokens", NUM_GPUS * 1000]
"maximum_samples_per_batch": ["num_tokens", 2000]
Contributor Author: There's a minor backwards compatibility issue here. We're effectively multiplying the batch size (for multi-GPU users) by the number of GPUs. In practice this will result in some OOMs for users that were running close to their memory limits. Given that we had an experimental warning for that use case I think this is okay, but I'm curious if you have other thoughts.

Contributor: This seems fine to me, too.

Contributor Author: Thanks.

};

{
@@ -117,7 +117,7 @@ local BASE_ITERATOR = {
// The multiprocess dataset reader and iterator use many file descriptors,
// so we need to increase the ulimit depending on the size of this queue.
// See https://pytorch.org/docs/stable/multiprocessing.html#file-descriptor-file-descriptor
// for a description of the underlying issue. `ulimit -n 4096` has sufficed,
// for a description of the underlying issue. `ulimit -n 8192` has sufficed,
// but that number could use tuning.
"output_queue_size": 500
},
@@ -139,6 +139,7 @@
// See https://github.com/allenai/calypso/blob/master/bin/train_transformer_lm1b.py#L51.
// Adjusted based on our sample size relative to Calypso's.
"warmup_steps": 6000
}
},
"should_log_learning_rate": true
}
}