This repository was archived by the owner on Dec 16, 2022. It is now read-only.

FairScale integration #5242

Merged · 104 commits · Jul 19, 2021
Changes from 96 commits

Commits (104 total)
a4d7165
start
epwalsh Apr 27, 2021
8a82679
fix up
epwalsh Apr 27, 2021
88f5206
start
epwalsh Apr 27, 2021
ba8f1ef
fix up
epwalsh Apr 27, 2021
56998d2
Merge branch 'fairscale' of github.com:allenai/allennlp into fairscale
epwalsh Apr 27, 2021
a991275
DdpWrapper
epwalsh Apr 27, 2021
fdff8bb
generalize GradScaler
epwalsh Apr 28, 2021
178c689
OSS
epwalsh Apr 29, 2021
43a86c2
fp16 params
epwalsh Apr 29, 2021
453e75a
idk
epwalsh May 2, 2021
5895bf6
undo CHANGELOG for now
epwalsh May 3, 2021
5738bb4
revert API
epwalsh May 3, 2021
4645a4b
refactor
epwalsh May 3, 2021
dbf502f
CHANGELOG
epwalsh May 3, 2021
91184c8
fixes
epwalsh May 4, 2021
2548b0f
refactor
epwalsh May 4, 2021
8a06ab7
wrap modules
epwalsh May 5, 2021
d12c017
refactor
epwalsh May 5, 2021
6d85e13
fix when no checkpointer
epwalsh May 6, 2021
c606451
fix merge conflicts
epwalsh May 18, 2021
0571b47
fix loading
epwalsh May 19, 2021
338028f
fix cicular import issue
epwalsh May 19, 2021
8e803ae
upgrade fairscale
epwalsh May 20, 2021
04dde82
Merge branch 'main' into fairscale
epwalsh May 26, 2021
3ad797d
fix race condition when extracting files with cached_path
epwalsh May 26, 2021
6d7593b
fix merge conflicts
epwalsh May 27, 2021
9d7730b
better logging
epwalsh May 27, 2021
7cc8f7d
improve logging
epwalsh May 27, 2021
c7c856b
fix
epwalsh May 27, 2021
f8c42a3
fix
epwalsh May 27, 2021
73b5e43
more logging
epwalsh May 27, 2021
f846a1b
keep state_dict tensors on CPU
epwalsh May 27, 2021
44dba38
remove annoying logging
epwalsh May 27, 2021
8756605
set gradients to none
epwalsh May 27, 2021
8ef9fa6
move params to CPU in mixed precision
epwalsh May 27, 2021
59d83a9
fixes
epwalsh May 27, 2021
bc8a819
find_unused_parameters default to False
epwalsh May 28, 2021
6993c0d
add TODO
epwalsh May 28, 2021
5784ccc
add more tests, make grad scaler configurable
epwalsh May 28, 2021
10b1b8c
update with main
epwalsh Jun 3, 2021
bc93d8a
patch models branch temporarily
epwalsh Jun 3, 2021
310503d
wow, good start
epwalsh Jun 3, 2021
09d2a38
fix Dockerfile
epwalsh Jun 3, 2021
d5cdbc1
beam search as a parameter
epwalsh Jun 3, 2021
1d9c052
ignore import error of optional dependencies
epwalsh Jun 3, 2021
ca667fc
fix
epwalsh Jun 3, 2021
e1734c2
add debugging env variables
epwalsh Jun 3, 2021
79c13b5
more debugging
epwalsh Jun 3, 2021
51555fe
update Dockerfile
epwalsh Jun 3, 2021
b4cad9e
increase shared memory for test container
epwalsh Jun 3, 2021
1ac97b3
fix
epwalsh Jun 3, 2021
326e07b
log optional import failures
epwalsh Jun 3, 2021
0ba2474
allow disabling checkpointer
epwalsh Jun 4, 2021
5893fae
fix deadlock when checkpointer disabled
epwalsh Jun 4, 2021
0a436e3
fix unbound variable
epwalsh Jun 4, 2021
01c3b5a
Merge branch 'main' into fairscale
epwalsh Jun 4, 2021
e3cab77
start using nn.Sequential to prepare for activation checkpointing
epwalsh Jun 4, 2021
47d97bc
Revert "start using nn.Sequential to prepare for activation checkpoin…
epwalsh Jun 4, 2021
3bb1287
fix merge conflicts
epwalsh Jun 7, 2021
ed39623
add chkpt wrapper class with default torch implementation
epwalsh Jun 8, 2021
055a7c9
get FairScale activation/grad checkpointing working
epwalsh Jun 9, 2021
9ebb521
pin fairscale to commit
epwalsh Jun 9, 2021
a303b5c
ignore line-too-long in setup
epwalsh Jun 9, 2021
2bceaaa
add xfail test for TorchCheckpointWrapper
epwalsh Jun 9, 2021
49ed5f5
fix bugs
epwalsh Jun 9, 2021
bc956bc
update fairscale commit
epwalsh Jun 11, 2021
747de54
use mixin class instead of flags
epwalsh Jun 11, 2021
db95c99
clean up APIs for wrapper classes
epwalsh Jun 11, 2021
313c252
clean up test
epwalsh Jun 11, 2021
048c300
Merge branch 'main' into fairscale
epwalsh Jun 11, 2021
cdb6768
fix merge conflicts
epwalsh Jun 15, 2021
a2bbfa0
fix merge conflicts
epwalsh Jun 21, 2021
5581df8
add Adafactor optimizer
epwalsh Jun 22, 2021
e799eac
implement state checkpointing
epwalsh Jun 23, 2021
d0aa97a
fix
epwalsh Jun 23, 2021
baf796b
fix test
epwalsh Jun 24, 2021
54fd6ca
Merge branch 'main' into fairscale
epwalsh Jun 24, 2021
f190030
update FairScale commit pin
epwalsh Jun 24, 2021
228b73b
fix merge conflicts
epwalsh Jun 24, 2021
5cfb722
add Module class
epwalsh Jun 24, 2021
a37b53d
Merge branch 'main' into fairscale
epwalsh Jun 28, 2021
587c228
doc fixes
epwalsh Jun 29, 2021
52fb7fd
improve repr method of IncompatibleKeys
epwalsh Jun 29, 2021
7a5fd41
clean up FSDP tests
epwalsh Jun 29, 2021
4078d46
Merge branch 'main' into fairscale
epwalsh Jun 29, 2021
d8fa9bb
changelog clean up
epwalsh Jun 29, 2021
6072a6a
Merge branch 'main' into fairscale
dirkgr Jul 1, 2021
afc81c6
make fairscale a required dependency
epwalsh Jul 7, 2021
173828f
rename 'get_grad_scaler' -> 'init_grad_scaler'
epwalsh Jul 8, 2021
0bc1d19
make hooks private methods
epwalsh Jul 8, 2021
5258dc8
make _post_load_state_dict pure
epwalsh Jul 8, 2021
7a130cb
fix comment
epwalsh Jul 8, 2021
3378a0c
use hardlink
epwalsh Jul 8, 2021
7dcd9e9
rename DdpWrapper -> DdpAccelerator
epwalsh Jul 8, 2021
984ac6c
Merge branch 'main' into fairscale
epwalsh Jul 8, 2021
33496e2
format fix
epwalsh Jul 8, 2021
90757a9
update FairScale to latest release
epwalsh Jul 14, 2021
b82f027
fix GradientDescientTrainer.get_best_weights_path
epwalsh Jul 14, 2021
50db06c
fix typo
epwalsh Jul 14, 2021
920ef23
Merge branch 'main' into fairscale
epwalsh Jul 19, 2021
a0a239e
clarify docstring
epwalsh Jul 19, 2021
b62b0c3
Merge branch 'main' into fairscale
epwalsh Jul 19, 2021
b84cf85
update CHANGELOG
epwalsh Jul 19, 2021
2436671
revert CI patch
epwalsh Jul 19, 2021
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
@@ -162,6 +162,8 @@ jobs:
. .venv/bin/activate
git clone https://github.com/allenai/allennlp-models.git
cd allennlp-models
# TODO: remove
git checkout fairscale
pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt

- name: Debug info
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -19,10 +19,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `TransformerTextField`, for cases where you don't care about AllenNLP's advanced text handling capabilities.
- Added `TransformerModule._post_load_pretrained_state_dict_hook()` method. Can be used to modify `missing_keys` and `unexpected_keys` after
loading a pretrained state dictionary. This is useful when tying weights, for example.
- Added a module `allennlp.nn.parallel` with a new base class, `DdpAccelerator`, which generalizes
PyTorch's `DistributedDataParallel` wrapper to support other implementations. Two implementations of
this class are provided. The default is `TorchDdpAccelerator` (registered as "torch"), which is just a thin wrapper around
`DistributedDataParallel`. The other is `FairScaleFsdpAccelerator`, which wraps FairScale's
[`FullyShardedDataParallel`](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html).
You can specify the `DdpAccelerator` in the "distributed" section of a configuration file under the key "ddp_accelerator".
- Added a module `allennlp.nn.checkpoint` with a new base class, `CheckpointWrapper`, for implementations
of activation/gradient checkpointing. Two implementations are provided. The default is `TorchCheckpointWrapper` (registered as "torch"),
which exposes [PyTorch's checkpoint functionality](https://pytorch.org/docs/stable/checkpoint.html).
The other is `FairScaleCheckpointWrapper`, which exposes the more flexible
[checkpointing functionality from FairScale](https://fairscale.readthedocs.io/en/latest/api/nn/checkpoint/checkpoint_activations.html).
- The `Model` base class now takes a `ddp_accelerator` parameter (an instance of `DdpAccelerator`) which will be available as
`self.ddp_accelerator` during distributed training. This is useful when, for example, instantiating submodules in your
model's `__init__()` method and wrapping them with `self.ddp_accelerator.wrap_module()`. See the `allennlp.modules.transformer.t5`
module for an example, or the sketch just below this list.
- Added an end-to-end test for the Transformer Toolkit.
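To make the `DdpAccelerator` and `Model` entries above concrete, here is a minimal sketch of a model that wraps a submodule through the new `ddp_accelerator` parameter. It assumes the keyword can simply be forwarded to `Model.__init__()` and that `wrap_module()` returns the wrapped module; the class name and the `Linear` layer are made up for illustration.

```python
from typing import Dict, Optional

import torch
from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.nn.parallel import DdpAccelerator


@Model.register("toy_sharded_model")  # hypothetical name, only for this sketch
class ToyShardedModel(Model):
    def __init__(self, vocab: Vocabulary, ddp_accelerator: Optional[DdpAccelerator] = None) -> None:
        super().__init__(vocab, ddp_accelerator=ddp_accelerator)
        encoder = torch.nn.Linear(16, 16)  # stand-in for a large submodule
        if self.ddp_accelerator is not None:
            # In distributed training this lets the accelerator (e.g. FSDP) wrap,
            # and potentially shard, the submodule as it is constructed.
            encoder = self.ddp_accelerator.wrap_module(encoder)
        self.encoder = encoder

    def forward(self, features: torch.Tensor) -> Dict[str, torch.Tensor]:
        projected = self.encoder(features)
        return {"loss": projected.sum()}
```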

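The `CheckpointWrapper` API itself is not shown in this diff, so the sketch below only exercises the underlying PyTorch checkpoint functionality that the default `TorchCheckpointWrapper` is described as exposing; the block and tensor shapes are arbitrary.

```python
import torch
from torch.utils.checkpoint import checkpoint

block = torch.nn.Sequential(
    torch.nn.Linear(128, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 128),
)

x = torch.randn(4, 128, requires_grad=True)

# Activations inside `block` are not kept during the forward pass; they are
# recomputed during backward, trading extra compute for lower memory use.
y = checkpoint(block, x)
y.sum().backward()
```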
### Fixed

- Fixed a misspelling: the parameter `contructor_extras` in `Lazy()` is now correctly called `constructor_extras`.
- Fixed a broken link in the `allennlp.fairness.fairness_metrics.Separation` docs.
- Ensured all `allennlp` submodules are imported with `allennlp.common.plugins.import_plugins()`.
- Fixed `IndexOutOfBoundsException` in `MultiOptimizer` when checking if optimizer received any parameters.
@@ -38,6 +54,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed

- Changed behavior of `MultiOptimizer` so that while a default optimizer is still required, an error is not thrown if the default optimizer receives no parameters.
- The type of the `grad_norm` parameter of `GradientDescentTrainer` is now `Union[float, bool]`,
with a default value of `False`. `False` means gradients are not rescaled and the gradient
norm is never even calculated. `True` means the gradients are still not rescaled but the gradient
norm is calculated and passed on to callbacks. A `float` value means gradients are rescaled.
Reviewer comment (Member):

I haven't seen the code yet, but I'm not too wild about this API. That means you have to know whether some other component needs the gradient norm or not. I'd rather provide a function called get_grad_norm() or something like that, which calculates it lazily.
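A rough sketch of the lazily computed alternative suggested above; `get_grad_norm()` is hypothetical and not part of this PR. A `float` value of `grad_norm` would instead rescale gradients via something like `torch.nn.utils.clip_grad_norm_`.

```python
from typing import Iterable, Optional

import torch


def get_grad_norm(parameters: Iterable[torch.nn.Parameter]) -> Optional[torch.Tensor]:
    """Total L2 norm of all gradients, computed only when a caller actually asks for it."""
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    if not grads:
        return None
    return torch.norm(torch.stack([torch.norm(g) for g in grads]))
```

A callback that wants the norm could call this on demand, so training runs that never look at it pay nothing for the computation.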

- Made the epsilon parameter for the layer normalization in token embeddings configurable.

### Removed
5 changes: 5 additions & 0 deletions Dockerfile.test
@@ -15,6 +15,11 @@ ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
LABEL com.nvidia.volumes.needed="nvidia_driver"

# These environment variables are helpful for debugging.
# See https://pytorch.org/docs/stable/distributed.html#common-environment-variables for more info.
ENV NCCL_DEBUG INFO
ENV NCCL_DEBUG_SUBSYS ALL

WORKDIR /stage/allennlp

# Install torch ecosystem first. This build arg should be in the form of a version requirement,
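Outside the test image, the same NCCL debugging output can be turned on per process; a small sketch, assuming the variables are exported before the NCCL process group is created:

```python
import os

# Must be set before torch.distributed.init_process_group(backend="nccl")
# so that NCCL picks them up when it initializes.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "ALL"
```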
3 changes: 2 additions & 1 deletion Makefile
@@ -100,6 +100,7 @@ install :
pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt
# These nltk packages are used by the 'checklist' module.
$(NLTK_DOWNLOAD_CMD)

#
# Documention helpers.
#
@@ -175,4 +176,4 @@ docker-test-image :

.PHONY : docker-test-run
docker-test-run :
$(DOCKER_RUN_CMD) $(DOCKER_GPUS) $(DOCKER_TEST_IMAGE_NAME) $(ARGS)
$(DOCKER_RUN_CMD) --shm-size 2G $(DOCKER_GPUS) $(DOCKER_TEST_IMAGE_NAME) $(ARGS)
78 changes: 60 additions & 18 deletions allennlp/commands/train.py
@@ -26,7 +26,8 @@
from allennlp.data import DatasetReader, Vocabulary
from allennlp.data import DataLoader
from allennlp.models.archival import archive_model, CONFIG_NAME, verify_include_in_archive
from allennlp.models.model import _DEFAULT_WEIGHTS, Model
from allennlp.models.model import Model
from allennlp.nn.parallel import DdpAccelerator
from allennlp.training.trainer import Trainer
from allennlp.training import util as training_util

@@ -131,6 +132,7 @@ def train_model_from_file(
include_package: List[str] = None,
dry_run: bool = False,
file_friendly_logging: bool = False,
return_model: Optional[bool] = None,
) -> Optional[Model]:
"""
A wrapper around [`train_model`](#train_model) which loads the params from a file.
@@ -160,11 +162,16 @@ def train_model_from_file(
file_friendly_logging : `bool`, optional (default=`False`)
If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
down tqdm's output to only once every 10 seconds.
return_model : `Optional[bool]`, optional (default = `None`)
Whether or not to return the final model. If not specified, this defaults to `False` for
distributed training and `True` otherwise.

# Returns

best_model : `Optional[str]`
The path to the archived model with the best weights or `None` if in dry run.
best_model : `Optional[Model]`
The model with the best epoch weights or `None` if in dry run.
The model with the best epoch weights or `None`, depending on the value of `return_model` and `dry_run`.
"""
# Load the experiment config from a file and pass it to `train_model`.
params = Params.from_file(parameter_filename, overrides)
@@ -177,6 +184,7 @@ def train_model_from_file(
include_package=include_package,
dry_run=dry_run,
file_friendly_logging=file_friendly_logging,
return_model=return_model,
)


@@ -189,6 +197,7 @@ def train_model(
include_package: List[str] = None,
dry_run: bool = False,
file_friendly_logging: bool = False,
return_model: Optional[bool] = None,
) -> Optional[Model]:
"""
Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data
@@ -216,11 +225,14 @@ def train_model(
file_friendly_logging : `bool`, optional (default=`False`)
If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
down tqdm's output to only once every 10 seconds.
return_model : `Optional[bool]`, optional (default = `None`)
Whether or not to return the final model. If not specified, this defaults to `False` for
distributed training and `True` otherwise.

# Returns

best_model : `Optional[Model]`
The model with the best epoch weights or `None` if in dry run.
The model with the best epoch weights or `None`, depending on the value of `return_model` and `dry_run`.
"""
common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

@@ -233,6 +245,8 @@ def train_model(
include_in_archive = params.pop("include_in_archive", None)
verify_include_in_archive(include_in_archive)

model: Optional[Model] = None

distributed_params = params.params.pop("distributed", None)
# If distributed isn't in the config and the config contains strictly
# one cuda device, we just run a single training process.
@@ -245,11 +259,6 @@ def train_model(
dry_run=dry_run,
file_friendly_logging=file_friendly_logging,
)

if not dry_run:
archive_model(serialization_dir, include_in_archive=include_in_archive)
return model

# Otherwise, we are running multiple processes for training.
else:
common_logging.prepare_global_logging(
@@ -323,15 +332,22 @@ def train_model(
device_ids,
file_friendly_logging,
include_in_archive,
Params(distributed_params),
),
nprocs=num_procs,
)
if dry_run:
return None
else:
archive_model(serialization_dir, include_in_archive=include_in_archive)
model = Model.load(params, serialization_dir)
return model

if not dry_run:
archive_model(serialization_dir, include_in_archive=include_in_archive)
else:
return None

if return_model is None:
return model # model may or may not be `None`.
elif return_model is True:
return model if model is not None else Model.load(params, serialization_dir)
else:
return None


def _train_worker(
@@ -347,6 +363,7 @@ def _train_worker(
distributed_device_ids: List[int] = None,
file_friendly_logging: bool = False,
include_in_archive: List[str] = None,
distributed_params: Optional[Params] = None,
) -> Optional[Model]:
"""
Helper to train the configured model/experiment. In distributed mode, this is spawned as a
@@ -383,6 +400,8 @@ def _train_worker(
down tqdm's output to only once every 10 seconds.
include_in_archive : `List[str]`, optional
Paths relative to `serialization_dir` that should be archived in addition to the default ones.
distributed_params : `Optional[Params]`, optional
Additional distributed params.

# Returns

@@ -404,8 +423,11 @@ def _train_worker(

include_package = include_package or []

ddp_accelerator: Optional[DdpAccelerator] = None

if distributed:
assert distributed_device_ids is not None
assert distributed_params is not None

# Since the worker is spawned and not forked, the extra imports need to be done again.
# Both the ones from the plugins and the ones from `include_package`.
@@ -426,16 +448,17 @@ def _train_worker(
# In distributed training, the configured device is always going to be a list.
# The corresponding gpu id for the particular worker is obtained by picking the id
# from the device list with the rank as index
gpu_id = distributed_device_ids[process_rank] # type: ignore
gpu_id = int(distributed_device_ids[process_rank]) # type: ignore

# Till now, "cuda_device" might not be set in the trainer params.
# But a worker trainer needs to only know about its specific GPU id.
params["trainer"]["local_rank"] = process_rank
params["trainer"]["cuda_device"] = gpu_id
params["trainer"]["world_size"] = world_size
params["trainer"]["distributed"] = True

if gpu_id >= 0:
torch.cuda.set_device(int(gpu_id))
torch.cuda.set_device(gpu_id)
dist.init_process_group(
backend="nccl",
init_method=f"tcp://{primary_addr}:{primary_port}",
@@ -449,6 +472,16 @@ def _train_worker(
world_size=world_size,
rank=global_rank,
)

if "ddp_accelerator" in distributed_params:
ddp_accelerator_params = distributed_params.pop("ddp_accelerator")
ddp_accelerator = DdpAccelerator.from_params(
ddp_accelerator_params,
local_rank=process_rank,
world_size=world_size,
cuda_device=gpu_id,
)

logging.info(
f"Process group of world size {world_size} initialized "
f"for distributed training in worker {global_rank}"
@@ -458,6 +491,7 @@ def _train_worker(
params=params,
serialization_dir=serialization_dir,
local_rank=process_rank,
ddp_accelerator=ddp_accelerator,
)

if dry_run:
Expand All @@ -470,7 +504,7 @@ def _train_worker(
metrics = train_loop.run()
except KeyboardInterrupt:
# if we have completed an epoch, try to create a model archive.
if primary and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
if primary:
best_weights_path = train_loop.trainer.get_best_weights_path()
if best_weights_path is None:
logging.info(
@@ -581,6 +615,7 @@ def from_partial_objects(
test_data_path: Any = None,
evaluate_on_test: bool = False,
batch_weight_key: str = "",
ddp_accelerator: Optional[DdpAccelerator] = None,
) -> "TrainModel":
"""
This method is intended for use with our `FromParams` logic, to construct a `TrainModel`
@@ -667,6 +702,10 @@ def from_partial_objects(
batch_weight_key: `str`, optional (default=`""`)
The name of metric used to weight the loss on a per-batch basis. This is only used
during evaluation on final test data, if you've specified `evaluate_on_test=True`.

ddp_accelerator : `Optional[DdpAccelerator]`, optional (default = `None`)
A `DdpAccelerator` to use in the distributed trainer. Passed to the model and the trainer.

"""
# Train data loader.
data_loaders: Dict[str, DataLoader] = {
Expand Down Expand Up @@ -724,7 +763,9 @@ def from_partial_objects(

vocabulary_ = vocabulary.construct(instances=instance_generator)

model_ = model.construct(vocab=vocabulary_, serialization_dir=serialization_dir)
model_ = model.construct(
vocab=vocabulary_, serialization_dir=serialization_dir, ddp_accelerator=ddp_accelerator
)

# Initializing the model can have side effect of expanding the vocabulary.
# Save the vocab only in the primary. In the degenerate non-distributed
Expand All @@ -744,6 +785,7 @@ def from_partial_objects(
data_loader=data_loaders["train"],
validation_data_loader=data_loaders.get("validation"),
local_rank=local_rank,
ddp_accelerator=ddp_accelerator,
)
assert trainer_ is not None

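For reference, `_train_worker` above pops the accelerator out of the `distributed` section of the experiment config. A configuration that opts in might contain something like the following, shown as the Python dict a `Params` object would wrap; only the `"torch"` registration is confirmed by the changelog, the rest of the keys are schematic:

```python
config = {
    # ... "dataset_reader", "model", "data_loader", "trainer", etc. ...
    "distributed": {
        "cuda_devices": [0, 1],
        # Constructed with DdpAccelerator.from_params() in _train_worker.
        "ddp_accelerator": {
            "type": "torch",
        },
    },
}
```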
2 changes: 1 addition & 1 deletion allennlp/common/from_params.py
@@ -458,7 +458,7 @@ def construct_arg(

value_cls = args[0]
subextras = create_extras(value_cls, extras)
return Lazy(value_cls, params=deepcopy(popped_params), contructor_extras=subextras) # type: ignore
return Lazy(value_cls, params=deepcopy(popped_params), constructor_extras=subextras) # type: ignore

# For any other kind of iterable, we will just assume that a list is good enough, and treat
# it the same as List. This condition needs to be at the end, so we don't catch other kinds
6 changes: 4 additions & 2 deletions allennlp/common/lazy.py
@@ -50,11 +50,13 @@ def __init__(
self,
constructor: Union[Type[T], Callable[..., T]],
params: Optional[Params] = None,
contructor_extras: Optional[Dict[str, Any]] = None,
constructor_extras: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
self._constructor = constructor
self._params = params or Params({})
self._constructor_extras = contructor_extras or {}
self._constructor_extras = constructor_extras or {}
self._constructor_extras.update(kwargs)

@property
def constructor(self) -> Callable[..., T]:
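To illustrate the corrected keyword and the new `**kwargs` handling above, a small usage sketch; it assumes `Lazy.construct()` merges `constructor_extras` into the final constructor call, and `Greeter` is made up:

```python
from allennlp.common.lazy import Lazy


class Greeter:
    def __init__(self, greeting: str, name: str) -> None:
        self.message = f"{greeting}, {name}!"


# Extra kwargs passed to Lazy() itself now land in the same constructor_extras dict.
lazy_greeter = Lazy(Greeter, constructor_extras={"greeting": "Hello"})
greeter = lazy_greeter.construct(name="world")
print(greeter.message)  # Hello, world!
```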
2 changes: 1 addition & 1 deletion allennlp/common/testing/model_test_case.py
@@ -121,7 +121,7 @@ def ensure_model_can_train_save_and_load(
"""
save_dir = self.TEST_DIR / "save_and_load_test"
archive_file = save_dir / "model.tar.gz"
model = train_model_from_file(param_file, save_dir, overrides=overrides)
model = train_model_from_file(param_file, save_dir, overrides=overrides, return_model=True)
assert model is not None

metrics_file = save_dir / "metrics.json"
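The test helper above is the call site that motivates the new `return_model` flag. Using it directly might look like this; the config path and serialization directory are placeholders:

```python
from allennlp.commands.train import train_model_from_file

# With return_model=True the trained Model instance is returned even for runs
# where the new default (e.g. distributed training) would be to return None.
model = train_model_from_file(
    "experiments/my_experiment.jsonnet",  # placeholder path
    "/tmp/my_serialization_dir",
    return_model=True,
)
assert model is not None
```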
14 changes: 9 additions & 5 deletions allennlp/common/util.py
@@ -418,12 +418,14 @@ def peak_gpu_memory() -> Dict[int, int]:
if not torch.cuda.is_available():
return {}

device = torch.cuda.current_device()

results_dict: Dict[int, int] = {}
if is_distributed():
# If the backend is not 'nccl', we're training on CPU.
if dist.get_backend() != "nccl":
return {}

device = torch.cuda.current_device()
global_rank = dist.get_rank()
world_size = dist.get_world_size()
peak_bytes = torch.cuda.max_memory_allocated(device)
Expand All @@ -433,13 +435,15 @@ def peak_gpu_memory() -> Dict[int, int]:

dist.all_gather(gather_results, peak_bytes_tensor)

results_dict: Dict[int, int] = {}
for peak_bytes_tensor in gather_results:
results_dict[int(peak_bytes_tensor[0])] = int(peak_bytes_tensor[1])

return results_dict
else:
return {0: torch.cuda.max_memory_allocated()}
results_dict = {0: torch.cuda.max_memory_allocated()}

# Reset peak stats.
torch.cuda.reset_max_memory_allocated(device)

return results_dict


def ensure_list(iterable: Iterable[A]) -> List[A]:
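The `peak_gpu_memory()` change above resets the CUDA peak-memory counter after reading it, so successive calls report per-interval peaks rather than an all-time high. A standalone sketch of that pattern using the underlying PyTorch calls (tensor size is arbitrary):

```python
import torch

if torch.cuda.is_available():
    device = torch.cuda.current_device()

    x = torch.randn(1024, 1024, device=device)
    peak_bytes = torch.cuda.max_memory_allocated(device)
    print(f"Peak GPU memory this interval: {peak_bytes} bytes")

    # Reset so the next reading only reflects allocations made after this point.
    torch.cuda.reset_max_memory_allocated(device)
```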