
Remove the need to iterate the model parameters for freezing and speedup #176

Closed · wants to merge 3 commits
7 changes: 6 additions & 1 deletion vissl/config/defaults.yaml
@@ -258,6 +258,9 @@ config:
# how many times the model should be checkpointed. User should tune this parameter
# and find the number that offers best memory saving and compute tradeoff.
NUM_ACTIVATION_CHECKPOINTING_SPLITS: 2
+# setup for Fairscale sharded DDP
+SHARDED_DDP_SETUP:
+  reduce_buffer_size: -1
# ----------------------------------------------------------------------------------- #
# Feature evaluation settings
# ----------------------------------------------------------------------------------- #
@@ -537,7 +540,7 @@ config:
num_prototypes: [3000] # automatically inferred from model HEAD settings
temp_hard_assignment_iters: 0
# for dumping the debugging info in case loss becomes NaN
-output_dir: "" # automatically inferred and set to checkpoint dir
+output_dir: "." # automatically inferred and set to checkpoint dir
queue:
queue_length: 0 # automatically adjusted to ensure queue_length % global batch size = 0
start_iter: 0
@@ -593,6 +596,8 @@ config:
# ----------------------------------------------------------------------------------- #
OPTIMIZER:
name: "sgd"
+# whether to shard optimizer state as per ZeRO https://arxiv.org/abs/1910.02054
+use_zero: False
use_larc: False # supported for SGD only for now
larc_config:
clip: False
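For context, a minimal sketch of how the two new knobs above (OPTIMIZER.use_zero and SHARDED_DDP_SETUP.reduce_buffer_size) are typically consumed with fairscale. This is not VISSL's trainer code: the helper name, the SGD hyperparameters, and the treatment of -1 as "keep the library default" are assumptions made for illustration.

```python
import torch
from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
from fairscale.optim.oss import OSS


def shard_model_and_optimizer(model: torch.nn.Module, reduce_buffer_size: int = -1):
    """Illustrative ZeRO-style wrapping; not VISSL's actual code path."""
    # OSS wraps a regular optimizer class and partitions its state across the
    # data-parallel ranks (ZeRO, https://arxiv.org/abs/1910.02054).
    optimizer = OSS(
        params=model.parameters(),
        optim=torch.optim.SGD,  # the base optimizer whose state gets sharded
        lr=0.1,
        momentum=0.9,
    )
    # ShardedDataParallel takes the sharded optimizer so it can route each reduced
    # gradient to the rank that owns the corresponding shard. reduce_buffer_size
    # controls gradient bucketing; here -1 is treated as "use the library default",
    # mirroring the sentinel value in the YAML above (an assumption).
    kwargs = {} if reduce_buffer_size < 0 else {"reduce_buffer_size": reduce_buffer_size}
    model = ShardedDDP(model, optimizer, **kwargs)
    return model, optimizer
```

Actually running this requires an initialized torch.distributed process group, which VISSL sets up elsewhere in its training setup.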
12 changes: 12 additions & 0 deletions vissl/hooks/state_update_hooks.py
@@ -260,6 +260,18 @@ def on_backward(self, task: "tasks.ClassyTask") -> None:
map_params_to_iters = {}
for to_map in task.config.MODEL.TEMP_FROZEN_PARAMS_ITER_MAP:
map_params_to_iters[to_map[0]] = to_map[1]

+# get the maximum iterations until which the params are frozen.
+# if the iterations are past the maximum iterations freezing any
+# param, we simply return.
+max_iterations = max(list(map_params_to_iters.values()))
+if task.iteration >= max_iterations:
+    if task.iteration == max_iterations:
+        logging.info(
+            f"No parameters grad removed from now on: {task.iteration}"
+        )
+    return

for name, p in task.model.named_parameters():
if (
name in map_params_to_iters
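The body of the loop over named_parameters() is truncated in the diff above. As a rough standalone illustration of the freezing logic plus the new early exit, here is a sketch: the parameter names are made up, and reducing the grad removal to `p.grad = None` under an `iteration < freeze_iters` check is a guess at the general shape, not VISSL's exact hook code.

```python
import torch

# Same shape as MODEL.TEMP_FROZEN_PARAMS_ITER_MAP: [parameter name, number of
# iterations it stays frozen]. The names below are hypothetical.
TEMP_FROZEN_PARAMS_ITER_MAP = [
    ["trunk.layer1.weight", 1000],
    ["heads.prototypes.weight", 313],
]


def remove_grads_of_frozen_params(model: torch.nn.Module, iteration: int) -> None:
    map_params_to_iters = {name: iters for name, iters in TEMP_FROZEN_PARAMS_ITER_MAP}
    # Past the largest freeze horizon nothing can still be frozen, so we skip
    # the named_parameters() walk entirely -- this is the speedup in the PR.
    if iteration >= max(map_params_to_iters.values()):
        return
    for name, p in model.named_parameters():
        if name in map_params_to_iters and iteration < map_params_to_iters[name]:
            # Dropping the gradient keeps the optimizer from updating the param.
            p.grad = None
```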
2 changes: 1 addition & 1 deletion vissl/utils/checkpoint.py
@@ -45,7 +45,7 @@ def get_checkpoint_folder(config: AttrDict):
makedir(odir)
assert PathManager.exists(
config.CHECKPOINT.DIR
-), "Please specify config.CHECKPOINT.DIR parameter. It should not be None."
+), f"Please specify config.CHECKPOINT.DIR parameter. Invalid: {config.CHECKPOINT.DIR}"
return odir


22 changes: 17 additions & 5 deletions vissl/utils/hydra_config.py
@@ -189,7 +189,7 @@ def resolve_linear_schedule(cfg, param_schedulers):

def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
"""
-Scale learning rate value for different Learning rate types. See assert_learning_rate()
+Scale learning rate value for different Learning rate types. See infer_learning_rate()
for how the scaled LR is calculated.

Values changed for learning rate schedules:
@@ -258,7 +258,7 @@ def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
return param_schedulers


-def assert_learning_rate(cfg):
+def infer_learning_rate(cfg):
"""
1) Assert the Learning rate here. LR is scaled as per https://arxiv.org/abs/1706.02677.
to turn this automatic scaling off,
@@ -325,7 +325,7 @@ def assert_learning_rate(cfg):
return cfg


-def assert_losses(cfg):
+def infer_losses_config(cfg):
"""
Infer settings for various self-supervised losses. Takes care of setting various loss
parameters correctly like world size, batch size per gpu, effective global batch size,
@@ -441,8 +441,8 @@ def assert_hydra_conf(cfg):
LABEL_TYPE to "standard" (also vissl default), otherwise if no label is specified, we
set the LABEL_TYPE to "sample_index".
"""
-cfg = assert_losses(cfg)
-cfg = assert_learning_rate(cfg)
+cfg = infer_losses_config(cfg)
+cfg = infer_learning_rate(cfg)

# in case of linear evaluation, we often evaluate several layers at a time. For each
# layer, there's a separate accuracy meter. In such case, we want to output the layer
@@ -499,3 +499,15 @@ def assert_hydra_conf(cfg):
url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
)
cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

+# if we use a zero optimizer, we nest the optimizer related settings under the
+# base_optimizer.
+if cfg.OPTIMIZER.use_zero:
+    cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
+    cfg.OPTIMIZER.name = "zero"
+    del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
+    del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
+    del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
+    del cfg.OPTIMIZER.base_optimizer["num_epochs"]
+    del cfg.OPTIMIZER.base_optimizer["use_zero"]
+    del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]
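To make the rewrite above concrete, here is a sketch of the resulting OPTIMIZER structure and of how a "zero" optimizer could be built from it with fairscale's OSS. The surviving keys, the hyperparameter values, and the builder function are illustrative assumptions; only the nesting under base_optimizer and the name being set to "zero" come from the code above.

```python
import torch
from fairscale.optim.oss import OSS

# Roughly what OPTIMIZER looks like after the block above (values illustrative):
optimizer_cfg = {
    "name": "zero",
    "use_zero": True,
    "num_epochs": 100,
    "base_optimizer": {
        "name": "sgd",       # the original optimizer settings, minus the deleted keys
        "use_larc": False,
        "momentum": 0.9,
        "weight_decay": 1e-4,
    },
}


def build_zero_optimizer(params, cfg=optimizer_cfg):
    base = cfg["base_optimizer"]
    assert base["name"] == "sgd", "this sketch only covers SGD"
    # OSS shards the wrapped optimizer's state across the data-parallel ranks.
    return OSS(
        params=params,
        optim=torch.optim.SGD,
        lr=0.1,  # in VISSL the LR comes from param_schedulers, which stay at the top level
        momentum=base["momentum"],
        weight_decay=base["weight_decay"],
    )
```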