
Remove the need to iterate the model parameters for freezing and speedup #176

Closed · wants to merge 3 commits
7 changes: 6 additions & 1 deletion vissl/config/defaults.yaml
@@ -258,6 +258,9 @@ config:
# how many times the model should be checkpointed. User should tune this parameter
# and find the number that offers best memory saving and compute tradeoff.
NUM_ACTIVATION_CHECKPOINTING_SPLITS: 2
+# setup for Fairscale sharded DDP
+SHARDED_DDP_SETUP:
+  reduce_buffer_size: -1
# ----------------------------------------------------------------------------------- #
# Feature evaluation settings
# ----------------------------------------------------------------------------------- #
@@ -537,7 +540,7 @@ config:
num_prototypes: [3000] # automatically inferred from model HEAD settings
temp_hard_assignment_iters: 0
# for dumping the debugging info in case loss becomes NaN
-output_dir: "" # automatically inferred and set to checkpoint dir
+output_dir: "." # automatically inferred and set to checkpoint dir
queue:
queue_length: 0 # automatically adjusted to ensure queue_length % global batch size = 0
start_iter: 0
@@ -593,6 +596,8 @@ config:
# ----------------------------------------------------------------------------------- #
OPTIMIZER:
name: "sgd"
+# whether to shard optimizer state as per ZeRO https://arxiv.org/abs/1910.02054
+use_zero: False
use_larc: False # supported for SGD only for now
larc_config:
clip: False
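For context, a minimal sketch of how the two new knobs above (OPTIMIZER.use_zero and SHARDED_DDP_SETUP.reduce_buffer_size) are typically consumed with fairscale. This is not VISSL's trainer code: the helper name, the SGD hyperparameters, and the treatment of -1 as "keep the library default" are assumptions made for illustration.

```python
import torch
from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
from fairscale.optim.oss import OSS


def shard_model_and_optimizer(model: torch.nn.Module, reduce_buffer_size: int = -1):
    """Illustrative ZeRO-style wrapping; not VISSL's actual code path."""
    # OSS wraps a regular optimizer class and partitions its state across the
    # data-parallel ranks (ZeRO, https://arxiv.org/abs/1910.02054).
    optimizer = OSS(
        params=model.parameters(),
        optim=torch.optim.SGD,  # the base optimizer whose state gets sharded
        lr=0.1,
        momentum=0.9,
    )
    # ShardedDataParallel takes the sharded optimizer so it can route each reduced
    # gradient to the rank that owns the corresponding shard. reduce_buffer_size
    # controls gradient bucketing; here -1 is treated as "use the library default",
    # mirroring the sentinel value in the YAML above (an assumption).
    kwargs = {} if reduce_buffer_size < 0 else {"reduce_buffer_size": reduce_buffer_size}
    model = ShardedDDP(model, optimizer, **kwargs)
    return model, optimizer
```

Actually running this requires an initialized torch.distributed process group, which VISSL sets up elsewhere in its training setup.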
12 changes: 12 additions & 0 deletions vissl/hooks/state_update_hooks.py
@@ -260,6 +260,18 @@ def on_backward(self, task: "tasks.ClassyTask") -> None:
map_params_to_iters = {}
for to_map in task.config.MODEL.TEMP_FROZEN_PARAMS_ITER_MAP:
map_params_to_iters[to_map[0]] = to_map[1]

+# get the maximum iterations until which the params are frozen.
+# if the iterations are past the maximum iterations freezing any
+# param, we simply return.
+max_iterations = max(list(map_params_to_iters.values()))
+if task.iteration >= max_iterations:
+    if task.iteration == max_iterations:
+        logging.info(
+            f"No parameters grad removed from now on: {task.iteration}"
+        )
+    return

for name, p in task.model.named_parameters():
if (
name in map_params_to_iters
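The body of the loop over named_parameters() is truncated in the diff above. As a rough standalone illustration of the freezing logic plus the new early exit, here is a sketch: the parameter names are made up, and reducing the grad removal to `p.grad = None` under an `iteration < freeze_iters` check is a guess at the general shape, not VISSL's exact hook code.

```python
import torch

# Same shape as MODEL.TEMP_FROZEN_PARAMS_ITER_MAP: [parameter name, number of
# iterations it stays frozen]. The names below are hypothetical.
TEMP_FROZEN_PARAMS_ITER_MAP = [
    ["trunk.layer1.weight", 1000],
    ["heads.prototypes.weight", 313],
]


def remove_grads_of_frozen_params(model: torch.nn.Module, iteration: int) -> None:
    map_params_to_iters = {name: iters for name, iters in TEMP_FROZEN_PARAMS_ITER_MAP}
    # Past the largest freeze horizon nothing can still be frozen, so we skip
    # the named_parameters() walk entirely -- this is the speedup in the PR.
    if iteration >= max(map_params_to_iters.values()):
        return
    for name, p in model.named_parameters():
        if name in map_params_to_iters and iteration < map_params_to_iters[name]:
            # Dropping the gradient keeps the optimizer from updating the param.
            p.grad = None
```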
2 changes: 1 addition & 1 deletion vissl/utils/checkpoint.py
@@ -45,7 +45,7 @@ def get_checkpoint_folder(config: AttrDict):
makedir(odir)
assert PathManager.exists(
config.CHECKPOINT.DIR
-), "Please specify config.CHECKPOINT.DIR parameter. It should not be None."
+), f"Please specify config.CHECKPOINT.DIR parameter. Invalid: {config.CHECKPOINT.DIR}"
return odir


22 changes: 17 additions & 5 deletions vissl/utils/hydra_config.py
@@ -189,7 +189,7 @@ def resolve_linear_schedule(cfg, param_schedulers):

def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
"""
-Scale learning rate value for different Learning rate types. See assert_learning_rate()
+Scale learning rate value for different Learning rate types. See infer_learning_rate()
for how the scaled LR is calculated.

Values changed for learning rate schedules:
@@ -258,7 +258,7 @@ def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
return param_schedulers


-def assert_learning_rate(cfg):
+def infer_learning_rate(cfg):
"""
1) Assert the Learning rate here. LR is scaled as per https://arxiv.org/abs/1706.02677.
to turn this automatic scaling off,
@@ -325,7 +325,7 @@ def assert_learning_rate(cfg):
return cfg


-def assert_losses(cfg):
+def infer_losses_config(cfg):
"""
Infer settings for various self-supervised losses. Takes care of setting various loss
parameters correctly like world size, batch size per gpu, effective global batch size,
@@ -441,8 +441,8 @@ def assert_hydra_conf(cfg):
LABEL_TYPE to "standard" (also vissl default), otherwise if no label is specified, we
set the LABEL_TYPE to "sample_index".
"""
-cfg = assert_losses(cfg)
-cfg = assert_learning_rate(cfg)
+cfg = infer_losses_config(cfg)
+cfg = infer_learning_rate(cfg)

# in case of linear evaluation, we often evaluate several layers at a time. For each
# layer, there's a separate accuracy meter. In such case, we want to output the layer
@@ -499,3 +499,15 @@ def assert_hydra_conf(cfg):
url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
)
cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

+# if we use a zero optimizer, we nest the optimizer related settings under the
+# base_optimizer.
+if cfg.OPTIMIZER.use_zero:
+    cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
+    cfg.OPTIMIZER.name = "zero"
+    del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
+    del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
+    del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
+    del cfg.OPTIMIZER.base_optimizer["num_epochs"]
+    del cfg.OPTIMIZER.base_optimizer["use_zero"]
+    del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]
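To make the rewrite above concrete, here is a sketch of the resulting OPTIMIZER structure and of how a "zero" optimizer could be built from it with fairscale's OSS. The surviving keys, the hyperparameter values, and the builder function are illustrative assumptions; only the nesting under base_optimizer and the name being set to "zero" come from the code above.

```python
import torch
from fairscale.optim.oss import OSS

# Roughly what OPTIMIZER looks like after the block above (values illustrative):
optimizer_cfg = {
    "name": "zero",
    "use_zero": True,
    "num_epochs": 100,
    "base_optimizer": {
        "name": "sgd",       # the original optimizer settings, minus the deleted keys
        "use_larc": False,
        "momentum": 0.9,
        "weight_decay": 1e-4,
    },
}


def build_zero_optimizer(params, cfg=optimizer_cfg):
    base = cfg["base_optimizer"]
    assert base["name"] == "sgd", "this sketch only covers SGD"
    # OSS shards the wrapped optimizer's state across the data-parallel ranks.
    return OSS(
        params=params,
        optim=torch.optim.SGD,
        lr=0.1,  # in VISSL the LR comes from param_schedulers, which stay at the top level
        momentum=base["momentum"],
        weight_decay=base["weight_decay"],
    )
```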