# Resume or fork a training run from a checkpoint: last.ckpt, or the file specified in hardware.files.warm_start
run_id: null
fork_run_id: null
transfer_learning: False # activate to perform transfer learning
load_weights_only: False # only load the model weights; do not restore optimiser state etc.

# run in deterministic mode; slows down training
deterministic: False

# miscellaneous
precision: 16-mixed

# multistep input
# k = 1: single-step scheme, X(t-1) is used to predict X(t)
# k > 1: multistep scheme, uses [X(t-k), X(t-k+1), ..., X(t-1)] to predict X(t)
# DeepMind use k = 2 in their model
multistep_input: 2

# gradient accumulation across k batches, k >= 1 (k == 1 means no accumulation)
# the effective batch size becomes num_devices * batch_size * k
accum_grad_batches: 1
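# Worked example (illustrative numbers, not the defaults here): with 4 devices,
# a per-device dataloader batch size of 2 and accum_grad_batches: 4, gradients
# are accumulated over 4 batches before each optimiser step, giving an
# effective batch size of 4 * 2 * 4 = 32.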

# number of validation batches to run as a sanity check before training starts
num_sanity_val_steps: 6

# clip gradients; 0: don't clip, default algorithm: norm, alternative: value
gradient_clip:
  val: 32.
  algorithm: value
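# Illustrative alternative (not the setting used above): clip by the global
# gradient norm instead of clipping each gradient value element-wise:
# gradient_clip:
#   val: 32.
#   algorithm: norm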

# stochastic weight averaging
# https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
swa:
  enabled: False
  lr: 1.e-4

# Optimizer settings
optimizer:
  zero: False # use ZeroRedundancyOptimizer; saves memory for larger models
  kwargs:
    betas: [0.9, 0.95] # beta coefficients for the Adam-family optimizer
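# A sketch of further tuning (assumption: entries under kwargs are forwarded to
# the underlying torch Adam-family optimizer, so its standard arguments apply;
# the weight_decay value below is purely illustrative):
# optimizer:
#   zero: True # shard optimizer state across data-parallel ranks to save memory
#   kwargs:
#     betas: [0.9, 0.95]
#     weight_decay: 0.1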

# select model
model_task: anemoi.training.train.forecaster.GraphForecasterMultiDataset

# select strategy
strategy:
  _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
  num_gpus_per_model: ${hardware.num_gpus_per_model}
  read_group_size: ${dataloader.read_group_size}
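# Illustrative reading of these settings (hypothetical GPU counts): with
# num_gpus_per_model: 4 each model instance is sharded across 4 GPUs, so a job
# on 16 GPUs runs 4 data-parallel replicas; read_group_size (taken from the
# dataloader config) presumably sets how many of the ranks sharing a model
# also share the reading of each input sample.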

# loss functions

# dynamic rescaling of the loss gradient
# see https://arxiv.org/pdf/2306.06079.pdf, section 4.3.2
# not enabled by default; only enable once it has been tested and shown to be beneficial
loss_gradient_scaling: False

# length of the "rollout" window (see Keisler's paper)
rollout:
  start: 1
  # increase rollout every n epochs
  epoch_increment: 0
  # maximum rollout to use
  max: 1
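# Illustrative schedule (hypothetical values, not the defaults above): start: 1,
# epoch_increment: 1, max: 12 would extend the rollout by one step each epoch
# until it reaches 12 steps; with epoch_increment: 0 the rollout presumably
# stays at start for the whole training.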

# Set max_epochs or max_steps. Training stops at the first limit reached.
max_epochs: null
max_steps: 150000

lr:
  warmup: 1000 # number of warmup iterations
  rate: 0.625e-4 # local learning rate
  iterations: ${training.max_steps} # NOTE: the schedule is defined over max_steps, even if max_epochs ends training earlier
  min: 3e-7 # not scaled by the number of GPUs
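# Worked example (assumption: the configured rate is a per-GPU "local" value
# scaled linearly with the number of data-parallel GPUs, while min is not
# rescaled): on 16 GPUs the effective peak rate would be 16 * 0.625e-4 = 1e-3,
# reached after the 1000 warmup iterations and then decayed towards min over
# the configured number of iterations.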

output:
  era5: # dataset key; makes it easier to reuse the same name in different places of the config
    training_loss:
      _target_: anemoi.training.losses.mse.WeightedMSELoss
      scalars: ['variable', 'loss_weights_mask']
      ignore_nans: True
    output_mask: null
    variable_loss_scaling:
      default: 1
      pl: # pressure-level variables
        q: 0.6
        t: 6
        u: 0.8
        v: 0.5
        w: 0.001
        z: 12
      sfc: # surface variables
        sp: 10
        10u: 0.1
        10v: 0.1
        2d: 0.5
        tp: 0.025
        cp: 0.0025
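    # Reading these weights (interpretation, assuming default applies to any
    # variable not listed): relative to default: 1, errors in z and t are
    # up-weighted (12 and 6) while w, tp and cp contribute very little
    # (0.001, 0.025 and 0.0025) to the training loss.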

    pressure_level_scaler:
      _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
      minimum: 0.2
      slope: 0.001
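    # Worked example (assumption about ReluPressureLevelScaler: each pressure
    # level gets a weight of roughly max(minimum, slope * level_in_hPa)): at
    # 850 hPa that is max(0.2, 0.85) = 0.85, at 50 hPa it is max(0.2, 0.05) = 0.2,
    # so upper-level fields are down-weighted; this factor presumably multiplies
    # the per-variable scaling above for pressure-level variables.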

    node_loss_weights:
      _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
      target_nodes: ${graph.data}
      node_attribute: area_weight # weight each output grid point by the area it represents

    # additional metrics computed on the validation set
    validation_metrics:
      - _target_: anemoi.training.losses.mse.WeightedMSELoss
        scalars: []
        ignore_nans: True
    # apply the listed scalars to the named validation metrics ('all' applies them to every metric)
    scale_validation_metrics:
      scalars_to_apply: ['variable']
      metrics:
        - 'all'
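    # Illustrative extension (not part of the defaults): validation_metrics is a
    # list, so further metrics can be appended and each entry is instantiated
    # from its _target_, e.g. a second WeightedMSELoss that keeps the
    # per-variable scaling:
    # validation_metrics:
    #   - _target_: anemoi.training.losses.mse.WeightedMSELoss
    #     scalars: []
    #     ignore_nans: True
    #   - _target_: anemoi.training.losses.mse.WeightedMSELoss
    #     scalars: ['variable']
    #     ignore_nans: True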

    # per-variable validation metrics to monitor individually, named <variable>_<pressure level>
    # (e.g. z_500 is geopotential at 500 hPa, t_850 is temperature at 850 hPa)
    metrics:
      - z_500
      - t_850
      - u_850
      - v_850

submodules_to_freeze: []
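# Illustrative (the module name below is a placeholder that depends on the
# model being trained): to fine-tune only part of a pre-trained network, list
# the submodules whose parameters should be frozen, e.g.
# submodules_to_freeze: [processor]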