Commit 023fb21

WIP loss function / forecaster multiple datasets

1 parent 42213b6 commit 023fb21

6 files changed: +933 -2 lines changed

models/src/anemoi/models/models/encoder_processor_decoder.py (+16)

@@ -79,6 +79,22 @@ def __init__(
         skip_variables = [var for var in self.input_variables if var in self.output_variables]
         self._internal_input_idx = [self.input_variables.index(var) for var in skip_variables]
         self._internal_output_idx = [self.output_variables.index(var) for var in skip_variables]
+
+        # Fake data_indices to be used in Forecaster
+        self.data_indices = DotDict(
+            {
+                "internal_model": {
+                    "input": {
+                        "full": self.input_variables,
+                        "name_to_index": self.name_to_index_input,
+                    },
+                    "output": {
+                        "full": self.output_variables,
+                        "name_to_index": self.name_to_index_output,
+                    }
+                }
+            }
+        )
         #-------------------------------------------------------------------------#

         self.data_indices = data_indices
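The fake data_indices only needs to mimic the attribute-style nested structure a forecaster reads (for example internal_model.input.name_to_index). Below is a minimal standalone sketch of that structure, assuming DotDict comes from anemoi.utils.config and converts nested mappings recursively; the variable lists are placeholders, not values from this commit:

from anemoi.utils.config import DotDict  # assumed import path, as used elsewhere in anemoi

input_variables = ["10u", "10v", "2t"]   # placeholder names
output_variables = ["2t"]

data_indices = DotDict(
    {
        "internal_model": {
            "input": {
                "full": input_variables,
                "name_to_index": {v: i for i, v in enumerate(input_variables)},
            },
            "output": {
                "full": output_variables,
                "name_to_index": {v: i for i, v in enumerate(output_variables)},
            },
        }
    }
)

# Attribute-style access, as a forecaster could use it (assumption about DotDict's recursion):
idx = data_indices.internal_model.output.name_to_index["2t"]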

training/src/anemoi/training/config/model/refactored_graphtransformer.yaml (+1 -1)

@@ -1,7 +1,7 @@
 activation: GELU
 num_channels: 1024
 cpu_offload: False
-output_mask: null
+#output_mask: null # moved this to config.training to have one mask per output

 model:
   _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
New file (+129), training configuration with per-output loss settings:

@@ -0,0 +1,129 @@
+# resume or fork a training from a checkpoint last.ckpt or specified in hardware.files.warm_start
+run_id: null
+fork_run_id: null
+transfer_learning: False # activate to perform transfer learning
+load_weights_only: False # only load model weights, do not restore optimiser states etc.
+
+# run in deterministic mode ; slows down
+deterministic: False
+
+# miscellaneous
+precision: 16-mixed
+
+# multistep input
+# 1 = single step scheme, X(t-1) used to predict X(t)
+# k > 1: multistep scheme, uses [X(t-k), X(t-k+1), ... X(t-1)] to predict X(t)
+# DeepMind uses k = 2 in their model
+multistep_input: 2
+
+# gradient accumulation across K batches, K >= 1 (if K == 1 then no accumulation)
+# the effective batch size becomes num-devices * batch_size * k
+accum_grad_batches: 1
+
+num_sanity_val_steps: 6
+
+# clip gradients, 0 : don't clip, default algorithm: norm, alternative: value
+gradient_clip:
+  val: 32.
+  algorithm: value
+
+# stochastic weight averaging
+# https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
+swa:
+  enabled: False
+  lr: 1.e-4
+
+# Optimizer settings
+optimizer:
+  zero: False # use ZeroRedundancyOptimizer ; saves memory for larger models
+  kwargs:
+    betas: [0.9, 0.95]
+
+# select model
+model_task: anemoi.training.train.forecaster.GraphForecasterMultiDataset
+
+# select strategy
+strategy:
+  _target_: anemoi.training.distributed.strategy.DDPGroupStrategy
+  num_gpus_per_model: ${hardware.num_gpus_per_model}
+  read_group_size: ${dataloader.read_group_size}
+
+# loss functions
+
+# dynamic rescaling of the loss gradient
+# see https://arxiv.org/pdf/2306.06079.pdf, section 4.3.2
+# don't enable this by default until it's been tested and proven beneficial
+loss_gradient_scaling: False
+
+# length of the "rollout" window (see Keisler's paper)
+rollout:
+  start: 1
+  # increase rollout every n epochs
+  epoch_increment: 0
+  # maximum rollout to use
+  max: 1
+
+# Set max_epochs or max_steps. Training stops at the first limit reached.
+max_epochs: null
+max_steps: 150000
+
+lr:
+  warmup: 1000 # number of warmup iterations
+  rate: 0.625e-4 # local_lr
+  iterations: ${training.max_steps} # NOTE: When max_epochs < max_steps, scheduler will run for max_steps
+  min: 3e-7 # not scaled by #GPU
+
+output:
+  era5: # Make it easier to set the same key name in different places of the config
+    training_loss:
+      _target_: anemoi.training.losses.mse.WeightedMSELoss
+      scalars: ['variable', 'loss_weights_mask']
+      ignore_nans: True
+    output_mask: null
+    variable_loss_scaling:
+      default: 1
+      pl:
+        q: 0.6
+        t: 6
+        u: 0.8
+        v: 0.5
+        w: 0.001
+        z: 12
+      sfc:
+        sp: 10
+        10u: 0.1
+        10v: 0.1
+        2d: 0.5
+        tp: 0.025
+        cp: 0.0025
+
+    pressure_level_scaler:
+      _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
+      minimum: 0.2
+      slope: 0.001
+
+    node_loss_weights:
+      _target_: anemoi.training.losses.nodeweights.GraphNodeAttribute
+      target_nodes: ${graph.data}
+      node_attribute: area_weight
+
+    validation_metrics:
+      - _target_: anemoi.training.losses.mse.WeightedMSELoss
+        scalars: []
+        ignore_nans: True
+    scale_validation_metrics:
+      scalars_to_apply: ['variable']
+      metrics:
+        - 'all'
+
+    metrics:
+      - z_500
+      - t_850
+      - u_850
+      - v_850
+
+submodules_to_freeze: []
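The per-output layout under output: pairs with the DictLoss wrapper added below: one loss per output key. The sketch that follows shows one way the training_loss nodes could be instantiated into such a dictionary; it assumes hydra's instantiate is available, build_training_losses is a hypothetical helper, and any extra constructor arguments the concrete loss needs (node weights, scalers) are left out:

import torch.nn as nn
from hydra.utils import instantiate  # hydra-core; assumed to be available in the training environment

def build_training_losses(output_cfg) -> nn.ModuleDict:
    # Hypothetical glue code: one loss module per output key ("era5", ...),
    # each instantiated from its training_loss node in the config above.
    return nn.ModuleDict(
        {name: instantiate(cfg["training_loss"]) for name, cfg in output_cfg.items()}
    )

# e.g. dict_loss = DictLoss(build_training_losses(config.training.output))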
New file (+34), DictLoss wrapper:

@@ -0,0 +1,34 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import torch.nn as nn
+from torch import Tensor
+
+class DictLoss(nn.Module):
+    """Wrapper for a dictionary of loss functions that operate on different outputs."""
+
+    def __init__(
+        self,
+        loss_dict: nn.ModuleDict,
+    ) -> None:
+        super().__init__()
+        self.loss_dict = loss_dict
+        self.outputs = list(loss_dict.keys())
+
+    def forward(
+        self,
+        pred: dict[str, Tensor],
+        target: dict[str, Tensor],
+        squash: bool = True,  # TODO Generalise this per output?
+    ) -> dict[str, Tensor]:
+        out = {}
+        for output, loss in self.loss_dict.items():
+            out[output] = loss(pred[output], target[output], squash)
+
+        return out
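A self-contained usage sketch of the class above: ToyLoss is a stand-in with the (pred, target, squash) call signature that DictLoss.forward expects, and "regional" is a made-up second output key; the real anemoi losses (e.g. WeightedMSELoss) take additional constructor arguments not shown here.

import torch
import torch.nn as nn

class ToyLoss(nn.Module):
    # Placeholder loss with the three-argument call used by DictLoss.forward.
    def forward(self, pred: torch.Tensor, target: torch.Tensor, squash: bool = True) -> torch.Tensor:
        err = (pred - target) ** 2
        return err.mean() if squash else err

losses = nn.ModuleDict({"era5": ToyLoss(), "regional": ToyLoss()})
dict_loss = DictLoss(losses)

pred = {"era5": torch.randn(2, 4), "regional": torch.randn(2, 3)}
target = {"era5": torch.randn(2, 4), "regional": torch.randn(2, 3)}
out = dict_loss(pred, target)  # {"era5": scalar tensor, "regional": scalar tensor}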

training/src/anemoi/training/train/forecaster/__init__.py (+2 -1)

@@ -10,5 +10,6 @@
 from .ensforecaster import GraphEnsForecaster
 from .forecaster import GraphForecaster
 from .interpolator import GraphInterpolator
+from .forecaster_multiple_datasets import GraphForecasterMultiDataset

-__all__ = ["GraphEnsForecaster", "GraphForecaster", "GraphInterpolator"]
+__all__ = ["GraphEnsForecaster", "GraphForecaster", "GraphInterpolator", "GraphForecasterMultiDataset"]
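Re-exporting GraphForecasterMultiDataset from the package __init__ is what lets the model_task string in the new training config resolve to the class. A quick check of that path, assuming a hydra-style resolver such as get_class (the exact mechanism anemoi uses is not shown in this commit):

from hydra.utils import get_class  # assumed resolver, not necessarily what anemoi calls internally

forecaster_cls = get_class("anemoi.training.train.forecaster.GraphForecasterMultiDataset")
assert forecaster_cls.__name__ == "GraphForecasterMultiDataset"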
