Add training code

liaoxin2 · liaoxin2 · commit 76f85cf5b020 · 2025-05-19T15:27:53.000+08:00
diff --git a/docs/zh/examples/gencast.md b/docs/zh/examples/gencast.md
@@ -5,6 +5,16 @@
 - 下载目录`dm_graphcast/gencast/stats`下的所有文件放入`./data/stats/`目录下。
 - 下载目录`dm_graphcast/gencast/dataset`下的任意或所有文件（例如：source-era5_date-2019-03-29_res-1.0_levels-13_steps-12.nc）放入`./data/dataset/`目录下。
 
+=== "模型训练命令"
+
+    ``` sh
+    # 设置路径到 PaddleScience/jointContribution 文件夹
+    cd PaddleScience/jointContribution
+    export PYTHONPATH=$PWD:$PYTHONPATH
+    # 进入 gencast 目录并运行训练脚本
+    cd gencast/ && python run_gencast.py mode=train
+    ```
+
 === "模型评估命令"
 
     ``` sh
@@ -15,7 +25,7 @@
     cd gencast/
     wget -nc https://paddle-org.bj.bcebos.com/paddlescience/models/gencast/gencast_params_GenCast-1p0deg-Mini-_2019.pdparams -P ./data/params/
     # 运行评估脚本
-    python run_gencast.py
+    python run_gencast.py mode=eval
     ```
 
 ## 1. 背景简介
diff --git a/jointContribution/gencast/conf/gencast.yaml b/jointContribution/gencast/conf/gencast.yaml
@@ -28,6 +28,13 @@ mean_path: data/stats/gencast_stats_mean_by_level.nc
 min_path: data/stats/gencast_stats_min_by_level.nc
 param_path: data/params/gencast_params_GenCast-1p0deg-Mini-_2019.pdparams
 
+train:
+  learning_rate: 0.001
+  weight_decay: 0.1
+  num_epochs: 2000000
+  batch_size: 1
+  snapshot_freq: 10
+
 sampler_config:
   max_noise_level: 80.0
   min_noise_level: 0.03
@@ -63,8 +70,9 @@ denoiser_architecture_config:
     block_q_dkv: 512
     block_kv_dkv: 1024
     block_kv_dkv_compute: 1024
-    ffw_winit_final_mult: 0.0
-    attn_winit_final_mult: 0.0
+    ffw_winit_final_mult: 1.0
+    attn_winit_final_mult: 1.0
+    attn_winit_mult: 2.0
     ffw_hidden: 2048
     mesh_node_dim: 186
     mesh_node_emb_dim: 512
diff --git a/jointContribution/gencast/denoiser.py b/jointContribution/gencast/denoiser.py
@@ -14,6 +14,7 @@
 """Support for wrapping a general Predictor to act as a Denoiser."""
 
 import copy
+import math
 import os
 import pickle
 from typing import Optional
@@ -66,12 +67,16 @@ def __init__(
         self._num_frequencies = num_frequencies
         self._apply_log_first = apply_log_first
 
-        # 创建 MLP
+        # Creating MLP
         layers = []
         input_size = 2 * num_frequencies
         num_layers = len(output_sizes)
         for i, output_size in enumerate(output_sizes):
-            linear_layer = nn.Linear(input_size, output_size)
+            limit = math.sqrt(6 / input_size)
+            weight_attr = paddle.framework.ParamAttr(
+                initializer=paddle.nn.initializer.Uniform(low=-limit, high=limit)
+            )
+            linear_layer = nn.Linear(input_size, output_size, weight_attr=weight_attr)
             layers.append(linear_layer)
             if i < num_layers - 1:
                 layers.append(activation)
@@ -168,4 +173,12 @@ def forward(
             grid_node_outputs, noisy_targets
         )
 
-        return raw_predictions
+        resolution = self.cfg.denoiser_architecture_config.resolution
+        grid_lat = np.arange(-90.0, 90.0 + resolution, resolution).astype(np.float32)
+        grid_lon = np.arange(0.0, 360.0, resolution).astype(np.float32)
+        grid_shape = [grid_lat.shape[0], grid_lon.shape[0]]
+        grid_outputs_lat_lon_leading = grid_node_outputs.reshape(
+            grid_shape + grid_node_outputs.shape[1:]
+        )
+
+        return raw_predictions, grid_outputs_lat_lon_leading
diff --git a/jointContribution/gencast/dpm_solver_plus_plus_2s.py b/jointContribution/gencast/dpm_solver_plus_plus_2s.py
@@ -173,7 +173,7 @@ def init_noise(template):
             mid_over_current = mid_noise_level / noise_level
             # x = xr.open_dataset('/workspace/workspace/graphcast/x.nc')
 
-            x_denoised = denoiser(noise_level, x)
+            x_denoised, _ = denoiser(noise_level, x)
             # This turns out to be a convex combination of current and denoised x,
             # which isn't entirely apparent from the paper formulae:
             x_mid = (
@@ -182,7 +182,7 @@ def init_noise(template):
             )
 
             next_over_current = next_noise_level / noise_level
-            x_mid_denoised = denoiser(mid_noise_level, x_mid)
+            x_mid_denoised, _ = denoiser(mid_noise_level, x_mid)
             x_next = (
                 next_over_current.numpy() * x
                 + (1 - next_over_current.numpy()) * x_mid_denoised
diff --git a/jointContribution/gencast/gencast.py b/jointContribution/gencast/gencast.py
@@ -24,8 +24,13 @@
 
 import denoiser
 import dpm_solver_plus_plus_2s
+import losses
+import numpy as np
+import paddle
 import paddle.nn as nn
+import samplers_utils
 import xarray as xr
+from graphcast import datasets
 
 
 class GenCast(nn.Layer):
@@ -54,6 +59,7 @@ def __init__(
         self._sampler_config = cfg.sampler_config
         self._sampler = None
         self._noise_config = cfg.noise_config
+        self.cfg = cfg
 
     def _c_in(self, noise_scale: xr.DataArray) -> xr.DataArray:
         """Scaling applied to the noisy targets input to the underlying network."""
@@ -81,22 +87,95 @@ def _preconditioned_denoiser(
     ) -> xr.Dataset:
         """The preconditioned denoising function D from the paper (Eqn 7)."""
         # Convert xarray DataArray to Paddle tensor for operations
-        raw_predictions = self._denoiser(
+        raw_predictions, grid_node_outputs = self._denoiser(
             inputs=inputs,
             noisy_targets=noisy_targets * self._c_in(noise_levels),
             noise_levels=noise_levels,
             forcings=forcings,
             **kwargs
         )
 
-        return raw_predictions * self._c_out(
-            noise_levels
-        ) + noisy_targets * self._c_skip(noise_levels)
+        stacked_noisy_targets = datasets.dataset_to_stacked(noisy_targets)
+        stacked_noisy_targets = stacked_noisy_targets.transpose("lat", "lon", ...)
+
+        out = grid_node_outputs * paddle.to_tensor(self._c_out(noise_levels).data)
+        skip = paddle.to_tensor(
+            stacked_noisy_targets.data * self._c_skip(noise_levels).data
+        )
+        grid_node_outputs = out + skip
+
+        return (
+            raw_predictions * self._c_out(noise_levels)
+            + noisy_targets * self._c_skip(noise_levels),
+            grid_node_outputs,
+        )
+
+    def loss(
+        self,
+        inputs: xr.Dataset,
+        targets: xr.Dataset,
+        forcings: Optional[xr.Dataset] = None,
+    ):
+        """Return the weighted-MSE loss, scaled per noise level, and diagnostics."""
+        if self._noise_config is None:
+            raise ValueError("Noise config must be specified to train GenCast.")
+
+        grid_node_outputs, denoised_predictions, noise_levels = self.forward(
+            inputs, targets, forcings
+        )
+
+        loss, diagnostics = losses.weighted_mse_loss_from_xarray(
+            grid_node_outputs,
+            targets,
+            # Weights are same as we used for GraphCast.
+            per_variable_weights={
+                # Any variables not specified here are weighted as 1.0.
+                # A single-level variable, but an important headline variable
+                # and also one which we have struggled to get good performance
+                # on at short lead times, so leaving it weighted at 1.0, equal
+                # to the multi-level variables:
+                "2m_temperature": 1.0,
+                # New single-level variables, which we don't weight too highly
+                # to avoid hurting performance on other variables.
+                "10m_u_component_of_wind": 0.1,
+                "10m_v_component_of_wind": 0.1,
+                "mean_sea_level_pressure": 0.1,
+                "sea_surface_temperature": 0.1,
+                "total_precipitation_12hr": 0.1,
+            },
+        )
+        loss *= paddle.to_tensor(self._loss_weighting(noise_levels).data)
+        return loss, diagnostics
 
     def forward(self, inputs, targets_template, forcings=None, **kwargs):
+        if self.cfg.mode == "eval":
+            if self._sampler is None:
+                self._sampler = dpm_solver_plus_plus_2s.Sampler(
+                    self._preconditioned_denoiser, **self._sampler_config
+                )
+            return self._sampler(inputs, targets_template, forcings, **kwargs)
+        if self.cfg.mode == "train":
+            # Sample noise levels:
+            batch_size = inputs.sizes["batch"]
+            noise_levels = xr.DataArray(
+                data=samplers_utils.rho_inverse_cdf(
+                    min_value=self._noise_config.training_min_noise_level,
+                    max_value=self._noise_config.training_max_noise_level,
+                    rho=self._noise_config.training_noise_level_rho,
+                    cdf=np.random.uniform(size=(batch_size,)).astype("float32"),
+                ),
+                dims=("batch",),
+            )
+
+            # Sample noise and apply it to targets:
+            noise = (
+                samplers_utils.spherical_white_noise_like(targets_template)
+                * noise_levels
+            )
+
+            noisy_targets = targets_template + noise
 
-        if self._sampler is None:
-            self._sampler = dpm_solver_plus_plus_2s.Sampler(
-                self._preconditioned_denoiser, **self._sampler_config
+            denoised_predictions, grid_node_outputs = self._preconditioned_denoiser(
+                inputs, noisy_targets, noise_levels, forcings
             )
-        return self._sampler(inputs, targets_template, forcings, **kwargs)
+            return grid_node_outputs, denoised_predictions, noise_levels
diff --git a/jointContribution/gencast/run_gencast.py b/jointContribution/gencast/run_gencast.py
@@ -26,20 +26,88 @@
 from omegaconf import DictConfig
 
 
-def crps(targets, predictions, bias_corrected=True):
-    if predictions.sizes.get("sample", 1) < 2:
-        raise ValueError("predictions must have dim 'sample' with size at least 2.")
-    sum_dims = ["sample", "sample2"]
-    preds2 = predictions.rename({"sample": "sample2"})
-    num_samps = predictions.sizes["sample"]
-    num_samps2 = (num_samps - 1) if bias_corrected else num_samps
-    mean_abs_diff = np.abs(predictions - preds2).sum(dim=sum_dims, skipna=False) / (
-        num_samps * num_samps2
+class CustomDataLoader(paddle.io.Dataset):
+    def __init__(self, target_lead_times, cfg):
+        super(CustomDataLoader, self).__init__()
+
+        self.target_lead_times = target_lead_times
+        self.cfg = cfg
+
+    def __len__(self):
+        # One sample per entry in target_lead_times.
+        return len(self.target_lead_times)
+
+    def __getitem__(self, index):
+        # Lead-time step index for this sample.
+        time_step = self.target_lead_times[index]
+
+        # Scale the step index by 12 (presumably hours per step -- TODO confirm).
+        a = time_step * 12
+
+        # Render it in the "<a>h" string form, e.g. 3 -> "36h".
+        ah_str = f"{a}h"
+
+        # NOTE: self.cfg is the caller's cfg object, so this write is visible to it.
+        self.cfg["target_lead_times"] = ah_str
+
+        # Build the sample for this lead time from the updated config.
+        # NOTE(review): ERA5Data is defined outside this patch; semantics assumed.
+        data = datasets.ERA5Data(config=self.cfg)
+
+        return data
+
+
+def train(cfg: DictConfig):
+    # Initialize the GenCast model with the given configuration.
+    model = gencast.GenCast(cfg)
+    model.train()
+
+    # set optimizer
+    optimizer = paddle.optimizer.AdamW(
+        parameters=model.parameters(),
+        learning_rate=cfg.train.learning_rate,
+        weight_decay=cfg.train.weight_decay,
     )
-    mean_abs_err = (
-        np.abs(targets - predictions).sum(dim="sample", skipna=False) / num_samps
+    # Load the dataset using the given configuration.
+    nc_dataset = xarray.open_dataset(cfg.data_path)
+    time_total = len(nc_dataset.time.data)
+    train_loader = CustomDataLoader(
+        target_lead_times=list(range(1, time_total - 1)),
+        cfg=cfg,
     )
-    return mean_abs_err - 0.5 * mean_abs_diff
+
+    best_loss = float("inf")
+    for epoch in range(cfg.train.num_epochs):
+        epoch_loss = 0
+        for dataset in train_loader:
+            # Forward pass and compute loss
+            loss, diagnostics = model.loss(
+                dataset.inputs_template,
+                dataset.targets_template,
+                dataset.forcings_template,
+            )
+            # Backward pass and optimization
+            loss.backward()
+            optimizer.step()
+            optimizer.clear_grad()
+
+            epoch_loss += loss.item()
+
+        # Average loss for the epoch
+        epoch_loss /= len(train_loader)
+        logging.info(f"Epoch {epoch}: Loss = {epoch_loss:.6f}")
+        if epoch % cfg.train.snapshot_freq == 0 or epoch == 1:
+            model_save_path = os.path.join(
+                cfg.output_dir, f"last_model_epoch_{epoch}.pdparams"
+            )
+            paddle.save(model.state_dict(), model_save_path)
+
+        # Save model if it has the best loss
+        if epoch_loss < best_loss:
+            best_loss = epoch_loss
+            model_save_path = os.path.join(cfg.output_dir, "best_model_epoch.pdparams")
+            paddle.save(model.state_dict(), model_save_path)
+            logging.info(f"Best model saved at epoch {epoch} with loss {best_loss:.6f}")
 
 
 def eval(cfg: DictConfig):
@@ -113,6 +181,11 @@ def eval(cfg: DictConfig):
 def main(cfg: DictConfig):
+    """Dispatch to eval() or train() according to cfg.mode."""
     if cfg.mode == "eval":
         eval(cfg)
+    elif cfg.mode == "train":
+        train(cfg)
     else:
-        raise ValueError(f"cfg.mode should in ['eval'], but got '{cfg.mode}'")
+        raise ValueError(
+            f"cfg.mode should be in ['eval', 'train'], but got '{cfg.mode}'"
+        )
 
diff --git a/jointContribution/gencast/sparse_transformer.py b/jointContribution/gencast/sparse_transformer.py
diff --git a/jointContribution/graphcast/datasets.py b/jointContribution/graphcast/datasets.py
diff --git a/jointContribution/graphcast/vis.py b/jointContribution/graphcast/vis.py