
Logging in JSON format #68


Merged: 7 commits, Mar 17, 2025
Changes from 4 commits
32 changes: 19 additions & 13 deletions pyproject.toml
@@ -9,18 +9,21 @@ authors = [

requires-python = ">=3.11,<3.13"
# TODO: split the plotting dependencies into their own dep groups, they are not required.
dependencies = [ 'torch',
'numpy',
'astropy_healpix',
'zarr',
'anemoi-datasets',
'pandas',
'pynvml',
'tqdm',
'matplotlib',
'packaging',
'wheel',
'psutil']
dependencies = [
'torch',
'numpy',
'astropy_healpix',
'zarr',
'anemoi-datasets',
'pandas',
'pynvml',
'tqdm',
'matplotlib',
'packaging',
'wheel',
'psutil',
"flash-attn",
]
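The TODO above about splitting off the plotting dependencies could be addressed with an optional-dependencies group, for instance (a sketch; the group name `plot` and the exact split are hypothetical):

```toml
# Hypothetical split: move the plotting stack out of the core dependencies
# so that training-only installs stay lean.
[project.optional-dependencies]
plot = ['matplotlib']
```

Users who need plotting would then opt in explicitly, e.g. `uv sync --extra plot`.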

[project.urls]
Homepage = "https://www.weathergenerator.eu"
@@ -91,4 +94,7 @@ ignore = [
"F811",
# To ignore, not relevant for us
"E741",
]
]

[tool.uv.sources]
flash-attn = { url = "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl" }
Collaborator (Author):
I had to make this change to use uv on the hpc2020 cluster. I am not sure if this is going to be a breaking change for people. @clessig , do we assume that different HPCs can use different versions of CUDA? That sounds like a nightmare.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not assume it, we know it ;) One can write a script that detects the available CUDA version (and the Python version, if that varies) and then assembles the string that defines the wheel to be downloaded. @tjhunter: to what extent could one integrate this into pyproject.toml?
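Such a script could assemble the URL from the local environment along these lines (a sketch; the wheel-naming scheme is taken from the v2.7.4.post1 release asset pinned in the diff above, and the `cxx11abiFALSE` flag is assumed fixed):

```python
import sys


def flash_attn_wheel_url(cuda: str, torch_ver: str, version: str = "2.7.4.post1") -> str:
    """Assemble a flash-attn wheel URL for the local CUDA/torch/Python combination.

    Follows the flash-attention release asset naming, e.g.
    flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl
    """
    py = f"cp{sys.version_info.major}{sys.version_info.minor}"
    wheel = (
        f"flash_attn-{version}+cu{cuda}torch{torch_ver}"
        f"cxx11abiFALSE-{py}-{py}-linux_x86_64.whl"
    )
    return (
        "https://github.com/Dao-AILab/flash-attention/releases/download/"
        f"v{version}/{wheel}"
    )


# The CUDA and torch versions could be detected from the installed torch, e.g.:
#   import torch
#   cuda = torch.version.cuda.split(".")[0]              # e.g. "12"
#   tv = ".".join(torch.__version__.split(".")[:2])      # e.g. "2.6"
print(flash_attn_wheel_url("12", "2.6"))
```

The result could then be written into an environment-specific config or passed to `uv` when resolving the lock file.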

Collaborator:

And could we open an issue to track this? :)

Collaborator (Author):

I have the script in a branch of the private repo, but it is not committed yet:
#57

38 changes: 37 additions & 1 deletion src/weathergen/utils/train_logger.py
@@ -8,6 +8,10 @@
# nor does it submit to any jurisdiction.

import datetime
import json
import math
import os.path
import time

import numpy as np

@@ -19,33 +23,65 @@ class TrainLogger:
def __init__(self, cf, path_run) -> None:
self.cf = cf
self.path_run = path_run
# TODO: add header with col names (loadtxt has an option to skip k header lines)

def log_metrics(self, metrics: dict[str, float]) -> None:
"""
Log metrics to a file.
For now, just scalar values are expected. There is no check.
"""
# Clean all the metrics to convert to float. Any other type (numpy etc.) will trigger a serialization error.
clean_metrics = {
"weathergen.timestamp": time.time_ns() // 1_000_000,
"weathergen.time": int(datetime.datetime.now().strftime("%Y%m%d%H%M%S")),
}
for key, value in metrics.items():
v = float(value)
if math.isnan(v) or math.isinf(v):
v = str(v)
clean_metrics[key] = v

# TODO: performance: we repeatedly open the file for each call. Better for multiprocessing
# but we can probably do better and rely for example on the logging module.
with open(os.path.join(self.path_run, "metrics.json"), "ab") as f:
Collaborator (Author): I suggest that we start with this simple version; we can always improve performance if it turns out to be a bottleneck.

s = json.dumps(clean_metrics) + "\n"
f.write(s.encode("utf-8"))
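Since each call appends one JSON object per line (JSON Lines), the file can be read back incrementally; a minimal reader sketch (the `metrics.json` file name follows the code above, the helper name is hypothetical):

```python
import json


def read_metrics(path: str) -> list[dict]:
    """Read an append-only JSON-lines metrics file into a list of dicts."""
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines; each non-blank line is one metrics record.
            if line.strip():
                records.append(json.loads(line))
    return records
```

Because each record is a flat dict, the result also loads directly into a `pandas.DataFrame` for analysis.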

#######################################
def add_train(self, samples, lr, loss_avg, stddev_avg, perf_gpu=0.0, perf_mem=0.0) -> None:
"""
Log training data
"""

metrics = dict(num_samples=samples)

log_vals = [int(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))]
log_vals += [samples]

metrics["loss_avg_0_mean"] = loss_avg[0].mean()
metrics["learning_rate"] = lr
log_vals += [loss_avg[0].mean()]
log_vals += [lr]

for i_obs, _rt in enumerate(self.cf.streams):
for j, _ in enumerate(self.cf.loss_fcts):
metrics[f"stream_{i_obs}.loss_{j}.loss_avg"] = loss_avg[j, i_obs]
log_vals += [loss_avg[j, i_obs]]
if len(stddev_avg) > 0:
for i_obs, _rt in enumerate(self.cf.streams):
log_vals += [stddev_avg[i_obs]]
metrics[f"stream_{i_obs}.stddev_avg"] = stddev_avg[i_obs]

with open(self.path_run + self.cf.run_id + "_train_log.txt", "ab") as f:
np.savetxt(f, log_vals)

log_vals = []
log_vals += [perf_gpu]
log_vals += [perf_mem]
if perf_gpu > 0.0:
metrics["perf.gpu"] = perf_gpu
if perf_mem > 0.0:
metrics["perf.memory"] = perf_mem
self.log_metrics(metrics)
with open(self.path_run + self.cf.run_id + "_perf_log.txt", "ab") as f:
np.savetxt(f, log_vals)
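The flat-text logs written above follow an append-one-record-per-call pattern; a sketch of how they round-trip (file name and record values hypothetical; the real path is `path_run + run_id + "_train_log.txt"`):

```python
import os
import tempfile

import numpy as np

# Mimic TrainLogger.add_train's append pattern.
path = os.path.join(tempfile.mkdtemp(), "train_log.txt")
record = [20250317120000, 128, 0.42, 1e-4]  # timestamp, samples, loss, lr

for _ in range(2):  # two training steps
    with open(path, "ab") as f:
        np.savetxt(f, record)

# savetxt writes one value per line for a 1-D sequence, so loadtxt returns
# a flat array; reshape with the known record length to recover one row per step.
rows = np.loadtxt(path).reshape(-1, len(record))
print(rows.shape)  # (2, 4)
```

This is also why the TODO in `__init__` about a header line matters: without it, the record length has to be known out of band.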

102 changes: 99 additions & 3 deletions uv.lock

Some generated files are not rendered by default.