Skip to content

[29] Logging in json format #68

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 17, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion src/weathergen/utils/train_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
# nor does it submit to any jurisdiction.

import datetime
import json
import math
import os.path
import time

import numpy as np

Expand All @@ -19,33 +23,65 @@ class TrainLogger:
def __init__(self, cf, path_run) -> None:
    """
    Store the run configuration and output location for later logging calls.

    cf: run configuration; later methods read cf.streams, cf.loss_fcts and cf.run_id.
    path_run: directory/prefix under which all log files are written.
    """
    self.cf = cf
    self.path_run = path_run
    # TODO: add header with col names (loadtxt has an option to skip k header lines)

def log_metrics(self, metrics: dict[str, float]) -> None:
    """
    Append one record of scalar metrics to <path_run>/metrics.json, one JSON
    object per line.

    Only scalar values are expected; there is no check. Each record is stamped
    with the current time in epoch milliseconds and as a YYYYMMDDHHMMSS integer.
    """
    record = {
        "weathergen.timestamp": time.time_ns() // 1_000_000,
        "weathergen.time": int(datetime.datetime.now().strftime("%Y%m%d%H%M%S")),
    }
    # Coerce every value to float; anything non-convertible (numpy scalars are
    # fine, arbitrary objects are not) raises before we touch the file.
    for name, raw in metrics.items():
        val = float(raw)
        # JSON has no NaN/inf literals, so store non-finite values as strings.
        record[name] = str(val) if (math.isnan(val) or math.isinf(val)) else val

    # TODO: performance: we repeatedly open the file for each call. Better for
    # multiprocessing but we can probably do better and rely for example on the
    # logging module.
    target = os.path.join(self.path_run, "metrics.json")
    with open(target, "ab") as f:
        f.write((json.dumps(record) + "\n").encode("utf-8"))

#######################################
def add_train(self, samples, lr, loss_avg, stddev_avg, perf_gpu=0.0, perf_mem=0.0) -> None:
    """
    Log training data.

    Writes three sinks: a per-step row appended to <run_id>_train_log.txt, a
    GPU/memory row appended to <run_id>_perf_log.txt, and a metrics record via
    log_metrics. loss_avg is indexed as loss_avg[loss_fct, stream].
    """
    timestamp = int(datetime.datetime.now().strftime("%Y%m%d%HH%M%S".replace("HH", "H%M")[:14]) if False else datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    overall_loss = loss_avg[0].mean()

    metrics = {
        "num_samples": samples,
        "loss_avg_0_mean": overall_loss,
        "learning_rate": lr,
    }
    log_vals = [timestamp, samples, overall_loss, lr]

    # Per-stream, per-loss-function averages; order must match the flat text log.
    for i_obs in range(len(self.cf.streams)):
        for j in range(len(self.cf.loss_fcts)):
            metrics[f"stream_{i_obs}.loss_{j}.loss_avg"] = loss_avg[j, i_obs]
            log_vals.append(loss_avg[j, i_obs])
    if len(stddev_avg) > 0:
        for i_obs in range(len(self.cf.streams)):
            log_vals.append(stddev_avg[i_obs])
            metrics[f"stream_{i_obs}.stddev_avg"] = stddev_avg[i_obs]

    # NOTE(review): path is built by concatenation, so path_run is presumably a
    # prefix ending in a separator — inconsistent with log_metrics' os.path.join.
    with open(self.path_run + self.cf.run_id + "_train_log.txt", "ab") as f:
        np.savetxt(f, log_vals)

    # Performance numbers go to their own file; zeros mean "not measured" and
    # are kept out of the structured metrics record.
    perf_vals = [perf_gpu, perf_mem]
    if perf_gpu > 0.0:
        metrics["perf.gpu"] = perf_gpu
    if perf_mem > 0.0:
        metrics["perf.memory"] = perf_mem
    self.log_metrics(metrics)
    with open(self.path_run + self.cf.run_id + "_perf_log.txt", "ab") as f:
        np.savetxt(f, perf_vals)

Expand Down