|
| 1 | +# %% |
from __future__ import annotations

from typing import Any

import pandas as pd
import requests
import wandb
from sklearn.metrics import f1_score, precision_score, r2_score, recall_score
from tqdm import tqdm

from matbench_discovery import ROOT, today
from matbench_discovery.load_preds import load_df_wbm_with_preds
| 14 | + |
| 15 | +__author__ = "Janosh Riebesell" |
| 16 | +__date__ = "2022-11-28" |
| 17 | + |
| 18 | + |
# %%
# Per-model wandb query config: the expected number of logged runs plus the
# MongoDB-style filters used to locate those runs in the wandb project.
models: dict[str, dict[str, Any]] = {
    "Wren": {"n_runs": 0},
    "CGCNN": {
        "n_runs": 10,
        "filters": {
            "created_at": {"$gt": "2022-11-21", "$lt": "2022-11-23"},
            "display_name": {"$regex": "cgcnn-robust-formation_energy_per_atom"},
        },
    },
    "Voronoi RF": {
        "n_runs": 70,
        "filters": {
            "created_at": {"$gt": "2022-11-17", "$lt": "2022-11-28"},
            "display_name": {"$regex": "voronoi-features"},
        },
    },
    "Wrenformer": {
        "n_runs": 10,
        "filters": {
            "created_at": {"$gt": "2022-11-14", "$lt": "2022-11-16"},
            "display_name": {"$regex": "wrenformer-robust-mp-formation_energy"},
        },
    },
    "MEGNet": {
        "n_runs": 1,
        "filters": {
            "created_at": {"$gt": "2022-11-17", "$lt": "2022-11-19"},
            "display_name": {"$regex": "megnet-wbm-IS2RE"},
        },
    },
    "M3GNet": {
        "n_runs": 99,
        "filters": {
            "created_at": {"$gt": "2022-10-31", "$lt": "2022-11-01"},
            "display_name": {"$regex": "m3gnet-wbm-IS2RE"},
        },
    },
    "BOWSR MEGNet": {
        "n_runs": 1000,
        "filters": {
            "created_at": {"$gt": "2022-11-22", "$lt": "2022-11-25"},
            "display_name": {"$regex": "bowsr-megnet"},
        },
    },
}
| 65 | + |
# maps model name -> {"Run time": total seconds, "Hardware": "GPU: x, CPU: y"}
run_times: dict[str, dict[str, str | int | float]] = {}


# %% calculate total model run times from wandb logs
# NOTE these model run times are pretty meaningless since some models were run on GPU
# (Wrenformer and CGCNN), others on CPU. Also BOWSR MEGNet, M3GNet and MEGNet weren't
# trained from scratch. Their run times only indicate the time needed to predict the
# test set.

wandb_api = wandb.Api()  # create the API client once instead of once per model

for model in (pbar := tqdm(models)):
    model_dict = models[model]
    n_runs, filters = (model_dict.get(key) for key in ("n_runs", "filters"))
    # skip models with nothing logged, and don't re-fetch on notebook re-runs
    if n_runs == 0 or model in run_times:
        continue
    pbar.set_description(model)

    runs = wandb_api.runs("janosh/matbench-discovery", filters=filters)

    # explicit raise instead of assert so the check survives `python -O`
    if len(runs) != n_runs:
        raise ValueError(f"found {len(runs)=} for {model}, expected {n_runs}")

    # sum the wandb-reported runtime over all runs of this model
    run_time = sum(run.summary.get("_wandb", {}).get("runtime", 0) for run in runs)
    # NOTE we assume all jobs have the same metadata here
    # timeout so a hung HTTP request can't stall the whole loop indefinitely
    metadata = requests.get(runs[0].file("wandb-metadata.json").url, timeout=30).json()

    n_gpu, n_cpu = metadata.get("gpu_count", 0), metadata.get("cpu_count", 0)
    run_times[model] = {"Run time": run_time, "Hardware": f"GPU: {n_gpu}, CPU: {n_cpu}"}
| 92 | + |
| 93 | + |
# Raw run-time totals (the "Run time" sums, presumably seconds — before the
# Hardware field was added) as recorded on 2022-11-28:
# run_times = {'Voronoi RF': 739608,
#              'Wrenformer': 208399,
#              'MEGNet': 12396,
#              'M3GNet': 301138,
#              'BOWSR MEGNet': 9105237}
| 100 | + |
| 101 | + |
# %%
# Column holding the DFT reference target and the precomputed hull distance.
target_col = "e_form_per_atom_mp2020_corrected"
e_above_hull_col = "e_above_hull_mp2020_corrected_ppd_mp"

# Load WBM summary data joined with each model's predictions (3-decimal rounding),
# then drop rows with extreme formation energies (>= 5 eV/atom).
df_wbm = load_df_wbm_with_preds(models=models).round(3)
df_wbm = df_wbm.query(f"{target_col} < 5")

e_above_hull = df_wbm[e_above_hull_col]
| 111 | + |
| 112 | + |
# %%
# Start the metrics table from the run-time info (one row per model).
df_metrics = pd.DataFrame(run_times).T

for model in models:
    dct: dict[str, float] = {}
    # NOTE(review): treats the error in predicted e_form as the error in
    # e_above_hull (same DFT offset on both) — confirm this holds for all models
    e_above_hull_pred = df_wbm[model] - df_wbm[target_col]

    is_stable_true = e_above_hull < 0
    is_stable_pred = e_above_hull_pred < 0

    dct["F1"] = f1_score(is_stable_true, is_stable_pred)
    # fix: Precision/Recall were previously computed with
    # f1_score(..., pos_label=...), which returns the F1 score of the selected
    # class, not precision or recall — use the dedicated metrics instead
    dct["Precision"] = precision_score(is_stable_true, is_stable_pred)
    dct["Recall"] = recall_score(is_stable_true, is_stable_pred)

    dct["MAE"] = (e_above_hull_pred - e_above_hull).abs().mean()
    dct["RMSE"] = ((e_above_hull_pred - e_above_hull) ** 2).mean() ** 0.5

    # r2_score can't handle NaNs: align both series on non-NaN predictions
    pred_no_nan = e_above_hull_pred.dropna()
    dct["R2"] = r2_score(e_above_hull.loc[pred_no_nan.index], pred_no_nan)

    df_metrics.loc[model, list(dct)] = dct.values()
| 132 | + |
| 133 | + |
# Render the metrics table: 3-decimal display precision, viridis heat map.
# (pass gmap=np.log10(df_metrics) to background_gradient for a log color scale)
styler = df_metrics.style.format(precision=3)
df_styled = styler.background_gradient(cmap="viridis")
df_styled
| 139 | + |
| 140 | + |
# %%
# Basic table CSS: sans-serif font, collapsed borders, padded left-aligned cells.
styles = {
    "": "font-family: sans-serif; border-collapse: collapse;",
    "td, th": "border: 1px solid #ddd; text-align: left; padding: 8px;",
}
table_styles = [{"selector": sel, "props": props} for sel, props in styles.items()]
df_styled.set_table_styles(table_styles)

df_styled.to_html(f"{ROOT}/figures/{today}-metrics-table.html")
0 commit comments