Commit a5b3211 (parent 6d30216)

change SI plot of largest model errors: Predicted vs. DFT hull distance colored by model disagreement

change y-axis from average model error to all-model mean

9 files changed (+69 -48 lines)

matbench_discovery/preds.py (+13 -3)

```diff
@@ -19,12 +19,12 @@
 e_form_col = "e_form_per_atom_mp2020_corrected"
 each_true_col = "e_above_hull_mp2020_corrected_ppd_mp"
 each_pred_col = "e_above_hull_pred"
+model_mean_each_col = "Mean prediction all models"
 model_mean_err_col = "Mean error all models"
 model_std_col = "Std. dev. over models"

-
-quantity_labels[model_mean_err_col] = f"{model_mean_err_col} {ev_per_atom}"
-quantity_labels[model_std_col] = f"{model_std_col} {ev_per_atom}"
+for col in (model_mean_each_col, model_mean_err_col, model_std_col):
+    quantity_labels[col] = f"{col} {ev_per_atom}"


 class PredFiles(Files):
@@ -157,8 +157,18 @@ def load_df_wbm_with_preds(
         df_preds[each_true_col] + df_preds[model] - df_preds[e_form_col]
     )

+# important: do df_each_pred.std(axis=1) before inserting
+# df_each_pred[model_mean_each_col]
+df_preds[model_std_col] = df_each_pred.std(axis=1)
+df_each_pred[model_mean_each_col] = df_preds[model_mean_each_col] = df_each_pred.mean(
+    axis=1
+)

 # dataframe of all models' errors in their EACH predictions (eV/atom)
 df_each_err = pd.DataFrame()
 for model in df_metrics.T.MAE.sort_values().index:
     df_each_err[model] = df_preds[model] - df_preds[e_form_col]
+
+df_each_err[model_mean_err_col] = df_preds[model_mean_err_col] = df_each_err.abs().mean(
+    axis=1
+)
```
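The inline comment about ordering in this hunk is load-bearing: pandas row-wise reductions include every column present, so inserting the all-model mean before calling `.std(axis=1)` would let the mean column shrink the reported disagreement. A minimal sketch of the pitfall with toy data (column names hypothetical):

```python
import pandas as pd

# toy per-model hull-distance predictions: two models, two structures
df_each_pred = pd.DataFrame({"model A": [0.1, 0.5], "model B": [0.3, 0.9]})

std_before = df_each_pred.std(axis=1)  # disagreement over models only

# now insert the all-model mean as its own column, as preds.py does
df_each_pred["Mean prediction all models"] = df_each_pred.mean(axis=1)

std_after = df_each_pred.std(axis=1)  # wrongly includes the mean column

# appending a value equal to the row mean leaves the sum of squared deviations
# unchanged but increases n, so the sample std always shrinks
assert (std_after < std_before).all()
```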

pyproject.toml (+10 -9)

```diff
@@ -10,23 +10,23 @@ authors = [{ name = "Janosh Riebesell", email = "[email protected]" }]
 readme = "readme.md"
 license = { file = "license" }
 keywords = [
-    "materials discovery",
-    "inorganic crystal stability",
-    "machine learning",
-    "interatomic potential",
     "Bayesian optimization",
-    "high-throughput search",
     "convex hull",
+    "high-throughput search",
+    "inorganic crystal stability",
+    "interatomic potential",
+    "machine learning",
+    "materials discovery",
 ]
 classifiers = [
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
-    "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.9",
-    "Topic :: Scientific/Engineering :: Chemistry",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Chemistry",
     "Topic :: Scientific/Engineering :: Physics",
 ]

@@ -58,6 +58,7 @@ running-models = [
     "chgnet",
     # torch needs to install before aviary
     "torch",
+
     "aviary@git+https://github.com/CompRhys/aviary",
     "m3gnet",
     "maml",
@@ -82,7 +83,7 @@ select = [
     "B", # flake8-bugbear
     "C40", # flake8-comprehensions
     "D", # pydocstyle
-    "E", # pycodestyle
+    "E", # pycodestyle error
     "F", # pyflakes
     "I", # isort
     "N", # pep8-naming
@@ -97,7 +98,7 @@ select = [
     "SIM", # flake8-simplify
     "TID", # tidy imports
     "UP", # pyupgrade
-    "W", # pycodestyle
+    "W", # pycodestyle warning
     "YTT", # flake8-2020
 ]
 ignore = [
```

scripts/analyze_model_failure_cases.py (+13 -13)

```diff
@@ -25,6 +25,7 @@
     df_metrics,
     df_preds,
     each_true_col,
+    model_mean_each_col,
     model_mean_err_col,
     model_std_col,
 )
@@ -33,10 +34,6 @@
 __date__ = "2023-02-15"

 models = list(df_each_pred)
-df_preds[model_std_col] = df_preds[models].std(axis=1)
-df_each_err[model_mean_err_col] = df_preds[model_mean_err_col] = df_each_err.abs().mean(
-    axis=1
-)
 fp_diff_col = "site_stats_fingerprint_init_final_norm_diff"


@@ -181,14 +178,15 @@
 # on MP which is highly low-energy enriched.
 # also possible models failed to learn whatever physics makes these materials highly
 # unstable
+n_structs = 200
 fig = (
-    df_preds.nlargest(200, model_mean_err_col)
+    df_preds.nlargest(n_structs, model_mean_err_col)
     .round(2)
     .plot.scatter(
         x=each_true_col,
-        y=model_mean_err_col,
+        y=model_mean_each_col,
         color=model_std_col,
-        size=n_examp_for_rarest_elem_col,
+        size="n_sites",
         backend="plotly",
         hover_name="material_id",
         hover_data=["formula"],
@@ -197,17 +195,19 @@
 )
 # yanchor="bottom", y=1, xanchor="center", x=0.5, orientation="h", thickness=12
 fig.layout.coloraxis.colorbar.update(title_side="right", thickness=14)
+fig.layout.margin.update(l=0, r=30, b=0, t=30)
 add_identity_line(fig)
-fig.layout.title = (
-    "Largest model errors vs. DFT hull distance colored by model disagreement"
+fig.layout.title.update(
+    text=f"{n_structs} largest model errors: Predicted vs. DFT hull distance<br>"
+    "colored by model disagreement",
+    x=0.5,
 )
 # tried setting error_y=model_std_col but looks bad
 # fig.update_traces(error_y=dict(color="rgba(255,255,255,0.2)", width=3, thickness=2))
 fig.show()
-# save_fig(fig, f"{FIGS}/scatter-largest-errors-models-mean-vs-each-true.svelte")
-# save_fig(
-#     fig, f"{ROOT}/tmp/figs/scatter-largest-errors-models-mean-vs-each-true.pdf"
-# )
+img_name = "scatter-largest-errors-models-mean-vs-true-hull-dist"
+save_fig(fig, f"{FIGS}/{img_name}.svelte")
+# save_fig(fig, f"{ROOT}/tmp/figs/{img_name}.pdf")


 # %% find materials that were misclassified by all models
```
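The plotting pattern in this hunk (select the n rows with the largest mean error, then scatter predicted vs. DFT values, colored by inter-model std and sized by structure size) reduces to the following self-contained sketch with random toy data and hypothetical column names:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(seed=0)
n = 1_000
df = pd.DataFrame(
    {
        "DFT hull dist": rng.normal(0, 0.2, n),
        "Mean prediction all models": rng.normal(0, 0.2, n),
        "Std. dev. over models": rng.uniform(0, 0.3, n),
        "n_sites": rng.integers(1, 50, n),
    }
)
df["Mean error all models"] = (
    df["Mean prediction all models"] - df["DFT hull dist"]
).abs()

# keep only the 200 largest-error rows, then scatter predicted vs. DFT value
fig = df.nlargest(200, "Mean error all models").plot.scatter(
    x="DFT hull dist",
    y="Mean prediction all models",
    color="Std. dev. over models",
    size="n_sites",
    backend="plotly",  # pandas delegates to plotly express
)
fig.show()
```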

scripts/compile_metrics.py (+2 -2)

```diff
@@ -223,8 +223,8 @@
         "margin-bottom": "0",
         "margin-left": "0",
         # fit page size to content
-        "page-width": f"{(len(styler.columns) + 1) * 10}",
-        "page-height": f"{(len(styler.index) + 1) * 6}",
+        "page-width": f"{(len(styler.columns) + 1) * 8.3}",
+        "page-height": f"{(len(styler.index) + 1) * 5.5}",
     },
 )
```
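The sizing tweak scales the export page to the table's shape: per-column width drops from 10 to 8.3 and per-row height from 6 to 5.5 (units are not stated in the diff; they read like millimeters for wkhtmltopdf-style page-width/page-height options, but that is an assumption). A toy calculation:

```python
# hypothetical table shape: 10 metric columns, 20 model rows
n_cols, n_rows = 10, 20

page_width = (n_cols + 1) * 8.3  # was (n_cols + 1) * 10 before this commit
page_height = (n_rows + 1) * 5.5  # was (n_rows + 1) * 6 before this commit

print(f"{page_width=:.1f} {page_height=:.1f}")  # page_width=91.3 page_height=115.5
```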

scripts/hist_classified_stable_vs_hull_dist.py (+12 -11)

```diff
@@ -11,7 +11,7 @@

 from pymatviz.utils import save_fig

-from matbench_discovery import FIGS
+from matbench_discovery import ROOT
 from matbench_discovery.data import df_wbm
 from matbench_discovery.plots import hist_classified_stable_vs_hull_dist
 from matbench_discovery.preds import df_each_pred, each_true_col
@@ -21,11 +21,10 @@


 # %%
-model_name = "Wrenformer"
-model_name = "CHGNet"
-# model_name = "M3GNet"
-# model_name = "Voronoi RF"
-which_energy: Final = "true"
+# model_name = "Wrenformer"
+model_name = "CGCNN"
+# model_name = "CGCNN+P"
+which_energy: Final = "pred"
 df_each_pred[each_true_col] = df_wbm[each_true_col]
 backend: Final = "plotly"

@@ -35,16 +34,18 @@
     each_pred_col=model_name,
     which_energy=which_energy,
     # stability_threshold=-0.05,
-    # rolling_acc=None,
+    rolling_acc=None,
     backend=backend,
 )

 if backend == "plotly":
-    fig.layout.title = model_name
+    # fig.layout.title.update(text=model_name, x=0.5)
+    fig.layout.margin.update(l=0, r=0, b=0, t=30)
+    # fig.update_yaxes(range=[0, 12000])
 fig.show()


 # %%
-img_path = f"{FIGS}/hist-clf-{which_energy}-hull-dist-{model_name}"
-# save_fig(fig, f"{img_path}.svelte")
-save_fig(fig, f"{img_path}.webp")
+img_name = f"hist-clf-{which_energy}-hull-dist-{model_name}"
+# save_fig(fig, f"{FIGS}/{img_name}.svelte")
+save_fig(fig, f"{ROOT}/tmp/figs/{img_name}.pdf")
```
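The save-path switch works because `pymatviz.utils.save_fig` picks the output format from the file extension, so the scripts flip between interactive `.svelte` output for the site and static `.pdf` output for the preprint just by changing the suffix. A usage sketch (hypothetical paths; static export assumed to require a kaleido install, as is standard for plotly):

```python
import plotly.express as px
from pymatviz.utils import save_fig

fig = px.scatter(x=[1, 2, 3], y=[2, 4, 8])

save_fig(fig, "tmp/figs/example.pdf")  # static export for the preprint (needs kaleido)
save_fig(fig, "site/src/figs/example.svelte")  # interactive, embeddable in the site
```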

site/src/figs/scatter-largest-errors-models-mean-vs-true-hull-dist.svelte (+1)

(generated file, diff not shown)

site/src/routes/preprint/+page.md (+1 -1)

```diff
@@ -5,7 +5,7 @@
   import CumulativeClfMetrics from '$figs/cumulative-clf-metrics.svelte'
   import RollingMaeVsHullDistModels from '$figs/rolling-mae-vs-hull-dist-models.svelte'
   import ElementErrorsPtableHeatmap from '$models/element-errors-ptable-heatmap.svelte'
-  import HistClfTrueHullDistModels from '$figs/hist-clf-true-hull-dist-models.svelte'
+  import HistClfTrueHullDistModels from '$figs/hist-clf-true-hull-dist-models-4x2.svelte'
   import { onMount } from 'svelte'

   let mounted = false
```

site/src/routes/si/+page.md (+7 -7)

```diff
@@ -1,5 +1,5 @@
 <script lang="ts">
-  import MetricsTableMegnetCombos from '$figs/metrics-table-megnet-combos.svelte'
+  import MetricsTableMegnetUipCombos from '$figs/metrics-table-megnet-uip-combos.svelte'
   import MetricsTableFirst10k from '$figs/metrics-table-first-10k.svelte'
   import RunTimeBars from '$figs/model-run-times-bar.svelte'
   import RocModels from '$figs/roc-models.svelte'
@@ -14,7 +14,7 @@
   import HistClfPredHullDistModels from '$figs/hist-clf-pred-hull-dist-models-4x2.svelte'
   import SpacegroupSunburstWbm from '$figs/spacegroup-sunburst-wbm.svelte'
   import SpacegroupSunburstWrenformerFailures from '$figs/spacegroup-sunburst-wrenformer-failures.svelte'
-  import ScatterLargestErrorsModelsMeanVsEachTrue from '$figs/scatter-largest-errors-models-mean-vs-each-true.svelte'
+  import ScatterLargestErrorsModelsMeanVsTrueHullDist from '$figs/scatter-largest-errors-models-mean-vs-true-hull-dist.svelte'
   import EAboveHullScatterWrenformerFailures from '$figs/e-above-hull-scatter-wrenformer-failures.svelte'
   import ProtoCountsWrenformerFailures from '$figs/proto-counts-wrenformer-failures.svelte'
   import ElementPrevalenceVsError from '$figs/element-prevalence-vs-error.svelte'
@@ -99,19 +99,19 @@ Given its strong performance on batch 1, it is possible that given sufficiently
 ## Largest Errors vs DFT Hull Distance

 {#if mounted}
-<ScatterLargestErrorsModelsMeanVsEachTrue />
+<ScatterLargestErrorsModelsMeanVsTrueHullDist />
 {/if}

-> @label:fig:scatter-largest-errors-models-mean-vs-each-true The 200 structures with largest error averaged over all models vs their DFT hull distance colored by model disagreement (as measured by standard deviation in hull distance predictions from different models) and sized by number of training structures containing the least prevalent element (e.g. if a scatter point had composition FeO, MP has 6.6k structures containing Fe and 82k containing O so its size would be set to 6.6k). Thus smaller points have less training support. This plot suggests all models are biased to predict low energy and perhaps fail to capture certain physics resulting in highly unstable structures. This is unsurprising considering MP training data mainly consists of low energy structures.<br>
-> It is also possible that some of the blue points with large error yet good agreement among models are in fact accurate ML predictions for a DFT relaxation gone wrong.
+> @label:fig:scatter-largest-errors-models-mean-vs-true-hull-dist DFT vs predicted hull distance (averaged over all models) for the 200 structures with the largest errors, colored by model disagreement (as measured by the standard deviation in hull distance predictions across models) and sized by number of atoms in the structure. This plot shows that high-error predictions are biased towards predicting hull distances that are too small, which is unsurprising considering MP training data mainly consists of low-energy structures.<br>
+> However, note the clear color separation between the mostly blue low-energy-biased predictions and the yellow/red high-error predictions. Blue means the models are in good agreement, i.e. all models are "wrong" together. Red/yellow marks large-error predictions with little model agreement, i.e. models that are wrong in different ways. It is possible that some of the blue points with large error yet good inter-model agreement are in fact accurate ML predictions for a DFT relaxation gone wrong. Zooming in on the blue points reveals that many of them are large markers, corresponding to large structures where DFT failures are less surprising. This suggests ML model committees could be used to cheaply screen large databases for DFT errors in a high-throughput manner.

 ## MEGNet formation energies from UIP-relaxed structures

 {#if mounted}
-<MetricsTableMegnetCombos select={[`model`, `MEGNet`, `CHGNet`, `M3GNet`, `CHGNet + MEGNet`, `M3GNet + MEGNet`]} />
+<MetricsTableMegnetUipCombos select={[`model`, `MEGNet`, `CHGNet`, `M3GNet`, `CHGNet + MEGNet`, `M3GNet + MEGNet`]} />
 {/if}

-> @label:fig:metrics-table-megnet-combos This table shows metrics obtained by combining MEGNet with both UIPs. The metrics in rows labeled M3GNet + MEGNet and CHGNet + MEGNet are the result of passing M3GNet/CHGNet-relaxed structures into MEGNet for formation energy prediction. Both combos perform worse than using the respective UIPs on their own with a more pronounced performance drop from CHGNet to CHGNet + MEGNet than M3GNet to M3GNet + MEGnet. This suggests MEGNet has learned no additional knowledge of the PES that is not already present in the UIPs. However, both combos perform better than MEGNet on its own, demonstrating that UIP relaxation provides real utility at very low cost for any downstream structure-dependent analysis.
+> @label:fig:metrics-table-megnet-uip-combos This table shows metrics obtained by combining MEGNet with both UIPs. The metrics in rows labeled M3GNet + MEGNet and CHGNet + MEGNet are the result of passing M3GNet/CHGNet-relaxed structures into MEGNet for formation energy prediction. Both combos perform worse than using the respective UIPs on their own, with a more pronounced performance drop from CHGNet to CHGNet + MEGNet than from M3GNet to M3GNet + MEGNet. This suggests MEGNet has learned no additional knowledge of the PES that is not already present in the UIPs. However, both combos perform better than MEGNet on its own, demonstrating that UIP relaxation provides real utility at very low cost for any downstream structure-dependent analysis.

 The UIPs M3GNet and CHGNet are both trained to predict DFT energies (including/excluding MP2020 energy corrections for CHGNet/M3GNet) while MEGNet is trained to predict formation energies.
```

tests/test_preds.py (+10 -2)

```diff
@@ -11,6 +11,8 @@
     e_form_col,
     each_true_col,
     load_df_wbm_with_preds,
+    model_mean_each_col,
+    model_mean_err_col,
 )


@@ -29,13 +31,19 @@ def test_df_metrics() -> None:

 def test_df_each_pred() -> None:
     assert len(df_each_pred) == len(df_wbm)
-    assert {*df_each_pred} == {*df_metrics}, "df_each_pred has wrong columns"
+    assert {*df_each_pred} == {
+        *df_metrics,
+        model_mean_each_col,
+    }, "df_each_pred has wrong columns"
     assert all(df_each_pred.isna().mean() < 0.05), "too many NaNs in df_each_pred"


 def test_df_each_err() -> None:
     assert len(df_each_err) == len(df_wbm)
-    assert {*df_each_err} == {*df_metrics}, "df_each_err has wrong columns"
+    assert {*df_each_err} == {
+        *df_metrics,
+        model_mean_err_col,
+    }, "df_each_err has wrong columns"
     assert all(df_each_err.isna().mean() < 0.05), "too many NaNs in df_each_err"
```
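The `{*df}` idiom in these asserts works because iterating a DataFrame yields its column labels, so `{*df_each_pred}` is the set of column names and the comparison is order-independent. A toy demonstration (hypothetical frames):

```python
import pandas as pd

df_each_pred = pd.DataFrame(
    {"model A": [0.1], "model B": [0.2], "Mean prediction all models": [0.15]}
)
df_metrics = pd.DataFrame({"model A": [0.05], "model B": [0.07]})

# iterating a DataFrame yields its column labels, so {*df} == set(df.columns)
assert {*df_each_pred} == {*df_metrics, "Mean prediction all models"}
```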
