janosh
diff --git a/‎data/mp/build_phase_diagram.py
Lines changed: 1 addition & 1 deletion b/‎data/mp/build_phase_diagram.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎data/mp/get_mp_energies.py
Lines changed: 2 additions & 2 deletions b/‎data/mp/get_mp_energies.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎data/wbm/analysis.py
Lines changed: 58 additions & 2 deletions b/‎data/wbm/analysis.py
Lines changed: 58 additions & 2 deletions
diff --git a/‎data/wbm/fetch_process_wbm_dataset.py
Lines changed: 1 addition & 1 deletion b/‎data/wbm/fetch_process_wbm_dataset.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎data/wbm/readme.md
Lines changed: 19 additions & 7 deletions b/‎data/wbm/readme.md
Lines changed: 19 additions & 7 deletions
diff --git a/‎matbench_discovery/__init__.py
Lines changed: 3 additions & 2 deletions b/‎matbench_discovery/__init__.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎matbench_discovery/data.py
Lines changed: 1 addition & 1 deletion b/‎matbench_discovery/data.py
Lines changed: 1 addition & 1 deletion
@@ -109,4 +109,4 @@
     xlabel="MP Formation Energy (eV/atom)",
     ylabel="Our Formation Energy (eV/atom)",
 )
-ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.png", dpi=300)
+ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)
@@ -80,7 +80,7 @@
 annotate_mae_r2(df.formation_energy_per_atom, df.decomposition_enthalpy)
 # result on 2023-01-10: plots match. no correlation between formation energy and decomposition
 # enthalpy. R^2 = -1.571, MAE = 1.604
-# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.png", dpi=300)
+# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.webp", dpi=300)
 
 
 # %% scatter plot energy above convex hull vs decomposition enthalpy
@@ -99,4 +99,4 @@
     title=f"{n_above_line:,} / {len(df):,} = {n_above_line/len(df):.1%} "
     "MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
 )
-# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.png", dpi=300)
+# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
@@ -7,13 +7,15 @@
 
 from matbench_discovery import FIGS, today
 from matbench_discovery.data import df_wbm
-
-module_dir = os.path.dirname(__file__)
+from matbench_discovery.plots import pio
 
 """
 Compare MP and WBM elemental prevalence. Starting with WBM, MP below.
 """
 
+module_dir = os.path.dirname(__file__)
+print(f"{pio.templates.default=}")
+
 
 # %%
 wbm_elem_counts = count_elements(df_wbm.formula).astype(int)
@@ -81,3 +83,57 @@
 # %%
 mp_fig.write_image(f"{module_dir}/figs/{today}-mp-elements.svg", width=1000, height=500)
 # save_fig(mp_fig, f"{FIGS}/{today}-mp-elements.svelte")
+
+
+# %% histogram of energy above MP convex hull for WBM
+col = "e_above_hull_mp2020_corrected_ppd_mp"
+# col = "e_form_per_atom_mp2020_corrected"
+mean, std = df_wbm[col].mean(), df_wbm[col].std()
+
+fig = df_wbm[col].hist(
+    bins=100,
+    backend="plotly",
+    range_x=[mean - 2 * std, mean + 2 * std],
+    template="plotly_dark",
+)
+
+if col.startswith("e_above_hull"):
+    n_stable = sum(df_wbm[col] <= 0)
+    n_unstable = sum(df_wbm[col] > 0)
+    assert n_stable + n_unstable == len(df_wbm.dropna())
+
+    dummy_mae = (df_wbm[col] - df_wbm[col].mean()).abs().mean()
+
+    title = (
+        f"n={len(df_wbm.dropna()):,} with {n_stable:,} stable + {n_unstable:,} "
+        f"unstable, dummy MAE={dummy_mae:.2f}"
+    )
+    fig.update_layout(title=dict(text=title, x=0.5, y=0.95))
+
+fig.update_layout(showlegend=False, paper_bgcolor="rgba(0,0,0,0)")
+fig.update_xaxes(title_text="WBM energy above MP convex hull (eV/atom)")
+
+
+for x_pos, label in zip(
+    [mean, mean + std, mean - std],
+    [f"{mean = :.2f}", f"{mean + std = :.2f}", f"{mean - std = :.2f}"],
+):
+    fig.add_vline(x=x_pos, line=dict(width=1, dash="dash"))
+    fig.add_annotation(
+        x=x_pos,
+        y=0.95,
+        text=label,
+        showarrow=False,
+        yref="paper",
+        xanchor="left",
+        xshift=5,
+    )
+fig.show()
+
+
+# subsample x
+for trace in fig.data:
+    trace.x = trace.x[::8]
+
+save_fig(fig, f"{FIGS}/{today}-wbm-each-hist.svelte")
+save_fig(fig, f"./figs/{today}-wbm-each-hist.svg", width=1000, height=500)
@@ -526,7 +526,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
     xlabel="legacy corrections (eV / atom)",
     ylabel="MP2020 corrections (eV / atom)",
 )
-# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
+# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.webp")
 
 
 # %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to
 
@@ -45,9 +45,9 @@ The number of materials in each step before and after processing are:
 | before | 61,848 | 52,800 | 79,205 | 40,328 | 23,308 | 257,487 |
 | after  | 61,466 | 52,755 | 79,160 | 40,314 | 23,268 | 256,963 |
 
-## 🔗 &thinsp; Links to raw WBM Data Files
+## 🔗 &thinsp; Links to WBM Files
 
-Links to WBM data files have proliferated. This is an attempt to keep track of them.
+Links to raw WBM data files have proliferated. This is an attempt to keep track of them.
 
 Initial structures (after element substitution but before DFT relaxation) were sent as Google Drive links via email by Hai-Chen Wang on 2021-09-01.
 
@@ -72,18 +72,30 @@ materialscloud:2021.68 includes a readme file with a description of the dataset,
 
 [wbm paper]: https://nature.com/articles/s41524-020-00481-6
 
-## 📊 &thinsp; Chemical Diversity
+## 🧪 &thinsp; Chemical Diversity
 
-Both the WBM test set and even more so the MP training set are heavily oxide dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
+The WBM test set and, even more so, the MP training set are heavily oxide-dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large, diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
+
+Element counts for the WBM test set consisting of 256,963 WBM `ComputedStructureEntries`
 
 <slot name="wbm-elements-heatmap">
   <img src="./figs/2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
 </slot>
-<caption>Element counts for test set consisting of 256,963 WBM <code>ComputedStructureEntries</code></caption>
 
-By comparison, the training set of MP ComputedStructureEntries has this element distribution.
+Element counts for the MP training set consisting of 146,323 `ComputedStructureEntries`
 
 <slot name="mp-elements-heatmap">
   <img src="./figs/2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
 </slot>
-<caption>Element counts for training set consisting of 146,323 MP <code>ComputedStructureEntries</code></caption>
+
+## 🎯 &thinsp; Target Distribution
+
+The distribution of energies above the MP convex hull in the WBM test set has a mean of **0.02 eV/atom** and a standard deviation of **0.25 eV/atom**.
+
+The dummy MAE of always predicting the test set mean is **0.17 eV/atom**.
+
+The number of stable materials is **97k** out of 257k, resulting in a dummy stability hit rate of **37%**.
+
+<slot name="wbm-each-hist">
+  <img src="./figs/2023-01-26-wbm-each-hist.svg" alt="WBM energy above MP convex hull distribution">
+</slot>
@@ -5,9 +5,10 @@
 import sys
 from datetime import datetime
 
-ROOT = os.path.dirname(os.path.dirname(__file__))  # repository root
+ROOT = os.path.dirname(os.path.dirname(__file__))  # repo root
 FIGS = f"{ROOT}/site/src/figs"  # directory to store interactive figures
-STATIC = f"{ROOT}/site/static/figs"  # directory to store static figures
+STATIC = f"{ROOT}/site/static/figs"  # directory to store static figures, is symlinked
+# into site/src/routes/paper/figs dir
 MODELS = f"{ROOT}/site/src/routes/models"  # directory to write model analysis
 # whether a currently running slurm job is in debug mode
 DEBUG = "DEBUG" in os.environ or (
 
@@ -181,7 +181,7 @@ def glob_to_df(
     return pd.concat(sub_dfs.values())
 
 
-def load_df_wbm_with_preds(
+def load_df_wbm_preds(
     models: Sequence[str],
     pbar: bool = True,
     id_col: str = "material_id",
Original file line number	Diff line number	Diff line change
`@@ -109,4 +109,4 @@`
`109`	`109`	`xlabel="MP Formation Energy (eV/atom)",`
`110`	`110`	`ylabel="Our Formation Energy (eV/atom)",`
`111`	`111`	`)`
`112`		`-ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.png", dpi=300)`
	`112`	`+ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)`
Original file line number	Diff line number	Diff line change
`@@ -526,7 +526,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:`
`526`	`526`	`xlabel="legacy corrections (eV / atom)",`
`527`	`527`	`ylabel="MP2020 corrections (eV / atom)",`
`528`	`528`	`)`
`529`		`-# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")`
	`529`	`+# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.webp")`
`530`	`530`
`531`	`531`
`532`	`532`	`# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to`