Skip to content

Commit 9fed210

Browse files
committed
add 2023-01-25-rolling-mae-vs-hull-dist-models.svelte
fix color of SEM shaded area and rolling window annotation in rolling_mae_vs_hull_dist() for backend=plotly; add site/src/figs/2023-01-26-wbm-each-hist.svelte to wbm/readme.md; implement rolling_acc for backend=plotly branch of hist_classified_stable_vs_hull_dist()
1 parent 2f795f7 commit 9fed210

26 files changed

+344
-155
lines changed

data/mp/build_phase_diagram.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -109,4 +109,4 @@
109109
xlabel="MP Formation Energy (eV/atom)",
110110
ylabel="Our Formation Energy (eV/atom)",
111111
)
112-
ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.png", dpi=300)
112+
ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)

data/mp/get_mp_energies.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
annotate_mae_r2(df.formation_energy_per_atom, df.decomposition_enthalpy)
8181
# result on 2023-01-10: plots match. no correlation between formation energy and decomposition
8282
# enthalpy. R^2 = -1.571, MAE = 1.604
83-
# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.png", dpi=300)
83+
# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.webp", dpi=300)
8484

8585

8686
# %% scatter plot energy above convex hull vs decomposition enthalpy
@@ -99,4 +99,4 @@
9999
title=f"{n_above_line:,} / {len(df):,} = {n_above_line/len(df):.1%} "
100100
"MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
101101
)
102-
# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.png", dpi=300)
102+
# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.webp", dpi=300)

data/wbm/analysis.py

+58-2
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77

88
from matbench_discovery import FIGS, today
99
from matbench_discovery.data import df_wbm
10-
11-
module_dir = os.path.dirname(__file__)
10+
from matbench_discovery.plots import pio
1211

1312
"""
1413
Compare MP and WBM elemental prevalence. Starting with WBM, MP below.
1514
"""
1615

16+
module_dir = os.path.dirname(__file__)
17+
print(f"{pio.templates.default=}")
18+
1719

1820
# %%
1921
wbm_elem_counts = count_elements(df_wbm.formula).astype(int)
@@ -81,3 +83,57 @@
8183
# %%
8284
mp_fig.write_image(f"{module_dir}/figs/{today}-mp-elements.svg", width=1000, height=500)
8385
# save_fig(mp_fig, f"{FIGS}/{today}-mp-elements.svelte")
86+
87+
88+
# %% histogram of energy above MP convex hull for WBM
89+
col = "e_above_hull_mp2020_corrected_ppd_mp"
90+
# col = "e_form_per_atom_mp2020_corrected"
91+
mean, std = df_wbm[col].mean(), df_wbm[col].std()
92+
93+
fig = df_wbm[col].hist(
94+
bins=100,
95+
backend="plotly",
96+
range_x=[mean - 2 * std, mean + 2 * std],
97+
template="plotly_dark",
98+
)
99+
100+
if col.startswith("e_above_hull"):
101+
n_stable = sum(df_wbm[col] <= 0)
102+
n_unstable = sum(df_wbm[col] > 0)
103+
assert n_stable + n_unstable == len(df_wbm.dropna())
104+
105+
dummy_mae = (df_wbm[col] - df_wbm[col].mean()).abs().mean()
106+
107+
title = (
108+
f"n={len(df_wbm.dropna()):,} with {n_stable:,} stable + {n_unstable:,} "
109+
f"unstable, dummy MAE={dummy_mae:.2f}"
110+
)
111+
fig.update_layout(title=dict(text=title, x=0.5, y=0.95))
112+
113+
fig.update_layout(showlegend=False, paper_bgcolor="rgba(0,0,0,0)")
114+
fig.update_xaxes(title_text="WBM energy above MP convex hull (eV/atom)")
115+
116+
117+
for x_pos, label in zip(
118+
[mean, mean + std, mean - std],
119+
[f"{mean = :.2f}", f"{mean + std = :.2f}", f"{mean - std = :.2f}"],
120+
):
121+
fig.add_vline(x=x_pos, line=dict(width=1, dash="dash"))
122+
fig.add_annotation(
123+
x=x_pos,
124+
y=0.95,
125+
text=label,
126+
showarrow=False,
127+
yref="paper",
128+
xanchor="left",
129+
xshift=5,
130+
)
131+
fig.show()
132+
133+
134+
# subsample x
135+
for trace in fig.data:
136+
trace.x = trace.x[::8]
137+
138+
save_fig(fig, f"{FIGS}/{today}-wbm-each-hist.svelte")
139+
save_fig(fig, f"./figs/{today}-wbm-each-hist.svg", width=1000, height=500)

data/wbm/fetch_process_wbm_dataset.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
526526
xlabel="legacy corrections (eV / atom)",
527527
ylabel="MP2020 corrections (eV / atom)",
528528
)
529-
# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
529+
# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.webp")
530530

531531

532532
# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to

data/wbm/readme.md

+19-7
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ The number of materials in each step before and after processing are:
4545
| before | 61,848 | 52,800 | 79,205 | 40,328 | 23,308 | 257,487 |
4646
| after | 61,466 | 52,755 | 79,160 | 40,314 | 23,268 | 256,963 |
4747

48-
## 🔗 &thinsp; Links to raw WBM Data Files
48+
## 🔗 &thinsp; Links to WBM Files
4949

50-
Links to WBM data files have proliferated. This is an attempt to keep track of them.
50+
Links to raw WBM data files have proliferated. This is an attempt to keep track of them.
5151

5252
Initial structures (after element substitution but before DFT relaxation) were sent as Google Drive links via email by Hai-Chen Wang on 2021-09-01.
5353

@@ -72,18 +72,30 @@ materialscloud:2021.68 includes a readme file with a description of the dataset,
7272

7373
[wbm paper]: https://nature.com/articles/s41524-020-00481-6
7474

75-
## 📊 &thinsp; Chemical Diversity
75+
## 🧪 &thinsp; Chemical Diversity
7676

77-
Both the WBM test set and even more so the MP training set are heavily oxide dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
77+
The WBM test set and even more so the MP training set are heavily oxide dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
78+
79+
Element counts for the WBM test set consisting of 256,963 `ComputedStructureEntries`
7880

7981
<slot name="wbm-elements-heatmap">
8082
<img src="./figs/2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
8183
</slot>
82-
<caption>Element counts for test set consisting of 256,963 WBM <code>ComputedStructureEntries</code></caption>
8384

84-
By comparison, the training set of MP ComputedStructureEntries has this element distribution.
85+
Element counts for MP training set consisting of 146,323 `ComputedStructureEntries`
8586

8687
<slot name="mp-elements-heatmap">
8788
<img src="./figs/2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
8889
</slot>
89-
<caption>Element counts for training set consisting of 146,323 MP <code>ComputedStructureEntries</code></caption>
90+
91+
## 🎯 &thinsp; Target Distribution
92+
93+
The WBM test set's distribution of energies above the MP convex hull has a mean of **0.02 eV/atom** and a standard deviation of **0.25 eV/atom**.
94+
95+
The dummy MAE of always predicting the test set mean is **0.17 eV/atom**.
96+
97+
The number of stable materials is **97k** out of 257k, resulting in a dummy stability hit rate of **37%**.
98+
99+
<slot name="wbm-each-hist">
100+
<img src="./figs/2023-01-26-wbm-each-hist.svg" alt="WBM energy above MP convex hull distribution">
101+
</slot>

matbench_discovery/__init__.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
import sys
66
from datetime import datetime
77

8-
ROOT = os.path.dirname(os.path.dirname(__file__)) # repository root
8+
ROOT = os.path.dirname(os.path.dirname(__file__)) # repo root
99
FIGS = f"{ROOT}/site/src/figs" # directory to store interactive figures
10-
STATIC = f"{ROOT}/site/static/figs" # directory to store static figures
10+
STATIC = f"{ROOT}/site/static/figs" # directory to store static figures, is symlinked
11+
# into site/src/routes/paper/figs dir
1112
MODELS = f"{ROOT}/site/src/routes/models" # directory to write model analysis
1213
# whether a currently running slurm job is in debug mode
1314
DEBUG = "DEBUG" in os.environ or (

matbench_discovery/data.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def glob_to_df(
181181
return pd.concat(sub_dfs.values())
182182

183183

184-
def load_df_wbm_with_preds(
184+
def load_df_wbm_preds(
185185
models: Sequence[str],
186186
pbar: bool = True,
187187
id_col: str = "material_id",

0 commit comments

Comments
 (0)