Skip to content

Commit 705e0d0

Browse files
committed
add SI discussion on 'WBM Batch Robustness as a Measure of Extrapolation Prowess'
remove date from fig export paths; clean up plotting code (e.g. convert EACH hist to area plot); add plausible analytics to matbench-discovery.janosh.dev
1 parent 5dae951 commit 705e0d0

40 files changed

+975
-556
lines changed

data/wbm/analysis.py

+21-28
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# %%
22
import os
33

4+
import numpy as np
45
import pandas as pd
56
from pymatviz import count_elements, ptable_heatmap_plotly
67
from pymatviz.utils import save_fig
@@ -16,13 +17,13 @@
1617

1718
module_dir = os.path.dirname(__file__)
1819
print(f"{pio.templates.default=}")
20+
about_data_page = f"{ROOT}/site/src/routes/about-the-test-set"
1921

2022

2123
# %%
2224
wbm_elem_counts = count_elements(df_wbm.formula).astype(int)
2325

24-
out_elem_counts = f"{ROOT}/site/src/routes/about-the-test-set/wbm-element-counts.json"
25-
# wbm_elem_counts.to_json(out_elem_counts)
26+
# wbm_elem_counts.to_json(f"{about_data_page}/wbm-element-counts.json")
2627

2728

2829
# %%
@@ -46,17 +47,14 @@
4647

4748
# %%
4849
wbm_fig.write_image(f"{module_dir}/figs/wbm-elements.svg", width=1000, height=500)
49-
save_fig(wbm_fig, f"{FIGS}/{today}-wbm-elements.svelte")
50+
# save_fig(wbm_fig, f"{FIGS}/wbm-elements.svelte")
5051

5152

5253
# %% load MP training set
5354
df = pd.read_json(f"{module_dir}/../mp/2022-08-13-mp-energies.json.gz")
5455
mp_elem_counts = count_elements(df.formula_pretty).astype(int)
5556

56-
# mp_elem_counts.to_json(
57-
# f"{ROOT}/site/src/routes/about-the-test-set/{today}-mp-element-counts.json"
58-
# )
59-
mp_elem_counts.describe()
57+
# mp_elem_counts.to_json(f"{about_data_page}/mp-element-counts.json")
6058

6159

6260
# %%
@@ -80,20 +78,20 @@
8078

8179
# %%
8280
# export under an undated file name, matching the WBM heatmap export above and the
# ./figs/mp-elements.svg path the readme links to
mp_fig.write_image(f"{module_dir}/figs/mp-elements.svg", width=1000, height=500)
83-
# save_fig(mp_fig, f"{FIGS}/{today}-mp-elements.svelte")
81+
# save_fig(mp_fig, f"{FIGS}/mp-elements.svelte")
8482

8583

8684
# %% histogram of energy above MP convex hull for WBM
8785
col = "e_above_hull_mp2020_corrected_ppd_mp"
8886
# col = "e_form_per_atom_mp2020_corrected"
8987
mean, std = df_wbm[col].mean(), df_wbm[col].std()
9088

91-
fig = df_wbm[col].hist(
92-
bins=100,
93-
backend="plotly",
94-
range_x=[mean - 2 * std, mean + 2 * std],
95-
template="plotly_dark",
96-
)
89+
range_x = (mean - 2 * std, mean + 2 * std)
90+
counts, bins = np.histogram(df_wbm[col], bins=150, range=range_x)
91+
x_label = "WBM energy above MP convex hull (eV/atom)"
92+
df_hist = pd.DataFrame([counts, bins], index=["count", x_label]).T
93+
94+
fig = df_hist.plot.area(x=x_label, y="count", backend="plotly", range_x=range_x)
9795

9896
if col.startswith("e_above_hull"):
9997
n_stable = sum(df_wbm[col] <= 0)
@@ -108,25 +106,20 @@
108106
)
109107
fig.update_layout(title=dict(text=title, x=0.5, y=0.95))
110108

111-
fig.update_layout(showlegend=False, paper_bgcolor="rgba(0,0,0,0)")
112-
fig.update_xaxes(title="WBM energy above MP convex hull (eV/atom)")
109+
fig.update_layout(showlegend=False)
113110

114-
for x_pos, label in zip(
115-
[mean, mean + std, mean - std],
116-
[f"{mean = :.2f}", f"{mean + std = :.2f}", f"{mean - std = :.2f}"],
111+
for x_pos, label in (
112+
(mean, f"{mean = :.2f}"),
113+
(mean - std, f"{mean - std = :.2f}"),
114+
(mean + std, f"{mean + std = :.2f}"),
117115
):
118-
anno = dict(text=label, yshift=-10, xshift=5)
116+
anno = dict(text=label, yshift=-10, xshift=-5, xanchor="right")
119117
fig.add_vline(x=x_pos, line=dict(width=1, dash="dash"), annotation=anno)
120118

121119
fig.show()
122120

123-
124-
# subsample x
125-
for trace in fig.data:
126-
trace.x = trace.x[::8]
127-
128-
save_fig(fig, f"{FIGS}/{today}-wbm-each-hist.svelte")
129-
save_fig(fig, f"./figs/{today}-wbm-each-hist.svg", width=1000, height=500)
121+
save_fig(fig, f"{FIGS}/wbm-each-hist.svelte")
122+
save_fig(fig, "./figs/wbm-each-hist.svg", width=1000, height=500)
130123

131124

132125
# %%
@@ -158,4 +151,4 @@
158151

159152
fig.show()
160153

161-
save_fig(fig, f"{FIGS}/{today}-mp-elemental-ref-energies.svelte")
154+
save_fig(fig, f"{FIGS}/mp-elemental-ref-energies.svelte")
File renamed without changes.

data/wbm/figs/wbm-each-hist.svg

+1
Loading
File renamed without changes.

data/wbm/readme.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ The full set of processing steps used to curate the WBM test set from the raw da
2525

2626
<caption>WBM formation energy distribution. 524 materials outside green dashed lines were discarded.<br />(zoom out on this plot to see discarded samples)</caption>
2727
<slot name="hist-e-form-per-atom">
28-
<img src="./figs/wbm-e-form-per-atom.svg" alt="WBM formation energy histogram indicating outlier cutoffs">
28+
<img src="./figs/hist-wbm-e-form-per-atom.svg" alt="WBM formation energy histogram indicating outlier cutoffs">
2929
</slot>
3030

3131
- apply the [`MaterialsProject2020Compatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProject2020Compatibility) energy correction scheme to the formation energies
@@ -67,8 +67,8 @@ The [paper itself][wbm paper] links to a [Halle University data page](https://td
6767

6868
materialscloud:2021.68 includes a readme file with a description of the dataset, meanings of the summary CSV columns and a Python script for loading the data.
6969

70-
| [Materials Cloud archive](https://archive.materialscloud.org/record/2021.68) | [step 1](https://archive.materialscloud.org/record/file?record_id=840&filename=step_1.json.bz2) | [step 2](https://archive.materialscloud.org/record/file?record_id=840&filename=step_2.json.bz2) | [step 3](https://archive.materialscloud.org/record/file?record_id=840&filename=step_3.json.bz2) | [step 4](https://archive.materialscloud.org/record/file?record_id=840&filename=step_4.json.bz2) | [step 5](https://archive.materialscloud.org/record/file?record_id=840&filename=step_5.json.bz2) | [summary](https://archive.materialscloud.org/record/file?record_id=840&filename=summary.txt.bz2) | [readme](https://archive.materialscloud.org/record/file?record_id=840&filename=README.txt) |
71-
| ---------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------ |
70+
| [Materials Cloud](https://archive.materialscloud.org/record/2021.68) | [step 1](https://archive.materialscloud.org/record/file?record_id=840&filename=step_1.json.bz2) | [step 2](https://archive.materialscloud.org/record/file?record_id=840&filename=step_2.json.bz2) | [step 3](https://archive.materialscloud.org/record/file?record_id=840&filename=step_3.json.bz2) | [step 4](https://archive.materialscloud.org/record/file?record_id=840&filename=step_4.json.bz2) | [step 5](https://archive.materialscloud.org/record/file?record_id=840&filename=step_5.json.bz2) | [summary](https://archive.materialscloud.org/record/file?record_id=840&filename=summary.txt.bz2) | [readme](https://archive.materialscloud.org/record/file?record_id=840&filename=README.txt) |
71+
| -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------ |
7272

7373
[wbm paper]: https://nature.com/articles/s41524-020-00481-6
7474

@@ -79,13 +79,13 @@ The WBM test set and even more so the MP training set are heavily oxide dominate
7979
Element counts for WBM test set consisting of 256,963 WBM `ComputedStructureEntries`
8080

8181
<slot name="wbm-elements-heatmap">
82-
<img src="./figs/2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
82+
<img src="./figs/wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
8383
</slot>
8484

8585
Element counts for MP training set consisting of 146,323 `ComputedStructureEntries`
8686

8787
<slot name="mp-elements-heatmap">
88-
<img src="./figs/2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
88+
<img src="./figs/mp-elements.svg" alt="Periodic table log heatmap of MP elements">
8989
</slot>
9090

9191
## 🎯 &thinsp; Target Distribution

matbench_discovery/__init__.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,6 @@
2525
# load docs, repo, package URLs from package.json
2626
with open(f"{ROOT}/site/package.json") as file:
2727
pkg = json.load(file)
28-
pypi_keys_to_npm = dict(Docs="homepage", Repo="repository", Package="package")
29-
URLs = {key: pkg[val] for key, val in pypi_keys_to_npm.items()}
28+
29+
pypi_keys_to_npm = dict(Docs="homepage", Repo="repository", Package="package")
30+
URLs = {key: pkg[val] for key, val in pypi_keys_to_npm.items()}

matbench_discovery/data.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,8 @@ def load_df_wbm_preds(
206206

207207
dfs: dict[str, pd.DataFrame] = {}
208208

209-
for model_name in (bar := tqdm(models, disable=not pbar)):
210-
bar.set_description(model_name)
209+
for model_name in (bar := tqdm(models, disable=not pbar, desc="Loading preds")):
210+
bar.set_postfix_str(model_name)
211211
pattern = f"models/{PRED_FILENAMES[model_name]}"
212212
df = glob_to_df(pattern, pbar=False, **kwargs).set_index(id_col)
213213
dfs[model_name] = df

0 commit comments

Comments
 (0)