Skip to content

Commit ad8349b

Browse files
committed
crop WBM data to formation energy cutoff of +/-5 eV/atom (drops 524 materials, 502 below -5, 22 above +5)
rename WBM material IDs from wbm-step-1-1 to wbm-1-1 rename 2022-10-19-wbm-{cses->computed-structure-entries}.json.bz2 expand data/wbm/readme.md, tweak matbench_discovery/energy.py doc strings
1 parent fbf6a02 commit ad8349b

12 files changed

+156
-82
lines changed

data/mp/build_phase_diagram.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151

5252
# %% build phase diagram with both MP entries + WBM entries
5353
df_wbm = pd.read_json(
54-
f"{ROOT}/data/wbm/2022-10-19-wbm-cses+init-structs.json.bz2"
54+
f"{ROOT}/data/wbm/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
5555
).set_index("material_id")
5656

5757
# using ComputedStructureEntry vs ComputedEntry here is important as CSEs receive

data/wbm/compare_cse_vs_ce_mp_2020_corrections.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"""
2424

2525

26-
cse_path = f"{ROOT}/data/wbm/2022-10-19-wbm-cses.json.bz2"
26+
cse_path = f"{ROOT}/data/wbm/2022-10-19-wbm-computed-structure-entries.json.bz2"
2727
df_cse = pd.read_json(cse_path).set_index("material_id")
2828

2929
cses = [
@@ -134,10 +134,10 @@
134134
ce_mp2020, ce_legacy = ces[idx].copy(), ces[idx].copy()
135135

136136

137-
with gzip.open(f"{ROOT}/tmp/cse-wbm-step-2-34803.json.zip", "w") as f:
137+
with gzip.open(f"{ROOT}/tmp/cse-wbm-2-34803.json.zip", "w") as f:
138138
f.write(cse_mp2020.to_json().encode("utf-8"))
139139

140-
with gzip.open(f"{ROOT}/tmp/cse-wbm-step-2-34803.json.zip") as f:
140+
with gzip.open(f"{ROOT}/tmp/cse-wbm-2-34803.json.zip") as f:
141141
cse = ComputedStructureEntry.from_dict(json.load(f))
142142

143143
cse_mp2020 = cse.copy()

data/wbm/fetch_process_wbm_dataset.py

+63-19
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@
137137

138138

139139
def increment_wbm_material_id(wbm_id: str) -> str:
140-
"""Maps step_1_0, step_1_1, ... onto wbm-step-1-1, wbm-step-1-2, ..."""
140+
"""Maps step_1_0, step_1_1, ... onto wbm-1-1, wbm-1-2, ..."""
141141
try:
142142
prefix, step_num, material_num = wbm_id.split("_")
143143
except ValueError:
@@ -149,13 +149,13 @@ def increment_wbm_material_id(wbm_id: str) -> str:
149149
assert step_num.isdigit(), msg
150150
assert material_num.isdigit(), msg
151151

152-
return f"wbm-step-{step_num}-{int(material_num) + 1}"
152+
return f"wbm-{step_num}-{int(material_num) + 1}"
153153

154154

155155
df_wbm.index = df_wbm.index.map(increment_wbm_material_id)
156156
df_wbm.index.name = "material_id"
157-
assert df_wbm.index[0] == "wbm-step-1-1"
158-
assert df_wbm.index[-1] == "wbm-step-5-23308"
157+
assert df_wbm.index[0] == "wbm-1-1"
158+
assert df_wbm.index[-1] == "wbm-5-23308"
159159

160160
df_wbm["initial_structure"] = df_wbm.pop("org")
161161
df_wbm["final_structure"] = df_wbm.pop("opt")
@@ -221,7 +221,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
221221
"GGA+U" if cse["parameters"]["is_hubbard"] else "GGA"
222222
)
223223
cse["entry_id"] = mat_id
224-
assert cse["entry_id"].startswith("wbm-step-")
224+
assert cse["entry_id"].startswith("wbm-")
225225

226226
assert pd.Series(
227227
cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
@@ -230,8 +230,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
230230

231231
# drop two materials with missing initial structures
232232
assert list(df_wbm.query("initial_structure.isna()").index) == [
233-
"wbm-step-5-23166",
234-
"wbm-step-5-23294",
233+
"wbm-5-23166",
234+
"wbm-5-23294",
235235
]
236236
df_wbm = df_wbm.dropna(subset=["initial_structure"])
237237

@@ -247,12 +247,12 @@ def increment_wbm_material_id(wbm_id: str) -> str:
247247
]
248248

249249
# all but 1 composition matches between CSE and final structure
250-
# mismatching ID: wbm-step-1-37977 which becomes equal on reduction:
250+
# mismatching ID: wbm-1-37977 which becomes equal on reduction:
251251
# CSE Comp: Ag4 Bi4 O12
252252
# final structure Comp: Ag16 Bi16 O48
253253
df_mismatch = df_wbm.query("composition_from_cse != composition_from_final_struct")
254254
assert len(df_mismatch) == 1
255-
assert df_mismatch.index[0] == "wbm-step-1-37977"
255+
assert df_mismatch.index[0] == "wbm-1-37977"
256256
assert (
257257
df_mismatch.iloc[0].composition_from_cse.reduced_composition
258258
== df_mismatch.iloc[0].composition_from_final_struct.reduced_composition
@@ -281,9 +281,12 @@ def increment_wbm_material_id(wbm_id: str) -> str:
281281
]
282282

283283
for fname, cols in (
284-
("cses", ["computed_structure_entry"]),
284+
("computed-structure-entries", ["computed_structure_entry"]),
285285
("init-structs", ["initial_structure"]),
286-
("cses+init-structs", ["initial_structure", "computed_structure_entry"]),
286+
(
287+
"computed-structure-entries+init-structs",
288+
["initial_structure", "computed_structure_entry"],
289+
),
287290
):
288291
cols = ["formula_from_cse", *cols]
289292
df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
@@ -310,7 +313,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
310313
f"{mat_cloud_url}&filename=summary.txt.bz2", sep="\t"
311314
).rename(columns=col_map)
312315

313-
# duplicate Ga3Ru2U3 step_3_28147 (1st one is wbm-step-2-18689) has 0 volume in
316+
# duplicate Ga3Ru2U3 step_3_28147 (1st one is wbm-2-18689) has 0 volume in
314317
# df_summary_bz2 vs 155.41 in df_summary
315318
query_str = "volume > 0 & formula != 'Ga3Ru2U3'"
316319
pd.testing.assert_frame_equal(
@@ -347,8 +350,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
347350

348351

349352
# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
350-
df_summary.at["wbm-step-2-18689", "uncorrected_energy"] = df_wbm.loc[
351-
"wbm-step-2-18689"
353+
df_summary.at["wbm-2-18689", "uncorrected_energy"] = df_wbm.loc[
354+
"wbm-2-18689"
352355
].computed_structure_entry["energy"]
353356

354357
# NOTE careful with ComputedEntries as object vs as dicts, the meaning of keys changes:
@@ -373,6 +376,47 @@ def increment_wbm_material_id(wbm_id: str) -> str:
373376
density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse)
374377

375378

379+
# %% remove suspicious formation energy outliers
380+
e_form_cutoff = 5
381+
n_too_stable = sum(df_summary.e_form_per_atom_wbm < -e_form_cutoff)
382+
print(f"{n_too_stable = }") # n_too_stable = 502
383+
n_too_unstable = sum(df_summary.e_form_per_atom_wbm > e_form_cutoff)
384+
print(f"{n_too_unstable = }") # n_too_unstable = 22
385+
386+
fig = df_summary.hist(x="e_form_per_atom_wbm", bins=100, backend="plotly", log_y=True)
387+
fig.add_vline(x=e_form_cutoff, line=dict(width=2, dash="dash", color="green"))
388+
fig.add_vline(x=-e_form_cutoff, line=dict(width=2, dash="dash", color="green"))
389+
fig.add_annotation(
390+
**dict(x=0, y=1, yref="paper", yshift=20, font_color="green"),
391+
text=f"<b>dataset cropped to within +/- {e_form_cutoff} eV/atom</b>",
392+
showarrow=False,
393+
)
394+
fig.update_layout(
395+
xaxis_title="WBM formation energy (eV/atom)", margin=dict(l=10, r=10, t=40, b=10)
396+
)
397+
398+
fig.write_image(
399+
f"{module_dir}/{today}-hist-e-form-per-atom.png", scale=5, width=800, height=300
400+
)
401+
402+
403+
# %%
404+
assert len(df_summary) == len(df_wbm) == 257_487
405+
406+
query_str = f"{-e_form_cutoff} < e_form_per_atom_wbm < {e_form_cutoff}"
407+
dropped_ids = sorted(set(df_summary.index) - set(df_summary.query(query_str).index))
408+
assert len(dropped_ids) == 502 + 22
409+
assert dropped_ids[:3] == "wbm-1-12142 wbm-1-12143 wbm-1-12144".split()
410+
assert dropped_ids[-3:] == "wbm-5-9121 wbm-5-9211 wbm-5-934".split()
411+
412+
df_summary = df_summary.query(query_str)
413+
df_wbm = df_wbm.loc[df_summary.index]
414+
415+
416+
# make sure we dropped the expected number 524 of materials
417+
assert len(df_summary) == len(df_wbm) == 257_487 - 502 - 22
418+
419+
376420
# %%
377421
# raw WBM ComputedStructureEntries have no energy corrections applied:
378422
assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse)
@@ -413,9 +457,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
413457
# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
414458

415459

416-
# %% Python crashes with segfault on correcting the energy of wbm-step-1-24459 due to
460+
# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to
417461
# https://github.com/spglib/spglib/issues/194 when using spglib v2.0.{0,1}
418-
cse = df_wbm.computed_structure_entry["wbm-step-1-24459"]
462+
cse = df_wbm.computed_structure_entry["wbm-1-24459"]
419463
cse = ComputedStructureEntry.from_dict(cse)
420464
mp_compat.process_entry(cse)
421465

@@ -432,7 +476,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
432476
assert e_above_hull_key not in df_summary
433477

434478
for entry in tqdm(df_wbm.cse):
435-
assert entry.entry_id.startswith("wbm-step-")
479+
assert entry.entry_id.startswith("wbm-")
436480

437481
e_per_atom = entry.uncorrected_energy_per_atom
438482
e_hull_per_atom = ppd_mp.get_hull_energy_per_atom(entry.composition)
@@ -491,7 +535,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
491535

492536
# %% read WBM dataset from disk
493537
df_wbm = pd.read_json(
494-
f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2"
538+
f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
495539
).set_index("material_id")
496540

497541
df_wbm["cse"] = [
@@ -524,7 +568,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
524568

525569
# %% make sure material IDs within each step are consecutive
526570
for step in range(1, 6):
527-
df = df_summary[df_summary.index.str.startswith(f"wbm-step-{step}-")]
571+
df = df_summary[df_summary.index.str.startswith(f"wbm-{step}-")]
528572
step_len = step_lens[step - 1]
529573
assert len(df) == step_len, f"{step=} has {len(df)=}, expected {step_len=}"
530574

data/wbm/readme.md

+41-23
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,57 @@
11
# WBM Dataset
22

3-
Source: [Predicting stable crystalline compounds using chemical similarity](https://nature.com/articles/s41524-020-00481-6) (2021)
3+
The **WBM dataset** was published in [Predicting stable crystalline compounds using chemical similarity][wbm paper] (npj Computational Materials, Jan 2021, [doi:10.1038/s41524-020-00481-6](https://doi.org/10.1038/s41524-020-00481-6)). The authors generated 257,487 structures through single-element substitutions on Materials Project (MP) source structures. The replacement element was chosen based on chemical similarity determined by a matrix data-mined from the [Inorganic Crystal Structure Database (ICSD)](https://icsd.products.fiz-karlsruhe.de).
44

5+
The resulting novel structures were relaxed using MP-compatible VASP inputs (i.e. using `pymatgen`'s `MPRelaxSet`) and identical POTCARs in an attempt to create a database of Materials Project compatible novel crystals. Any degradation in model performance from training to test set should therefore largely be a result of extrapolation error rather than covariate shift in the underlying data.
56

6-
## `wbm-summary.csv`
7+
The authors performed 5 rounds of elemental substitution in total, each time relaxing generated structures and adding those found to lie on the convex hull back to the source pool. In total, ~20k or close to 10% were found to lie on the Materials Project convex hull.
78

8-
Load with
9+
Since repeated substitutions should - on average - increase chemical dissimilarity, the 5 iterations of this data-generation process are a unique and compelling feature as they allow out-of-distribution testing. We can check how model performance degrades when asked to predict on structures increasingly more dissimilar from the training set (which is restricted to the MP 2022 database release (or earlier) for all models in this benchmark).
910

10-
```py
11-
df_wbm_summary = pd.read_csv("data/wbm/2022-10-19-wbm-summary.csv").set_index("material_id")
12-
```
11+
## Data processing steps
1312

14-
## Comprehensive Link Collection for WBM dataset
13+
The full set of processing steps used to curate the WBM test set from the raw data files (downloaded from the URLs listed below) can be found in [`data/wbm/fetch_process_wbm_dataset.py`](https://github.com/janosh/matbench-discovery/blob/site/data/wbm/fetch_process_wbm_dataset.py). Processing involved
14+
15+
- re-formatting material IDs
16+
- correctly aligning initial structures to DFT-relaxed `ComputedStructureEntries`
17+
- removing 6 pathological structures (with 0 volume)
18+
- removing formation energy outliers below -5 and above 5 eV/atom (removed 502 and 22 crystals respectively out of 257,487 total, including an anomaly of 500 structures at exactly -10 eV/atom)
19+
![WBM formation energy histogram indicating outlier cutoffs](2022-12-07-hist-e-form-per-atom.png)
20+
- applying the latest `MaterialsProject2020Compatibility` energy correction scheme to the formation energies
21+
- computing the energy to the convex hull constructed from all MP `ComputedStructureEntries` queried on 2022-09-16 (2020-09-08 database release)
22+
23+
Invoking that script with `python fetch_process_wbm_dataset.py` will auto-download and regenerate the WBM test set files from scratch. If you find anything questionable in the released test set or inconsistencies between the files on GitHub vs the output of that script, please [raise an issue](https://github.com/janosh/matbench-discovery/issues).
24+
25+
## Links to WBM data files
1526

1627
Links to WBM data files have proliferated. This is an attempt to keep track of all of them.
1728

1829
Initial structures were sent as Google Drive links via email by Hai-Chen Wang on 2021-09-01.
1930

20-
step 1: https://drive.google.com/file/d/1ZUgtYwrfZn_P8bULWRtTXepyAxHVxS5C
21-
step 2: https://drive.google.com/file/d/1-3uu2AcARJxH7GReteGVASZTuttFGiW_
22-
step 3: https://drive.google.com/file/d/1hc5BvDiFfTu_tc5F8m7ONSw2OgL9vN6o
23-
step 4: https://drive.google.com/file/d/1aMYxG5YJUgMHpbWmHpzL4hRfmP26UQqh
24-
step 5: https://drive.google.com/file/d/17kQt2r78ReWle4PhEIOXG7w7BFdezGM1
25-
summary: https://drive.google.com/file/d/1639IFUG7poaDE2uB6aISUOi65ooBwCIg
31+
step 1: <https://drive.google.com/file/d/1ZUgtYwrfZn_P8bULWRtTXepyAxHVxS5C>
32+
step 2: <https://drive.google.com/file/d/1-3uu2AcARJxH7GReteGVASZTuttFGiW_>
33+
step 3: <https://drive.google.com/file/d/1hc5BvDiFfTu_tc5F8m7ONSw2OgL9vN6o>
34+
step 4: <https://drive.google.com/file/d/1aMYxG5YJUgMHpbWmHpzL4hRfmP26UQqh>
35+
step 5: <https://drive.google.com/file/d/17kQt2r78ReWle4PhEIOXG7w7BFdezGM1>
36+
summary: <https://drive.google.com/file/d/1639IFUG7poaDE2uB6aISUOi65ooBwCIg>
2637

27-
The `ComputedStructureEntries` for steps 1-3 were also linked from the Nature paper:
38+
The `ComputedStructureEntries` for steps 1-3 were also linked from the [WBM Nature paper][wbm paper]:
2839

29-
Index page: https://tddft.org/bmg/data.php
30-
step 1 CSEs: https://tddft.org/bmg/files/data/substitutions_000.json.bz2
31-
step 2 CSEs: https://tddft.org/bmg/files/data/substitutions_001.json.bz2
32-
step 3 CSEs: https://tddft.org/bmg/files/data/substitutions_002.json.bz2
33-
CIF files: https://tddft.org/bmg/files/data/similarity-cifs.tar.gz
40+
Index page: <https://tddft.org/bmg/data.php>
41+
step 1 CSEs: <https://tddft.org/bmg/files/data/substitutions_000.json.bz2>
42+
step 2 CSEs: <https://tddft.org/bmg/files/data/substitutions_001.json.bz2>
43+
step 3 CSEs: <https://tddft.org/bmg/files/data/substitutions_002.json.bz2>
44+
CIF files: <https://tddft.org/bmg/files/data/similarity-cifs.tar.gz>
3445

35-
Materials Cloud archive: https://archive.materialscloud.org/record/2021.68
46+
Materials Cloud archive: <https://archive.materialscloud.org/record/2021.68>
3647
File URLs:
37-
readme: https://archive.materialscloud.org/record/file?record_id=840&filename=README.txt
38-
summary: https://archive.materialscloud.org/record/file?record_id=840&filename=summary.txt.bz2
39-
step 1: https://archive.materialscloud.org/record/file?record_id=840&filename=step_1.json.bz2 etc.
48+
49+
- readme: <https://archive.materialscloud.org/record/file?record_id=840&filename=README.txt>
50+
- summary: <https://archive.materialscloud.org/record/file?record_id=840&filename=summary.txt.bz2>
51+
- step 1: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_1.json.bz2>
52+
- step 2: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_2.json.bz2>
53+
- step 3: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_3.json.bz2>
54+
- step 4: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_4.json.bz2>
55+
- step 5: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_5.json.bz2>
56+
57+
[wbm paper]: https://nature.com/articles/s41524-020-00481-6

matbench_discovery/data.py

+13-14
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010
from tqdm import tqdm
1111

1212
DATA_FILENAMES = {
13-
"wbm-summary": "wbm/2022-10-19-wbm-summary.csv",
14-
"wbm-initial-structures": "wbm/2022-10-19-wbm-init-structs.json.bz2",
15-
"wbm-computed-structure-entries": "wbm/2022-10-19-wbm-cses.json.bz2",
16-
"mp-energies": "mp/2022-08-13-mp-energies.json.gz",
1713
"mp-computed-structure-entries": "mp/2022-09-16-mp-computed-structure-entries.json.gz",
18-
"mp-patched-phase-diagram": "mp/2022-09-18-ppd-mp.pkl.gz",
1914
"mp-elemental-ref-energies": "mp/2022-09-19-mp-elemental-ref-energies.json",
15+
"mp-energies": "mp/2022-08-13-mp-energies.json.gz",
16+
"mp-patched-phase-diagram": "mp/2022-09-18-ppd-mp.pkl.gz",
17+
"wbm-computed-structure-entries": "wbm/2022-10-19-wbm-computed-structure-entries.json.bz2",
18+
"wbm-initial-structures": "wbm/2022-10-19-wbm-init-structs.json.bz2",
19+
"wbm-summary": "wbm/2022-10-19-wbm-summary.csv",
2020
}
2121

2222
RAW_REPO_URL = "https://raw.githubusercontent.com/janosh/matbench-discovery"
@@ -45,20 +45,19 @@ def load_train_test(
4545
cache_dir: str | None = default_cache_dir,
4646
hydrate: bool = False,
4747
) -> pd.DataFrame | dict[str, pd.DataFrame]:
48-
"""Download the MP training data and WBM test data in parts or in full as pandas
48+
"""Download parts of or the full MP training data and WBM test data as pandas
4949
DataFrames. The full training and test sets are each about ~500 MB as compressed
50-
JSON will be cached locally for faster re-loading unless cache_dir is set to None.
50+
JSON which will be cached locally to cache_dir for faster re-loading unless
51+
cache_dir is set to None.
5152
52-
Hint: Import DATA_FILES from the same module as this function and
53-
print(list(DATA_FILES)) to see permissible data names.
53+
Recognized data keys are mp-computed-structure-entries, mp-elemental-ref-energies,
54+
mp-energies, mp-patched-phase-diagram, wbm-computed-structure-entries,
55+
wbm-initial-structures, wbm-summary. See
56+
https://matbench-discovery.janosh.dev/how-to-use for brief data descriptions.
5457
5558
Args:
5659
parts (str | list[str], optional): Which parts of the MP/WBM dataset to load.
57-
Can be any subset of list(DATA_FILES). Defaults to ["summary"], a dataframe
58-
with columns for material properties like VASP energy, formation energy,
59-
energy above the convex hull (3 columns with old, new and no Materials
60-
Project energy corrections applied for each), volume, band gap, number of
61-
sites per unit cell, and more.
60+
Can be any subset of the above data names. Defaults to ["summary"].
6261
version (int, optional): Which version of the dataset to load. Defaults to 1
6362
(currently the only available option).
6463
cache_dir (str, optional): Where to cache data files on local drive. Defaults to

0 commit comments

Comments
 (0)