Skip to content

Commit 827c543

Browse files
committed
breaking add e_form_per_atom cutoff of +/- 5 eV/atom to WBM test set in fetch_process_wbm_dataset.py
also fix all failing asserts while rerunning fetch_process_wbm_dataset.py and update wbm-summary.csv + .json.bz2 structure + CSE files add WANDB_PATH in root __init__.py update readme.md and wbm/readme.md
1 parent ad8349b commit 827c543

12 files changed

+161
-111
lines changed

data/wbm/fetch_process_wbm_dataset.py

+131-91
Original file line numberDiff line numberDiff line change
@@ -115,14 +115,14 @@
115115
# re-index after dropping bad structures to get same indices as summary file
116116
# where IDs are consecutive, i.e. step_3_70801 is followed by step_3_70802,
117117
# not step_3_70804, etc.
118-
df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]
118+
# df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]
119119

120120
step_len = step_lens[step - 1]
121121
assert len(df) == step_len, f"bad len for {step=}: {len(df)} != {step_len}"
122122
dfs_wbm_structs[step] = df
123123

124124

125-
# NOTE step 5 is missing 2 initial structures
125+
# NOTE step 5 is missing 2 initial structures, see nan_init_structs_ids below
126126
assert dict(dfs_wbm_structs[5].isna().sum()) == {"opt": 0, "org": 2}
127127
assert list(dfs_wbm_structs[5].query("org.isna()").index) == [
128128
"step_5_23165",
@@ -227,13 +227,11 @@ def increment_wbm_material_id(wbm_id: str) -> str:
227227
cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
228228
).value_counts().to_dict() == {"GGA": 248481, "GGA+U": 9008}
229229

230-
231-
# drop two materials with missing initial structures
232-
assert list(df_wbm.query("initial_structure.isna()").index) == [
233-
"wbm-5-23166",
234-
"wbm-5-23294",
235-
]
236-
df_wbm = df_wbm.dropna(subset=["initial_structure"])
230+
# make sure only 2 materials have missing initial structures with expected IDs
231+
nan_init_structs_ids = ["wbm-5-23166", "wbm-5-23294"]
232+
assert list(df_wbm.query("initial_structure.isna()").index) == nan_init_structs_ids
233+
# drop the two materials with missing initial structures
234+
df_wbm = df_wbm.drop(index=nan_init_structs_ids)
237235

238236

239237
# %% get composition from CSEs
@@ -275,22 +273,12 @@ def increment_wbm_material_id(wbm_id: str) -> str:
275273
), f"composition mismatch for {row.Index=}"
276274

277275

278-
# %%
276+
# %% extract alphabetical formula from CSEs (will be used as ground-truth formulas since
277+
# more informative than reduced formulas found in df_summary)
279278
df_wbm["formula_from_cse"] = [
280279
x.alphabetical_formula for x in df_wbm.pop("composition_from_cse")
281280
]
282281

283-
for fname, cols in (
284-
("computed-structure-entries", ["computed_structure_entry"]),
285-
("init-structs", ["initial_structure"]),
286-
(
287-
"computed-structure-entries+init-structs",
288-
["initial_structure", "computed_structure_entry"],
289-
),
290-
):
291-
cols = ["formula_from_cse", *cols]
292-
df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
293-
294282

295283
# %%
296284
col_map = {
@@ -322,31 +310,89 @@ def increment_wbm_material_id(wbm_id: str) -> str:
322310
)
323311

324312

313+
assert sum(df_summary.index == "None") == 6
314+
# the 'None' materials have 0 volume, energy, n_sites, bandgap, etc.
315+
assert all(df_summary[df_summary.index == "None"].drop(columns=["formula"]) == 0)
316+
assert len(df_summary.query("volume > 0")) == len(df_wbm) + len(nan_init_structs_ids)
325317
# make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
326318
# listed in bad_struct_ids above
327-
assert len(df_summary.query("volume > 0")) == len(df_wbm)
328319
assert all(
329320
df_summary.reset_index().query("volume == 0").index.values - sum(step_lens[:2])
330321
== bad_struct_ids
331322
)
332-
df_summary = df_summary.query("volume > 0")
333-
df_summary.index = df_summary.index.map(increment_wbm_material_id)
323+
324+
df_summary.index = df_summary.index.map(increment_wbm_material_id) # format IDs
325+
# drop materials with id='None' and missing initial structures
326+
df_summary = df_summary.drop(index=nan_init_structs_ids + ["None"])
327+
328+
# the 8403 material IDs in step 3 with final number larger than any of the ones in
329+
# bad_struct_ids are now misaligned between df_summary and df_wbm
330+
# the IDs in df_summary are consecutive while the IDs in df_wbm skip over the numbers in
331+
# bad_struct_ids. we fix this with fix_bad_struct_index_mismatch() by mapping the IDs in
332+
# df_wbm to the ones in df_summary so that both indices become consecutive.
333+
assert sum(df_summary.index != df_wbm.index) == 8403
334+
335+
336+
def fix_bad_struct_index_mismatch(material_id: str) -> str:
337+
"""Decrement material IDs in step 3 by the number of IDs with smaller final number
338+
in bad_struct_ids. This should fix the index mismatch between df_summary and df_wbm.
339+
"""
340+
_, step_num, mat_num = material_id.split("-")
341+
step_num, mat_num = int(step_num), int(mat_num)
342+
343+
if step_num == 3:
344+
mat_num -= sum(mat_num > idx + 1 for idx in bad_struct_ids)
345+
346+
return f"wbm-{step_num}-{mat_num}"
347+
348+
349+
# don't accidentally apply the fix twice
350+
if sum(df_summary.index != df_wbm.index) != 0:
351+
df_wbm.index = df_wbm.index.map(fix_bad_struct_index_mismatch)
352+
353+
# check that the index mismatch is fixed
334354
assert sum(df_summary.index != df_wbm.index) == 0
335355

356+
# update ComputedStructureEntry entry_ids to match material_ids
357+
for mat_id, cse in df_wbm.computed_structure_entry.items():
358+
entry_id = cse["entry_id"]
359+
if mat_id != entry_id:
360+
print(f"{mat_id=} != {entry_id=}")
361+
cse["entry_id"] = mat_id
362+
363+
336364
# sort formulas alphabetically
337365
df_summary["alph_formula"] = [
338366
Composition(x).alphabetical_formula for x in df_summary.formula
339367
]
340-
assert sum(df_summary.alph_formula != df_summary.formula) == 219_215
341-
assert df_summary.alph_formula[3] == "Ag2 Au1 Hg1"
342-
assert df_summary.formula[3] == "Ag2 Hg1 Au1"
368+
# alphabetical formula and original formula differ due to spaces, number 1 after element
369+
# symbols (FeO vs Fe1 O1), and element order (FeO vs OFe)
370+
assert sum(df_summary.alph_formula != df_summary.formula) == 257_483
343371

344372
df_summary["formula"] = df_summary.pop("alph_formula")
345373

346374

375+
# %% write initial structures and computed structure entries to compressed json
376+
for fname, cols in (
377+
("computed-structure-entries", ["computed_structure_entry"]),
378+
("init-structs", ["initial_structure"]),
379+
(
380+
"computed-structure-entries+init-structs",
381+
["initial_structure", "computed_structure_entry"],
382+
),
383+
):
384+
cols = ["formula_from_cse", *cols]
385+
df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
386+
387+
347388
# %%
348-
# check summary and CSE formulas agree
349-
assert all(df_summary["formula"] == df_wbm.formula_from_cse)
389+
# df_summary and df_wbm formulas differ because summary formulas are reduced while
390+
# df_wbm formulas are not (e.g. Ac6 U2 vs Ac3 U1 in summary). unreduced is more
391+
# informative so we use it.
392+
assert sum(df_summary.formula != df_wbm.formula_from_cse) == 114_273
393+
assert sum(df_summary.formula == df_wbm.formula_from_cse) == 143_214
394+
395+
df_summary.formula = df_wbm.formula_from_cse
350396

351397

352398
# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
@@ -418,34 +464,37 @@ def increment_wbm_material_id(wbm_id: str) -> str:
418464

419465

420466
# %%
467+
for mat_id, cse in df_wbm.computed_structure_entry.items():
468+
assert mat_id == cse["entry_id"], f"{mat_id} != {cse['entry_id']}"
469+
470+
df_wbm["cse"] = [
471+
ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
472+
]
421473
# raw WBM ComputedStructureEntries have no energy corrections applied:
422474
assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse)
423475
# summary and CSE n_sites match
424476
assert all(df_summary.n_sites == [len(cse.structure) for cse in df_wbm.cse])
425477

478+
for mp_compat in [MPLegacyCompat(), MP2020Compat()]:
479+
compat_out = mp_compat.process_entries(df_wbm.cse, clean=True, verbose=True)
480+
assert len(compat_out) == len(df_wbm) == len(df_summary)
426481

427-
mp_compat = MP2020Compat() if False else MPLegacyCompat()
428-
compat_out = mp_compat.process_entries(df_wbm.cse, clean=True, verbose=True)
429-
430-
mp_compat.process_entry(cse)
431-
assert len(compat_out) == len(df_wbm) == len(df_summary)
432-
433-
n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse)
434-
if isinstance(mp_compat, MPLegacyCompat):
435-
assert n_corrected == 39595, f"{n_corrected=}"
436-
if isinstance(mp_compat, MP2020Compat):
437-
assert n_corrected == 100931, f"{n_corrected=}"
482+
n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse)
483+
if isinstance(mp_compat, MPLegacyCompat):
484+
assert n_corrected == 39591, f"{n_corrected=}"
485+
if isinstance(mp_compat, MP2020Compat):
486+
assert n_corrected == 100930, f"{n_corrected=}"
438487

439-
corr_label = "mp2020" if isinstance(mp_compat, MP2020Compat) else "legacy"
440-
df_summary[f"e_correction_per_atom_{corr_label}"] = [
441-
cse.correction_per_atom for cse in df_wbm.cse
442-
]
488+
corr_label = "mp2020" if isinstance(mp_compat, MP2020Compat) else "legacy"
489+
df_summary[f"e_correction_per_atom_{corr_label}"] = [
490+
cse.correction_per_atom for cse in df_wbm.cse
491+
]
443492

444-
assert df_summary.e_correction_per_atom_mp2020.mean().round(4) == -0.1067
445-
assert df_summary.e_correction_per_atom_legacy.mean().round(4) == -0.0643
493+
assert df_summary.e_correction_per_atom_mp2020.mean().round(4) == -0.1069
494+
assert df_summary.e_correction_per_atom_legacy.mean().round(4) == -0.0645
446495
assert (df_summary.filter(like="correction").abs() > 1e-4).sum().to_dict() == {
447-
"e_correction_per_atom_mp2020": 100931,
448-
"e_correction_per_atom_legacy": 39595,
496+
"e_correction_per_atom_mp2020": 100930,
497+
"e_correction_per_atom_legacy": 39591,
449498
}, "unexpected number of materials received non-zero corrections"
450499

451500
ax = density_scatter(
@@ -458,7 +507,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
458507

459508

460509
# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to
461-
# https://github.com/spglib/spglib/issues/194 when using spglib v2.0.{0,1}
510+
# https://github.com/spglib/spglib/issues/194 when using spglib versions 2.0.0 or 2.0.1
511+
# left here as a reminder and for future users in case they encounter the same issue
462512
cse = df_wbm.computed_structure_entry["wbm-1-24459"]
463513
cse = ComputedStructureEntry.from_dict(cse)
464514
mp_compat.process_entry(cse)
@@ -470,13 +520,14 @@ def increment_wbm_material_id(wbm_id: str) -> str:
470520

471521

472522
# %% calculate e_above_hull for each material
473-
# this loop needs the warnings filter above to not crash Jupyter kernel with logs
523+
# this loop needs above warnings.filterwarnings() to not crash Jupyter kernel with logs
474524
# takes ~20 min at 200 it/s for 250k entries in WBM
475525
e_above_hull_key = "e_above_hull_uncorrected_ppd_mp"
476526
assert e_above_hull_key not in df_summary
477527

478-
for entry in tqdm(df_wbm.cse):
479-
assert entry.entry_id.startswith("wbm-")
528+
for mat_id, entry in tqdm(df_wbm.cse.items(), total=len(df_wbm)):
529+
assert mat_id == entry.entry_id, f"{mat_id=} != {entry.entry_id=}"
530+
assert entry.entry_id in df_summary.index, f"{entry.entry_id=} not in df_summary"
480531

481532
e_per_atom = entry.uncorrected_energy_per_atom
482533
e_hull_per_atom = ppd_mp.get_hull_energy_per_atom(entry.composition)
@@ -497,8 +548,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
497548
# first make sure source and target dfs have matching indices
498549
assert sum(df_wbm.index != df_summary.index) == 0
499550

500-
e_form_key = "e_form_per_atom_uncorrected_mp_refs"
501-
assert e_form_key not in df_summary
551+
e_form_col = "e_form_per_atom_uncorrected"
552+
assert e_form_col not in df_summary
502553

503554
for row in tqdm(df_wbm.itertuples(), total=len(df_wbm)):
504555
mat_id, cse, formula = row.Index, row.cse, row.formula_from_cse
@@ -509,40 +560,21 @@ def increment_wbm_material_id(wbm_id: str) -> str:
509560
e_form = get_e_form_per_atom(entry_like)
510561
e_form_ppd = ppd_mp.get_form_energy_per_atom(cse)
511562

512-
# make sure the PPD and functional method of calculating formation energy agree
513-
assert abs(e_form - e_form_ppd) < 1e-7, f"{e_form=} != {e_form_ppd=}"
514-
df_summary.at[cse.entry_id, e_form_key] = e_form
515-
516-
assert len(df_summary) == sum(
517-
step_lens
518-
), f"rows were added: {len(df_summary)=} {sum(step_lens)=}"
519-
563+
correction = cse.correction_per_atom
564+
# make sure the PPD.get_e_form_per_atom() and standalone get_e_form_per_atom()
565+
# method of calculating formation energy agree
566+
assert (
567+
abs(e_form - (e_form_ppd - correction)) < 1e-7
568+
), f"{mat_id=}: {e_form=:.3} != {e_form_ppd - correction=:.3}"
569+
df_summary.at[cse.entry_id, e_form_col] = e_form
520570

521571
# add old + new MP energy corrections to formation energies
522572
for corrections in ("mp2020", "legacy"):
523-
df_summary[e_form_key.replace("un", f"{corrections}_")] = (
524-
df_summary[e_form_key] + df_summary[f"e_correction_per_atom_{corrections}"]
573+
df_summary[e_form_col.replace("un", f"{corrections}_")] = (
574+
df_summary[e_form_col] + df_summary[f"e_correction_per_atom_{corrections}"]
525575
)
526576

527577

528-
# %%
529-
df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
530-
531-
df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
532-
"material_id"
533-
)
534-
535-
536-
# %% read WBM dataset from disk
537-
df_wbm = pd.read_json(
538-
f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
539-
).set_index("material_id")
540-
541-
df_wbm["cse"] = [
542-
ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
543-
]
544-
545-
546578
# %%
547579
df_init_struct = pd.read_json(
548580
f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
@@ -566,13 +598,21 @@ def increment_wbm_material_id(wbm_id: str) -> str:
566598
assert df_summary[wyckoff_col].isna().sum() == 0
567599

568600

569-
# %% make sure material IDs within each step are consecutive
570-
for step in range(1, 6):
571-
df = df_summary[df_summary.index.str.startswith(f"wbm-{step}-")]
572-
step_len = step_lens[step - 1]
573-
assert len(df) == step_len, f"{step=} has {len(df)=}, expected {step_len=}"
601+
# %% write final summary data to disk (yeah!)
602+
df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
603+
604+
605+
# %% read summary data from disk
606+
df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
607+
"material_id"
608+
)
609+
610+
611+
# %% read WBM initial structures and computed structure entries from disk
612+
df_wbm = pd.read_json(
613+
f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
614+
).set_index("material_id")
574615

575-
step_counts = list(df.index.str.split("-").str[-1].astype(int))
576-
assert step_counts == list(
577-
range(1, step_len + 1)
578-
), f"{step=} counts not consecutive"
616+
df_wbm["cse"] = [
617+
ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
618+
]

data/wbm/readme.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ The full set of processing steps used to curate the WBM test set from the raw da
1616
- correctly aligning initial structures to DFT-relaxed `ComputedStructureEntries`
1717
- remove 6 pathological structures (with 0 volume)
1818
- remove formation energy outliers below -5 and above 5 eV/atom (removed 502 and 22 crystals respectively out of 257,487 total, including an anomaly of 500 structures at exactly -10 eV/atom)
19-
![WBM formation energy histogram indicating outlier cutoffs](2022-12-07-hist-e-form-per-atom.png)
20-
- apply the latest `MaterialsProject2020Compatibility` energy correction scheme to the formation energies
19+
<!-- ![WBM formation energy histogram indicating outlier cutoffs](2022-12-07-hist-e-form-per-atom.png) -->
2120
- compute energy to the convex hull constructed from all MP `ComputedStructureEntries` queried on 2022-09-16 (2020-09-08 database release)
21+
- apply the [`MaterialsProject2020Compatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProject2020Compatibility) energy correction scheme to the formation energies
22+
- compute energy to the convex hull constructed from all MP `ComputedStructureEntries` queried on 2022-09-16 ([database release 2021.05.13](https://docs.materialsproject.org/changes/database-versions#v2021.05.13))
2223

2324
Invoking that script with `python fetch_process_wbm_dataset.py` will auto-download and regenerate the WBM test set files from scratch. If you find anything questionable in the released test set or any inconsistencies between the files on GitHub vs the output of that script, please [raise an issue](https://github.com/janosh/matbench-discovery/issues).
2425

matbench_discovery/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
ROOT = os.path.dirname(os.path.dirname(__file__))
88
DEBUG = "slurm-submit" not in sys.argv and "SLURM_JOB_ID" not in os.environ
99
CHECKPOINT_DIR = f"{ROOT}/wandb/checkpoints"
10+
WANDB_PATH = "materialsproject/matbench-discovery"
1011

1112
timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
1213
today = timestamp.split("@")[0]

models/cgcnn/test_cgcnn.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from torch.utils.data import DataLoader
1515
from tqdm import tqdm
1616

17-
from matbench_discovery import CHECKPOINT_DIR, DEBUG, ROOT, today
17+
from matbench_discovery import CHECKPOINT_DIR, DEBUG, ROOT, WANDB_PATH, today
1818
from matbench_discovery.load_preds import df_wbm
1919
from matbench_discovery.plots import wandb_scatter
2020
from matbench_discovery.slurm import slurm_submit
@@ -68,7 +68,7 @@
6868
"created_at": {"$gt": "2022-12-03", "$lt": "2022-12-04"},
6969
"display_name": {"$regex": "^train-cgcnn-robust-augment=3-"},
7070
}
71-
runs = wandb.Api().runs("janosh/matbench-discovery", filters=filters)
71+
runs = wandb.Api().runs(WANDB_PATH, filters=filters)
7272

7373
assert len(runs) == 10, f"Expected 10 runs, got {len(runs)} for {filters=}"
7474
for idx, run in enumerate(runs):

models/cgcnn/train_cgcnn.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from torch.utils.data import DataLoader
1111
from tqdm import tqdm, trange
1212

13-
from matbench_discovery import DEBUG, ROOT, timestamp, today
13+
from matbench_discovery import DEBUG, ROOT, WANDB_PATH, timestamp, today
1414
from matbench_discovery.slurm import slurm_submit
1515
from matbench_discovery.structure import perturb_structure
1616

@@ -125,6 +125,6 @@
125125
train_loader=train_loader,
126126
test_loader=test_loader,
127127
timestamp=timestamp,
128-
wandb_path="janosh/matbench-discovery",
128+
wandb_path=WANDB_PATH,
129129
run_params=run_params,
130130
)

0 commit comments

Comments
 (0)