janosh
diff --git a/‎data/mp/build_phase_diagram.py
+1-1 b/‎data/mp/build_phase_diagram.py
+1-1
diff --git a/‎data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
+3-3 b/‎data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
+3-3
diff --git a/‎data/wbm/fetch_process_wbm_dataset.py
+63-19 b/‎data/wbm/fetch_process_wbm_dataset.py
+63-19
diff --git a/‎data/wbm/readme.md
+41-23 b/‎data/wbm/readme.md
+41-23
diff --git a/‎matbench_discovery/data.py
+13-14 b/‎matbench_discovery/data.py
+13-14
@@ -51,7 +51,7 @@
 
 # %% build phase diagram with both MP entries + WBM entries
 df_wbm = pd.read_json(
-    f"{ROOT}/data/wbm/2022-10-19-wbm-cses+init-structs.json.bz2"
+    f"{ROOT}/data/wbm/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
 ).set_index("material_id")
 
 # using ComputedStructureEntry vs ComputedEntry here is important as CSEs receive
 
@@ -23,7 +23,7 @@
 """
 
 
-cse_path = f"{ROOT}/data/wbm/2022-10-19-wbm-cses.json.bz2"
+cse_path = f"{ROOT}/data/wbm/2022-10-19-wbm-computed-structure-entries.json.bz2"
 df_cse = pd.read_json(cse_path).set_index("material_id")
 
 cses = [
@@ -134,10 +134,10 @@
 ce_mp2020, ce_legacy = ces[idx].copy(), ces[idx].copy()
 
 
-with gzip.open(f"{ROOT}/tmp/cse-wbm-step-2-34803.json.zip", "w") as f:
+with gzip.open(f"{ROOT}/tmp/cse-wbm-2-34803.json.zip", "w") as f:
     f.write(cse_mp2020.to_json().encode("utf-8"))
 
-with gzip.open(f"{ROOT}/tmp/cse-wbm-step-2-34803.json.zip") as f:
+with gzip.open(f"{ROOT}/tmp/cse-wbm-2-34803.json.zip") as f:
     cse = ComputedStructureEntry.from_dict(json.load(f))
 
 cse_mp2020 = cse.copy()
 
@@ -137,7 +137,7 @@
 
 
 def increment_wbm_material_id(wbm_id: str) -> str:
-    """Maps step_1_0, step_1_1, ... onto wbm-step-1-1, wbm-step-1-2, ..."""
+    """Maps step_1_0, step_1_1, ... onto wbm-1-1, wbm-1-2, ..."""
     try:
         prefix, step_num, material_num = wbm_id.split("_")
     except ValueError:
@@ -149,13 +149,13 @@ def increment_wbm_material_id(wbm_id: str) -> str:
     assert step_num.isdigit(), msg
     assert material_num.isdigit(), msg
 
-    return f"wbm-step-{step_num}-{int(material_num) + 1}"
+    return f"wbm-{step_num}-{int(material_num) + 1}"
 
 
 df_wbm.index = df_wbm.index.map(increment_wbm_material_id)
 df_wbm.index.name = "material_id"
-assert df_wbm.index[0] == "wbm-step-1-1"
-assert df_wbm.index[-1] == "wbm-step-5-23308"
+assert df_wbm.index[0] == "wbm-1-1"
+assert df_wbm.index[-1] == "wbm-5-23308"
 
 df_wbm["initial_structure"] = df_wbm.pop("org")
 df_wbm["final_structure"] = df_wbm.pop("opt")
@@ -221,7 +221,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
         "GGA+U" if cse["parameters"]["is_hubbard"] else "GGA"
     )
     cse["entry_id"] = mat_id
-    assert cse["entry_id"].startswith("wbm-step-")
+    assert cse["entry_id"].startswith("wbm-")
 
 assert pd.Series(
     cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
@@ -230,8 +230,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 # drop two materials with missing initial structures
 assert list(df_wbm.query("initial_structure.isna()").index) == [
-    "wbm-step-5-23166",
-    "wbm-step-5-23294",
+    "wbm-5-23166",
+    "wbm-5-23294",
 ]
 df_wbm = df_wbm.dropna(subset=["initial_structure"])
 
@@ -247,12 +247,12 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 ]
 
 # all but 1 composition matches between CSE and final structure
-# mismatching ID: wbm-step-1-37977 which becomes equal on reduction:
+# mismatching ID: wbm-1-37977 which becomes equal on reduction:
 # CSE Comp: Ag4 Bi4 O12
 # final structure Comp: Ag16 Bi16 O48
 df_mismatch = df_wbm.query("composition_from_cse != composition_from_final_struct")
 assert len(df_mismatch) == 1
-assert df_mismatch.index[0] == "wbm-step-1-37977"
+assert df_mismatch.index[0] == "wbm-1-37977"
 assert (
     df_mismatch.iloc[0].composition_from_cse.reduced_composition
     == df_mismatch.iloc[0].composition_from_final_struct.reduced_composition
@@ -281,9 +281,12 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 ]
 
 for fname, cols in (
-    ("cses", ["computed_structure_entry"]),
+    ("computed-structure-entries", ["computed_structure_entry"]),
     ("init-structs", ["initial_structure"]),
-    ("cses+init-structs", ["initial_structure", "computed_structure_entry"]),
+    (
+        "computed-structure-entries+init-structs",
+        ["initial_structure", "computed_structure_entry"],
+    ),
 ):
     cols = ["formula_from_cse", *cols]
     df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
@@ -310,7 +313,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
     f"{mat_cloud_url}&filename=summary.txt.bz2", sep="\t"
 ).rename(columns=col_map)
 
-# duplicate Ga3Ru2U3 step_3_28147 (1st one is wbm-step-2-18689) has 0 volume in
+# duplicate Ga3Ru2U3 step_3_28147 (1st one is wbm-2-18689) has 0 volume in
 # df_summary_bz2 vs 155.41 in df_summary
 query_str = "volume > 0 & formula != 'Ga3Ru2U3'"
 pd.testing.assert_frame_equal(
@@ -347,8 +350,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
-df_summary.at["wbm-step-2-18689", "uncorrected_energy"] = df_wbm.loc[
-    "wbm-step-2-18689"
+df_summary.at["wbm-2-18689", "uncorrected_energy"] = df_wbm.loc[
+    "wbm-2-18689"
 ].computed_structure_entry["energy"]
 
 # NOTE careful with ComputedEntries as object vs as dicts, the meaning of keys changes:
@@ -373,6 +376,47 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse)
 
 
+# %% remove suspicious formation energy outliers
+e_form_cutoff = 5
+n_too_stable = sum(df_summary.e_form_per_atom_wbm < -e_form_cutoff)
+print(f"{n_too_stable = }")  # n_too_stable = 502
+n_too_unstable = sum(df_summary.e_form_per_atom_wbm > e_form_cutoff)
+print(f"{n_too_unstable = }")  # n_too_unstable = 22
+
+fig = df_summary.hist(x="e_form_per_atom_wbm", bins=100, backend="plotly", log_y=True)
+fig.add_vline(x=e_form_cutoff, line=dict(width=2, dash="dash", color="green"))
+fig.add_vline(x=-e_form_cutoff, line=dict(width=2, dash="dash", color="green"))
+fig.add_annotation(
+    **dict(x=0, y=1, yref="paper", yshift=20, font_color="green"),
+    text=f"<b>dataset cropped to within +/- {e_form_cutoff} eV/atom</b>",
+    showarrow=False,
+)
+fig.update_layout(
+    xaxis_title="WBM formation energy (eV/atom)", margin=dict(l=10, r=10, t=40, b=10)
+)
+
+fig.write_image(
+    f"{module_dir}/{today}-hist-e-form-per-atom.png", scale=5, width=800, height=300
+)
+
+
+# %%
+assert len(df_summary) == len(df_wbm) == 257_487
+
+query_str = f"{-e_form_cutoff} < e_form_per_atom_wbm < {e_form_cutoff}"
+dropped_ids = sorted(set(df_summary.index) - set(df_summary.query(query_str).index))
+assert len(dropped_ids) == 502 + 22
+assert dropped_ids[:3] == "wbm-1-12142 wbm-1-12143 wbm-1-12144".split()
+assert dropped_ids[-3:] == "wbm-5-9121 wbm-5-9211 wbm-5-934".split()
+
+df_summary = df_summary.query(query_str)
+df_wbm = df_wbm.loc[df_summary.index]
+
+
+# make sure we dropped the expected number (524) of materials
+assert len(df_summary) == len(df_wbm) == 257_487 - 502 - 22
+
+
 # %%
 # raw WBM ComputedStructureEntries have no energy corrections applied:
 assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse)
@@ -413,9 +457,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 # ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
 
 
-# %% Python crashes with segfault on correcting the energy of wbm-step-1-24459 due to
+# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to
 # https://github.com/spglib/spglib/issues/194 when using spglib v2.0.{0,1}
-cse = df_wbm.computed_structure_entry["wbm-step-1-24459"]
+cse = df_wbm.computed_structure_entry["wbm-1-24459"]
 cse = ComputedStructureEntry.from_dict(cse)
 mp_compat.process_entry(cse)
 
@@ -432,7 +476,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 assert e_above_hull_key not in df_summary
 
 for entry in tqdm(df_wbm.cse):
-    assert entry.entry_id.startswith("wbm-step-")
+    assert entry.entry_id.startswith("wbm-")
 
     e_per_atom = entry.uncorrected_energy_per_atom
     e_hull_per_atom = ppd_mp.get_hull_energy_per_atom(entry.composition)
@@ -491,7 +535,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 # %% read WBM dataset from disk
 df_wbm = pd.read_json(
-    f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2"
+    f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
 ).set_index("material_id")
 
 df_wbm["cse"] = [
@@ -524,7 +568,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 # %% make sure material IDs within each step are consecutive
 for step in range(1, 6):
-    df = df_summary[df_summary.index.str.startswith(f"wbm-step-{step}-")]
+    df = df_summary[df_summary.index.str.startswith(f"wbm-{step}-")]
     step_len = step_lens[step - 1]
     assert len(df) == step_len, f"{step=} has {len(df)=}, expected {step_len=}"
 
 
@@ -1,39 +1,57 @@
 # WBM Dataset
 
-Source: [Predicting stable crystalline compounds using chemical similarity](https://nature.com/articles/s41524-020-00481-6) (2021)
+The **WBM dataset** was published in [Predicting stable crystalline compounds using chemical similarity][wbm paper] (npj Computational Materials, Jan 2021, [doi:10.1038/s41524-020-00481-6](http://doi.org/10.1038/s41524-020-00481-6)). The authors generated 257,487 structures through single-element substitutions on Materials Project (MP) source structures. The replacement element was chosen based on chemical similarity determined by a matrix data-mined from the [Inorganic Crystal Structure Database (ICSD)](https://icsd.products.fiz-karlsruhe.de).
 
+The resulting novel structures were relaxed using MP-compatible VASP inputs (i.e. using `pymatgen`'s `MPRelaxSet`) and identical POTCARs in an attempt to create a database of Materials Project compatible novel crystals. Any degradation in model performance from training to test set should therefore largely be a result of extrapolation error rather than covariate shift in the underlying data.
 
-## `wbm-summary.csv`
+The authors performed 5 rounds of elemental substitution in total, each time relaxing generated structures and adding those found to lie on the convex hull back to the source pool. In total, ~20k or close to 10% were found to lie on the Materials Project convex hull.
 
-Load with
+Since repeated substitutions should - on average - increase chemical dissimilarity, the 5 iterations of this data-generation process are a unique and compelling feature as they allow out-of-distribution testing. We can check how model performance degrades when asked to predict on structures increasingly dissimilar from the training set (which is restricted to the MP 2022 database release (or earlier) for all models in this benchmark).
 
-```py
-df_wbm_summary = pd.read_csv("data/wbm/2022-10-19-wbm-summary.csv").set_index("material_id")
-```
+## Data processing steps
 
-## Comprehensive Link Collection for WBM dataset
+The full set of processing steps used to curate the WBM test set from the raw data files (downloaded from the URLs listed below) can be found in [`data/wbm/fetch_process_wbm_dataset.py`](https://github.com/janosh/matbench-discovery/blob/site/data/wbm/fetch_process_wbm_dataset.py). Processing involved
+
+- re-formatting material IDs
+- correctly aligning initial structures to DFT-relaxed `ComputedStructureEntries`
+- removing 6 pathological structures (with 0 volume)
+- removing formation energy outliers below -5 and above 5 eV/atom (removed 502 and 22 crystals respectively out of 257,487 total, including an anomaly of 500 structures at exactly -10 eV/atom)
+ ![WBM formation energy histogram indicating outlier cutoffs](2022-12-07-hist-e-form-per-atom.png)
+- applying the latest `MaterialsProject2020Compatibility` energy correction scheme to the formation energies
+- computing the energy to the convex hull constructed from all MP `ComputedStructureEntries` queried on 2022-09-16 (2020-09-08 database release)
+
+Invoking that script with `python fetch_process_wbm_dataset.py` will auto-download and regenerate the WBM test set files from scratch. If you find anything questionable in the released test set or inconsistencies between the files on GitHub vs the output of that script, please [raise an issue](https://github.com/janosh/matbench-discovery/issues).
+
+## Links to WBM data files
 
 Links to WBM data files have proliferated. This is an attempt to keep track of all of them.
 
 Initial structures were sent as Google Drive links via email by Hai-Chen Wang on 2021-09-01.
 
-step 1: https://drive.google.com/file/d/1ZUgtYwrfZn_P8bULWRtTXepyAxHVxS5C
-step 2: https://drive.google.com/file/d/1-3uu2AcARJxH7GReteGVASZTuttFGiW_
-step 3: https://drive.google.com/file/d/1hc5BvDiFfTu_tc5F8m7ONSw2OgL9vN6o
-step 4: https://drive.google.com/file/d/1aMYxG5YJUgMHpbWmHpzL4hRfmP26UQqh
-step 5: https://drive.google.com/file/d/17kQt2r78ReWle4PhEIOXG7w7BFdezGM1
-summary: https://drive.google.com/file/d/1639IFUG7poaDE2uB6aISUOi65ooBwCIg
+step 1: <https://drive.google.com/file/d/1ZUgtYwrfZn_P8bULWRtTXepyAxHVxS5C>
+step 2: <https://drive.google.com/file/d/1-3uu2AcARJxH7GReteGVASZTuttFGiW_>
+step 3: <https://drive.google.com/file/d/1hc5BvDiFfTu_tc5F8m7ONSw2OgL9vN6o>
+step 4: <https://drive.google.com/file/d/1aMYxG5YJUgMHpbWmHpzL4hRfmP26UQqh>
+step 5: <https://drive.google.com/file/d/17kQt2r78ReWle4PhEIOXG7w7BFdezGM1>
+summary: <https://drive.google.com/file/d/1639IFUG7poaDE2uB6aISUOi65ooBwCIg>
 
-The `ComputedStructureEntries` for steps 1-3 were also linked from the Nature paper:
+The `ComputedStructureEntries` for steps 1-3 were also linked from the [WBM Nature paper][wbm paper]:
 
-Index page: https://tddft.org/bmg/data.php
-step 1 CSEs: https://tddft.org/bmg/files/data/substitutions_000.json.bz2
-step 2 CSEs: https://tddft.org/bmg/files/data/substitutions_001.json.bz2
-step 3 CSEs: https://tddft.org/bmg/files/data/substitutions_002.json.bz2
-CIF files: https://tddft.org/bmg/files/data/similarity-cifs.tar.gz
+Index page: <https://tddft.org/bmg/data.php>
+step 1 CSEs: <https://tddft.org/bmg/files/data/substitutions_000.json.bz2>
+step 2 CSEs: <https://tddft.org/bmg/files/data/substitutions_001.json.bz2>
+step 3 CSEs: <https://tddft.org/bmg/files/data/substitutions_002.json.bz2>
+CIF files: <https://tddft.org/bmg/files/data/similarity-cifs.tar.gz>
 
-Materials Cloud archive: https://archive.materialscloud.org/record/2021.68
+Materials Cloud archive: <https://archive.materialscloud.org/record/2021.68>
 File URLs:
-readme: https://archive.materialscloud.org/record/file?record_id=840&filename=README.txt
-summary: https://archive.materialscloud.org/record/file?record_id=840&filename=summary.txt.bz2
-step 1: https://archive.materialscloud.org/record/file?record_id=840&filename=step_1.json.bz2 etc.
+
+- readme: <https://archive.materialscloud.org/record/file?record_id=840&filename=README.txt>
+- summary: <https://archive.materialscloud.org/record/file?record_id=840&filename=summary.txt.bz2>
+- step 1: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_1.json.bz2>
+- step 2: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_2.json.bz2>
+- step 3: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_3.json.bz2>
+- step 4: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_4.json.bz2>
+- step 5: <https://archive.materialscloud.org/record/file?record_id=840&filename=step_5.json.bz2>
+
+[wbm paper]: https://nature.com/articles/s41524-020-00481-6
@@ -10,13 +10,13 @@
 from tqdm import tqdm
 
 DATA_FILENAMES = {
-    "wbm-summary": "wbm/2022-10-19-wbm-summary.csv",
-    "wbm-initial-structures": "wbm/2022-10-19-wbm-init-structs.json.bz2",
-    "wbm-computed-structure-entries": "wbm/2022-10-19-wbm-cses.json.bz2",
-    "mp-energies": "mp/2022-08-13-mp-energies.json.gz",
     "mp-computed-structure-entries": "mp/2022-09-16-mp-computed-structure-entries.json.gz",
-    "mp-patched-phase-diagram": "mp/2022-09-18-ppd-mp.pkl.gz",
     "mp-elemental-ref-energies": "mp/2022-09-19-mp-elemental-ref-energies.json",
+    "mp-energies": "mp/2022-08-13-mp-energies.json.gz",
+    "mp-patched-phase-diagram": "mp/2022-09-18-ppd-mp.pkl.gz",
+    "wbm-computed-structure-entries": "wbm/2022-10-19-wbm-computed-structure-entries.json.bz2",
+    "wbm-initial-structures": "wbm/2022-10-19-wbm-init-structs.json.bz2",
+    "wbm-summary": "wbm/2022-10-19-wbm-summary.csv",
 }
 
 RAW_REPO_URL = "https://raw.githubusercontent.com/janosh/matbench-discovery"
@@ -45,20 +45,19 @@ def load_train_test(
     cache_dir: str | None = default_cache_dir,
     hydrate: bool = False,
 ) -> pd.DataFrame | dict[str, pd.DataFrame]:
-    """Download the MP training data and WBM test data in parts or in full as pandas
+    """Download parts of or the full MP training data and WBM test data as pandas
     DataFrames. The full training and test sets are each about ~500 MB as compressed
-    JSON will be cached locally for faster re-loading unless cache_dir is set to None.
+    JSON which will be cached locally to cache_dir for faster re-loading unless
+    cache_dir is set to None.
 
-    Hint: Import DATA_FILES from the same module as this function and
-    print(list(DATA_FILES)) to see permissible data names.
+    Recognized data keys are mp-computed-structure-entries, mp-elemental-ref-energies,
+    mp-energies, mp-patched-phase-diagram, wbm-computed-structure-entries,
+    wbm-initial-structures, wbm-summary. See
+    https://matbench-discovery.janosh.dev/how-to-use for brief data descriptions.
 
     Args:
         parts (str | list[str], optional): Which parts of the MP/WBM dataset to load.
-            Can be any subset of list(DATA_FILES). Defaults to ["summary"], a dataframe
-            with columns for material properties like VASP energy, formation energy,
-            energy above the convex hull (3 columns with old, new and no Materials
-            Project energy corrections applied for each), volume, band gap, number of
-            sites per unit cell, and more.
+            Can be any subset of the above data names. Defaults to ["summary"].
         version (int, optional): Which version of the dataset to load. Defaults to 1
             (currently the only available option).
         cache_dir (str, optional): Where to cache data files on local drive. Defaults to