Commit dfa6c24 (1 parent: a5b3211)

fix /contribute

API and data underwent several breaking changes not yet reflected on this page

File tree: 7 files changed, +38 −29 lines


`matbench_discovery/data.py` (+3 −2)

@@ -54,8 +54,9 @@ def load(
         JSON which will be cached locally to cache_dir for faster re-loading unless
         cache_dir is set to None.
 
-        See matbench_discovery.data.DATA_FILES for recognized data keys. For descriptions,
-        see https://janosh.github.io/matbench-discovery/contribute#--direct-download.
+        See matbench_discovery.data.DATA_FILES for recognized data keys. See [here]
+        (https://janosh.github.io/matbench-discovery/contribute#--direct-download) for file
+        descriptions.
 
     Args:
         data_key (str): Which parts of the MP/WBM data to load. Must be one of

`pyproject.toml` (+3 −1)

@@ -86,6 +86,8 @@ select = [
     "E", # pycodestyle error
     "F", # pyflakes
     "I", # isort
+    "ICN", # flake8-import-conventions
+    "ISC", # flake8-implicit-str-concat
     "N", # pep8-naming
     "PD", # pandas-vet
     "PIE", # flake8-pie
@@ -94,7 +96,7 @@ select = [
     "PYI", # flakes8-pyi
     "Q", # flake8-quotes
     "RET", # flake8-return
-    "RUF", # ruff
+    "RUF", # Ruff-specific rules
     "SIM", # flake8-simplify
     "TID", # tidy imports
     "UP", # pyupgrade
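The newly enabled ISC (flake8-implicit-str-concat) rule catches implicit string concatenation, which most often hides a missing comma in a string list. A small illustration (hypothetical list, not from the repo):

```python
# A missing comma silently merges two adjacent string literals via implicit
# concatenation -- exactly the bug the ISC rule is designed to flag.
data_keys = [
    "mp_energies",
    "wbm_initial_structures"  # <- missing trailing comma
    "wbm_summary",
]
assert len(data_keys) == 2  # two items, not the three that were intended
assert data_keys[1] == "wbm_initial_structureswbm_summary"
```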

`site/src/routes/+layout.svelte` (+2 −2)

@@ -8,7 +8,7 @@
   import { GitHubCorner, PrevNext } from 'svelte-zoo'
   import '../app.css'
 
-  const routes = Object.keys(import.meta.glob(`./*/+page.{svx,svelte,md}`)).map(
+  const routes = Object.keys(import.meta.glob(`./*/+page.{svelte,md}`)).map(
     (filename) => `/` + filename.split(`/`)[1]
   )
 
@@ -31,7 +31,7 @@
   if (url && !description) console.warn(`No description for url=${url}`)
   $: title = url == `/` ? `` : `${url} • `
 
-  const actions = Object.keys(import.meta.glob(`./**/+page.{svx,svelte,md}`)).map(
+  const actions = Object.keys(import.meta.glob(`./**/+page.{svelte,md}`)).map(
     (filename) => {
       const parts = filename.split(`/`).filter((part) => !part.startsWith(`(`)) // remove hidden route segments
       const route = `/${parts.slice(1, -1).join(`/`)}`
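The second hunk derives a route from each glob key by dropping hidden (parenthesized) segments and joining what sits between the leading `.` and the `+page.*` file. The same mapping, sketched in Python purely for illustration (hypothetical filenames):

```python
def filename_to_route(filename: str) -> str:
    """Mimic the Svelte snippet above: filter out parenthesized (hidden)
    route segments, then join everything between "." and "+page.*"."""
    parts = [part for part in filename.split("/") if not part.startswith("(")]
    return "/" + "/".join(parts[1:-1])


assert filename_to_route("./contribute/+page.md") == "/contribute"
assert filename_to_route("./(hidden)/preprint/+page.md") == "/preprint"
```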

`site/src/routes/contribute/+page.md` (+23 −19)

@@ -6,32 +6,33 @@
 
 ## 🔨 &thinsp; Installation
 
-The recommended way to acquire the training and test sets for this benchmark is through our Python package [available on PyPI](https://pypi.org/project/{name}):
+To download the training and test sets for this benchmark, we recommend installing our [PyPI package](https://pypi.org/project/{name}):
 
 ```zsh
 pip install matbench-discovery
 ```
 
 ## 📙 &thinsp; Usage
 
-This example script downloads the training and test data for training a model:
+When you access an attribute of the `DATA_FILES` class, it automatically downloads and caches the corresponding data file. For example:
 
 ```py
-from matbench_discovery.data import load
-from matbench_discovery.data import df_wbm, DATA_FILES
+from matbench_discovery.data import DATA_FILES, load
 
-# any subset of these keys can be passed to load()
+# available data files
 assert sorted(DATA_FILES) == [
-    "mp-computed-structure-entries",
-    "mp-elemental-ref-energies",
-    "mp-energies",
-    "mp-patched-phase-diagram",
-    "wbm-computed-structure-entries",
-    "wbm-initial-structures",
-    "wbm-summary",
+    "mp_computed_structure_entries",
+    "mp_elemental_ref_entries",
+    "mp_energies",
+    "mp_patched_phase_diagram",
+    "wbm_computed_structure_entries",
+    "wbm_cses_plus_init_structs",
+    "wbm_initial_structures",
+    "wbm_summary",
 ]
 
-df_wbm = load("wbm-summary", version="v1.0.0")
+# version defaults to latest, set a specific version to avoid breaking changes
+df_wbm = load("wbm_summary", version="1.0.0")
 
 assert df_wbm.shape == (256963, 15)
 
@@ -43,13 +44,14 @@ assert list(df_wbm) == [
     "e_form_per_atom_wbm",
     "e_above_hull_wbm",
     "bandgap_pbe",
+    "wyckoff_spglib",
     "uncorrected_energy_from_cse",
     "e_correction_per_atom_mp2020",
     "e_correction_per_atom_mp_legacy",
-    "e_above_hull_mp2020_corrected_ppd_mp",
     "e_form_per_atom_uncorrected",
     "e_form_per_atom_mp2020_corrected",
-    "wyckoff_spglib",
+    "e_above_hull_mp2020_corrected_ppd_mp",
+    "site_stats_fingerprint_init_final_norm_diff",
 ]
 ```
 
@@ -60,14 +62,16 @@ assert list(df_wbm) == [
 1. **`volume`**: Relaxed structure volume in cubic Angstrom
 1. **`uncorrected_energy`**: Raw VASP-computed energy
 1. **`e_form_per_atom_wbm`**: Original formation energy per atom from [WBM paper]
-1. **`e_hull_wbm`**: Original energy above the convex hull in (eV/atom) from [WBM paper]
+1. **`e_above_hull_wbm`**: Original energy above the convex hull in (eV/atom) from [WBM paper]
+1. **`wyckoff_spglib`**: Aflow label strings built from spacegroup and Wyckoff positions of the relaxed structure as computed by [spglib](https://spglib.readthedocs.io/en/latest/python-spglib.html?highlight=get_symmetry_dataset#get-symmetry-dataset).
 1. **`bandgap_pbe`**: PBE-level DFT band gap from [WBM paper]
-1. **`uncorrected_energy_from_cse`**: Should be the same as `uncorrected_energy`. There are 2 cases where the absolute difference reported in the summary file and in the computed structure entries exceeds 0.1 eV (`wbm-2-3218`, `wbm-1-56320`) which we attribute to rounding errors.
-1. **`e_form_per_atom_mp2020_corrected`**: Matbench Discovery takes these as ground truth for the formation energy. Includes MP2020 energy corrections (latest correction scheme at time of release).
+1. **`uncorrected_energy_from_cse`**: Uncorrected DFT energy stored in `ComputedStructureEntries`. Should be the same as `uncorrected_energy`. There are 2 cases where the absolute difference reported in the summary file and in the computed structure entries exceeds 0.1 eV (`wbm-2-3218`, `wbm-1-56320`) which we attribute to rounding errors.
+1. **`e_form_per_atom_uncorrected`**: Uncorrected DFT formation energy per atom in eV/atom.
+1. **`e_form_per_atom_mp2020_corrected`**: Matbench Discovery takes these as ground truth for the formation energy. The result of applying the [MP2020 energy corrections](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProject2020Compatibility) (latest correction scheme at time of release) to `e_form_per_atom_uncorrected`.
 1. **`e_correction_per_atom_mp2020`**: [`MaterialsProject2020Compatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProject2020Compatibility) energy corrections in eV/atom.
 1. **`e_correction_per_atom_mp_legacy`**: Legacy [`MaterialsProjectCompatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProjectCompatibility) energy corrections in eV/atom. Having both old and new corrections allows updating predictions from older models like MEGNet that were trained on MP formation energies treated with the old correction scheme.
 1. **`e_above_hull_mp2020_corrected_ppd_mp`**: Energy above hull distances in eV/atom after applying the MP2020 correction scheme. The convex hull in question is the one spanned by all ~145k Materials Project `ComputedStructureEntries`. Matbench Discovery takes these as ground truth for material stability. Any value above 0 is assumed to be an unstable/metastable material.
-<!-- TODO document remaining columns, or maybe drop them from df -->
+1. **`site_stats_fingerprint_init_final_norm_diff`**: The norm of the difference between the initial and final site fingerprints. This is a volume-independent measure of how much the structure changed during DFT relaxation. Uses the `matminer` [`SiteStatsFingerprint`](https://github.com/hackingmaterials/matminer/blob/33bf112009b67b108f1008b8cc7398061b3e6db2/matminer/featurizers/structure/sites.py#L21-L33) (v0.8.0).
 
 ## 📥 &thinsp; Direct Download
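The new docs say attribute access on `DATA_FILES` triggers download and caching. That behavior is a common Python pattern; a minimal sketch using `__getattr__` and a stubbed downloader (class name, URLs and the `download` hook are all hypothetical, not the package's actual code):

```python
class AutoDownloadFiles:
    """Sketch: attribute access fetches a file on first use, then caches it."""

    def __init__(self, urls, download=lambda url: f"/tmp/{url.rsplit('/', 1)[-1]}"):
        self._urls = urls  # data key -> remote URL
        self._cache = {}  # data key -> local path, filled lazily
        self._download = download  # stand-in for the real HTTP fetch

    def __getattr__(self, key):
        # only called when normal attribute lookup fails, i.e. for data keys
        if key not in self._urls:
            raise AttributeError(key)
        if key not in self._cache:  # download at most once per key
            self._cache[key] = self._download(self._urls[key])
        return self._cache[key]
```

With this sketch, `files.wbm_summary` downloads on first access and returns the cached local path on every access after that.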

`site/src/routes/preprint/+page.md` (+5 −3)

@@ -123,7 +123,7 @@ Our initial benchmark release includes 8 models. @Fig:metrics-table includes all
 
 CGCNN was among the first to show that just like in other areas of ML, given large enough training sets, neural networks can learn embeddings that reliably outperform all human-engineered structure features directly from the data.
 
-1. **CGCNN+P** @gibson_data-augmentation_2022 - This work proposes a simple, physically motivated structure perturbations to augment stock CGCNN's training data of relaxed structures with structures resembling unrelaxed ones but mapped to the same DFT final energy. Here we chose $P=5$, meaning the training set was augmented with 5 random perturbations of each relaxed MP structure mapped to the same target energy.
+1. **CGCNN+P** @gibson_data-augmentation_2022 - Identical to CGCNN except for a difference in training procedure. The +P stands for training set augmentation using random structure perturbations. We apply a slight lattice strain and nudge the atoms but keep the same energy target to create additional training samples. Here we chose $P=5$ meaning random perturbations are repeated 5 times for each relaxed MP structure.
 
 In contrast to all other structure-based GNNs considered in this benchmark, CGCNN+P is not attempting to learn the Born-Oppenheimer potential energy surface. The model is instead taught the PES as a step-function that maps each valley to its local minimum. The idea is that during testing on unrelaxed structures, the model will predict the energy of the nearest basin in the PES. The authors confirm this by demonstrating a lowering of the energy error on unrelaxed structures.
 
@@ -202,15 +202,17 @@ A line terminates when a model believes there are no more materials in the WBM t
 <HistClfTrueHullDistModels />
 {/if}
 
-> @label:fig:hist-clf-true-hull-dist-models These histograms show the classification performance of models as a function of DFT-computed hull distance on the $x$ axis. Models are sorted top to bottom by F1 score. While CHGNet and M3GNet perform almost equally well overall, M3GNet makes fewer false negative but more false positives predictions compared to CHGNet. This observation is also reflected in the higher TPR and lower TNR of M3GNet vs CHGNet in @fig:metrics-table, as well as the lower error for CHGNet vs M3GNet on the stable side (left half) of @fig:rolling-mae-vs-hull-dist-models and M3GNet over CHGNet on the unstable side (right half) of @fig:rolling-mae-vs-hull-dist-models.
+> @label:fig:hist-clf-true-hull-dist-models These histograms show the classification performance of models as a function of DFT-computed hull distance on the $x$ axis. The colors visualize the proportion of true to false predictions as a function of crystal stability. Models are sorted top to bottom by F1 score. While CHGNet and M3GNet perform almost equally well overall, these plots reveal that they do so via different trade-offs. M3GNet commits fewer false negative but more false positives predictions compared to CHGNet. In a real discovery campaign, false positives come back to bite you more than false negatives since they result in wasted DFT relaxations or even synthesis time in the lab. A false negative by contrast is just one missed opportunity out of millions.
+> This observation is also reflected in the higher TPR and lower TNR of M3GNet vs CHGNet in @fig:metrics-table, as well as the lower error for CHGNet vs M3GNet on the stable side (left half) of @fig:rolling-mae-vs-hull-dist-models and M3GNet over CHGNet on the unstable side (right half) of @fig:rolling-mae-vs-hull-dist-models.
 
 ### Predicted Hull Distance Parity Plots
 
 {#if mounted}
 <EachScatterModels />
 {/if}
 
-> @label:fig:each-scatter-models Parity plot for each model's energy above hull predictions (based on their formation energy preds) vs DFT ground truth
+> @label:fig:each-scatter-models Parity plot of DFT hull distance vs model hull distance predictions (derived from predicted formation energies). All models do well on the outliers. They suffer most in the mode of the distribution around the convex hull.
+> Interestingly, all models do very well on the outliers. Where they suffer is in the mode of the distribution near the convex hull. All models exhibit a horizontal spike at 0 predicted hull distance for crystals that are actually very unstable, resulting in false positive predictions. Some models, Wrenformer in particular, also have a spike pointing upwards, which are materials actually close to the hull but predicted to be highly unstable by the model. These are false negatives, or missed needles in the haystack we're searching that is materials space.
 
 <!-- TODO maybe mention we consistently see deducting old MP corrections and applying 2020 scheme from MEGNEt e_form predictions increases MAE, no matter if paired with BOWSR, M3GNet, CHGNet or standalone -->
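The caption's F1/TPR/TNR trade-off can be made concrete. Treating hull distance ≤ 0 as stable (the positive class), the standard definitions give (toy numbers only, not data from the paper):

```python
def clf_metrics(e_above_hull_true, e_above_hull_pred, stable_threshold=0.0):
    """TPR, TNR and F1 when hull distance <= threshold counts as stable (positive)."""
    tp = fp = tn = fn = 0
    for true, pred in zip(e_above_hull_true, e_above_hull_pred):
        actual, predicted = true <= stable_threshold, pred <= stable_threshold
        if actual and predicted:
            tp += 1  # correctly flagged stable
        elif actual:
            fn += 1  # missed stable material (false negative)
        elif predicted:
            fp += 1  # wasted follow-up DFT/synthesis (false positive)
        else:
            tn += 1
    tpr = tp / (tp + fn) if tp + fn else 0.0
    tnr = tn / (tn + fp) if tn + fp else 0.0
    prec = tp / (tp + fp) if tp + fp else 0.0
    f1 = 2 * prec * tpr / (prec + tpr) if prec + tpr else 0.0
    return tpr, tnr, f1


# 4 toy materials: one TP, one FN, one FP, one TN -> all three metrics = 0.5
assert clf_metrics([-0.1, -0.05, 0.2, 0.3], [-0.2, 0.1, -0.05, 0.4]) == (0.5, 0.5, 0.5)
```

Two models with the same F1 can still split differently between FP and FN, which is exactly the CHGNet vs M3GNet contrast the caption describes.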

`site/src/routes/si/+page.md` (+1 −1)

@@ -102,7 +102,7 @@ Given its strong performance on batch 1, it is possible that given sufficiently
 <ScatterLargestErrorsModelsMeanVsTrueHullDist />
 {/if}
 
-> @label:fig:scatter-largest-errors-models-mean-vs-true-hull-dist DFT vs predicted hull distance (average over all models) for the 200 largest error structures colored by model disagreement (as measured by standard deviation in hull distance predictions from different models) and sized by number of atoms in the structures. This plot shows that high-error predictions are biased towards predicting too small hull distance. This is unsurprising considering MP training data mainly consists of low-energy structures.<br>
+> @label:fig:scatter-largest-errors-models-mean-vs-true-hull-dist DFT vs predicted hull distance (average over all models) for the 200 largest error structures colored by model disagreement (as measured by standard deviation in hull distance predictions from different models) and sized by number of atoms in the structures. This plot shows that high-error predictions are biased towards predicting too small hull distance. This is unsurprising considering MP training data mainly consists of low-energy structure.<br>
 > However, note the clear color separation between the mostly blue low-energy-bias predictions and the yellow/red high error prediction. Blue means models are in good agreement, i.e. all models are "wrong" together. Red/yellow are large-error predictions with little model agreement, i.e. all models are wrong in different ways. It is possible that some of the blue points with large error yet good agreement among models are in fact accurate ML predictions for a DFT relaxation gone wrong. Zooming in on the blue points reveals that many of them are large. Larger markers correspond to larger structures where DFT failures are less surprising. This suggests ML model committees could be used to cheaply screen large databases for DFT errors in a high-throughput manner.
 
 ## MEGNet formation energies from UIP-relaxed structures
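The committee-screening idea in the caption amounts to flagging structures where models agree with each other (low prediction std) yet disagree with DFT (large error). A sketch with hypothetical thresholds:

```python
from statistics import mean, stdev


def flag_suspect_dft(dft, model_preds, max_committee_std=0.05, min_error=0.2):
    """Return indices where models agree (std below max_committee_std) yet the
    committee mean is far from DFT -- candidates for a DFT relaxation gone wrong.

    dft: hull distance per structure; model_preds: one prediction list per model.
    Thresholds are illustrative, not values from the paper.
    """
    suspects = []
    for idx, target in enumerate(dft):
        preds = [per_model[idx] for per_model in model_preds]
        if stdev(preds) < max_committee_std and abs(mean(preds) - target) > min_error:
            suspects.append(idx)
    return suspects
```

For example, if three models all predict ~0.41 eV/atom for a structure DFT places at 1.0 eV/atom, that structure is flagged while structures where the committee matches DFT are not.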

`tests/test_data.py` (+1 −1)

@@ -199,7 +199,7 @@ def as_dict(self) -> dict[str, Any]:
 
 
 def test_df_wbm() -> None:
-    assert df_wbm.shape == (256963, 16)
+    assert df_wbm.shape == (256_963, 16)
     assert df_wbm.index.name == "material_id"
     assert set(df_wbm) > {"bandgap_pbe", "formula", "material_id"}