plot easy vs hard structures (for all models) norm of SiteStats fingerprint difference before/after relaxation

janosh · janosh · commit a6bfa749aba0 · 2023-06-19T20:29:24.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ default_install_hook_types: [pre-commit, commit-msg]
 
 repos:
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.252
+    rev: v0.0.255
     hooks:
       - id: ruff
         args: [--fix]
diff --git a/matbench_discovery/data.py b/matbench_discovery/data.py
@@ -63,7 +63,7 @@ class DataFiles(Files):
         "wbm/2022-10-19-wbm-computed-structure-entries.json.bz2"
     )
     wbm_initial_structures = "wbm/2022-10-19-wbm-init-structs.json.bz2"
-    wbm_computed_structure_entries_plus_init_structs = (
+    wbm_cses_plus_init_structs = (
         "wbm/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
     )
     wbm_summary = "wbm/2022-10-19-wbm-summary.csv"
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -754,8 +754,8 @@ def cumulative_precision_recall(
                 align="left",
             )
         fig.layout.legend.title = ""
-        fig.update_xaxes(showticklabels=True, title="")
-        fig.update_yaxes(showticklabels=True, title="")
+        fig.update_xaxes(showticklabels=True, title="", matches=None)
+        fig.update_yaxes(showticklabels=True, title="", matches=None)
 
     return fig, df_cum
 
diff --git a/models/m3gnet/pre_vs_post_m3gnet_relaxation.py b/models/m3gnet/pre_vs_post_m3gnet_relaxation.py
@@ -23,9 +23,7 @@
 
 
 # %%
-df_wbm = pd.read_json(
-    DATA_FILES.wbm_computed_structure_entries_plus_init_structs
-).set_index("material_id")
+df_wbm = pd.read_json(DATA_FILES.wbm_cses_plus_init_structs).set_index("material_id")
 
 df_summary = pd.read_csv(DATA_FILES.wbm_summary).set_index("material_id")
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,7 +54,7 @@ running-models = ["aviary", "m3gnet", "maml", "megnet"]
 3d-structures = ["crystaltoolkit"]
 
 [tool.setuptools.packages]
-find = { include = ["matbench_discovery"] }
+find = { include = ["matbench_discovery*"], exclude = ["tests*"] }
 
 [tool.setuptools.package-data]
 matbench_discovery = ["data/mp/*.json"]
@@ -66,7 +66,7 @@ universal = true
 target-version = "py39"
 select = [
   "B",   # flake8-bugbear
-  "C4",  # flake8-comprehensions
+  "C40", # flake8-comprehensions
   "D",   # pydocstyle
   "E",   # pycodestyle
   "F",   # pyflakes
diff --git a/scripts/difficult_structures.py b/scripts/difficult_structures.py
@@ -5,56 +5,128 @@
 
 
 # %%
+import itertools
+
 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
-from pymatgen.core import Structure
-from pymatviz import plot_structure_2d, ptable_heatmap_plotly
+from matminer.featurizers.site import CrystalNNFingerprint
+from matminer.featurizers.structure import SiteStatsFingerprint
+from pymatgen.core import Composition, Element, Structure
+from pymatviz import count_elements, plot_structure_2d, ptable_heatmap_plotly
+from tqdm import tqdm
 
-from matbench_discovery import ROOT
+from matbench_discovery import MODELS, ROOT
 from matbench_discovery.data import DATA_FILES
+from matbench_discovery.data import df_wbm as df_summary
 from matbench_discovery.metrics import classify_stable
-from matbench_discovery.preds import df_each_err, df_each_pred, df_preds, each_true_col
+from matbench_discovery.preds import (
+    df_each_err,
+    df_each_pred,
+    df_metrics,
+    df_preds,
+    each_true_col,
+)
 
 __author__ = "Janosh Riebesell"
 __date__ = "2023-02-15"
 
 df_each_err[each_true_col] = df_preds[each_true_col]
-mean_ae_col = "All models mean absolute error (eV/atom)"
+mean_ae_col = "All models MAE (eV/atom)"
 df_each_err[mean_ae_col] = df_preds[mean_ae_col] = df_each_err.abs().mean(axis=1)
 
 
 # %%
-df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
-    "material_id"
-)
+df_wbm = pd.read_json(DATA_FILES.wbm_cses_plus_init_structs).set_index("material_id")
 
 
 # %%
 n_rows, n_cols = 5, 4
-for which in ("best", "worst"):
+for good_bad, init_final in itertools.product(("best", "worst"), ("initial", "final")):
     fig, axs = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows))
-    n_axs = len(axs.flat)
+    n_structs = len(axs.flat)
+    struct_col = {
+        "initial": "initial_structure",
+        "final": "computed_structure_entry",
+    }[init_final]
 
-    errs = (
-        df_each_err.mean_ae.nsmallest(n_axs)
-        if which == "best"
-        else df_each_err.mean_ae.nlargest(n_axs)
+    errs = {
+        "best": df_each_err[mean_ae_col].nsmallest(n_structs),
+        "worst": df_each_err[mean_ae_col].nlargest(n_structs),
+    }[good_bad]
+    title = (
+        f"{good_bad.title()} {len(errs)} {init_final} structures (across "
+        f"{len(list(df_each_pred))} models)\nErrors in (eV/atom)"
     )
-    title = f"{which} {len(errs)} structures (across {len(list(df_each_pred))} models)"
-    fig.suptitle(title, fontsize=16, fontweight="bold", y=0.95)
+    fig.suptitle(title, fontsize=20, fontweight="bold", y=1.05)
 
-    for idx, (ax, (id, err)) in enumerate(zip(axs.flat, errs.items()), 1):
-        struct = Structure.from_dict(
-            df_cse.computed_structure_entry.loc[id]["structure"]
-        )
+    for idx, (ax, (id, error)) in enumerate(zip(axs.flat, errs.items()), 1):
+        struct = df_wbm[struct_col].loc[id]
+        if init_final == "final":  # CSE dicts nest the structure under "structure"
+            struct = struct["structure"]
+        struct = Structure.from_dict(struct)
         plot_structure_2d(struct, ax=ax)
         _, spg_num = struct.get_space_group_info()
         formula = struct.composition.reduced_formula
         ax.set_title(
-            f"{idx}. {formula} (spg={spg_num})\n{id} {err=:.2f}", fontweight="bold"
+            f"{idx}. {formula} (spg={spg_num})\n{id} {error=:.2f}", fontweight="bold"
         )
+    out_path = f"{ROOT}/tmp/figures/{good_bad}-{len(errs)}-structures-{init_final}.webp"
+    fig.savefig(out_path, dpi=300)
+
+
+# %%
+n_structs = 100
+worst_ids = df_each_err[mean_ae_col].nlargest(n_structs).index.tolist()
+best_ids = df_each_err[mean_ae_col].nsmallest(n_structs).index.tolist()
+
+best_init_structs = df_wbm.initial_structure.loc[best_ids].map(Structure.from_dict)
+worst_init_structs = df_wbm.initial_structure.loc[worst_ids].map(Structure.from_dict)
+best_final_structs = df_wbm.computed_structure_entry.loc[best_ids].map(
+    lambda cse: Structure.from_dict(cse["structure"])
+)
+worst_final_structs = df_wbm.computed_structure_entry.loc[worst_ids].map(
+    lambda cse: Structure.from_dict(cse["structure"])
+)
+
+
+# %%
+cnn_fp = CrystalNNFingerprint.from_preset("ops")
+site_stats_fp = SiteStatsFingerprint(
+    cnn_fp, stats=("mean", "std_dev", "minimum", "maximum")
+)
+
+worst_fp_diff_norms = (
+    worst_final_structs.map(site_stats_fp.featurize).map(np.array)
+    - worst_init_structs.map(site_stats_fp.featurize).map(np.array)
+).map(np.linalg.norm)
 
-    fig.savefig(f"{ROOT}/tmp/figures/{which}-{len(errs)}-structures.webp", dpi=300)
+best_fp_diff_norms = (
+    best_final_structs.map(site_stats_fp.featurize).map(np.array)
+    - best_init_structs.map(site_stats_fp.featurize).map(np.array)
+).map(np.linalg.norm)
+
+df_fp = pd.DataFrame(
+    [worst_fp_diff_norms.values, best_fp_diff_norms.values],
+    index=["highest-error structures", "lowest-error structures"],
+).T
+
+
+# %%
+fig = df_fp.plot.hist(backend="plotly", nbins=50, barmode="overlay", opacity=0.8)
+title = (
+    f"SiteStatsFingerprint norm-diff between initial/final {n_structs}<br>"
+    f"highest/lowest-error structures (mean over {len(list(df_each_pred))} models)"
+)
+fig.layout.title.update(text=title, font_size=20, xanchor="center", x=0.5)
+fig.layout.legend.update(
+    title="", yanchor="top", y=0.98, xanchor="right", x=0.98, font_size=16
+)
+fig.layout.xaxis.title = "|SSFP<sub>initial</sub> - SSFP<sub>final</sub>|"
+fig.show()
+fig.write_image(
+    f"{ROOT}/tmp/figures/init-final-fp-diff-norms.webp", width=1000, scale=2
+)
 
 
 # %% plotly scatter plot of largest model errors with points sized by mean error and
@@ -99,8 +171,92 @@
 
 
 # %%
-ptable_heatmap_plotly(df_preds[df_preds.all_false_pos].formula, colorscale="Viridis")
-ptable_heatmap_plotly(df_preds[df_preds.all_false_neg].formula, colorscale="Viridis")
+elem_counts: dict[str, pd.Series] = {}
+for col in ("all_false_neg", "all_false_pos"):
+    elem_counts[col] = count_elements(df_preds.query(col).formula)  # rows where col is True
+    fig = ptable_heatmap_plotly(elem_counts[col], font_size=10)
+    fig.layout.title = col
+    fig.show()
+
+
+# %% scatter plot error by element against prevalence in training set
+df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")
+# compute number of samples per element in training set
+# counting element occurrences not weighted by composition, assuming models don't learn
+# much more about iron and oxygen from Fe2O3 than from FeO
+
+count_col = "MP Occurrences"
+df_elem_err = count_elements(df_mp.formula_pretty, count_mode="occurrence").to_frame(
+    name=count_col
+)
+
+title = "Number of MP structures containing each element"
+fig = df_elem_err[count_col].plot.bar(backend="plotly", title=title)
+fig.update_layout(showlegend=False)
+fig.show()
+
+fig = ptable_heatmap_plotly(df_elem_err[count_col], font_size=10)
+fig.layout.title.update(text=title, x=0.35, y=0.9, font_size=20)
+fig.show()
+
+
+# %% map average model error onto elements
+df_summary["fractional_composition"] = [
+    Composition(comp).fractional_composition for comp in tqdm(df_summary.formula)
+]
+
+df_frac_comp = pd.json_normalize(
+    [comp.as_dict() for comp in df_summary["fractional_composition"]]
+).set_index(df_summary.index)
+assert all(
+    df_frac_comp.sum(axis=1).round(6) == 1
+), "composition fractions don't sum to 1"
+
+(len(df_frac_comp) - df_frac_comp.isna().sum()).sort_values().plot.bar(backend="plotly")
+
+# df_frac_comp = df_frac_comp.dropna(axis=1, thresh=100)  # remove Xe with only 1 entry
+
+
+# %%
+for model in (*df_metrics, mean_ae_col):
+    df_elem_err[model] = (
+        df_frac_comp * df_each_err[model].abs().values[:, None]
+    ).mean()
+    fig = ptable_heatmap_plotly(
+        df_elem_err[model],
+        precision=".2f",
+        fill_value=None,
+        cbar_max=0.2,
+        colorscale="Turbo",
+    )
+    fig.layout.title.update(text=model, x=0.35, y=0.9, font_size=20)
+    fig.show()
+
+
+# %%
+df_elem_err.to_json(f"{MODELS}/per-element/per-element-model-each-errors.json")
+
+
+# %%
+df_elem_err["elem_name"] = [Element(el).long_name for el in df_elem_err.index]
+fig = df_elem_err.plot.scatter(
+    x=count_col,
+    y=mean_ae_col,
+    backend="plotly",
+    hover_name="elem_name",
+    text=df_elem_err.index.where(
+        (df_elem_err[mean_ae_col] > 0.04) | (df_elem_err[count_col] > 10_000)
+    ),
+    title="Correlation between element-error and element-occurrence in<br>training "
+    f"set: {df_elem_err[mean_ae_col].corr(df_elem_err[count_col]):.2f}",
+    hover_data={mean_ae_col: ":.2f", count_col: ":,.0f"},
+)
+
+fig.update_traces(textposition="top center")
+fig.show()
+
+# save_fig(fig, f"{ROOT}/tmp/figures/element-occu-vs-err.webp", scale=2)
+# save_fig(fig, f"{ROOT}/tmp/figures/element-occu-vs-err.pdf")
 
 
 # %%
diff --git a/site/package.json b/site/package.json
@@ -22,10 +22,10 @@
     "@sveltejs/adapter-static": "^2.0.1",
     "@sveltejs/kit": "^1.11.0",
     "@sveltejs/vite-plugin-svelte": "^2.0.3",
-    "@typescript-eslint/eslint-plugin": "^5.54.1",
-    "@typescript-eslint/parser": "^5.54.1",
+    "@typescript-eslint/eslint-plugin": "^5.55.0",
+    "@typescript-eslint/parser": "^5.55.0",
     "elementari": "^0.1.0",
-    "eslint": "^8.35.0",
+    "eslint": "^8.36.0",
     "eslint-plugin-svelte3": "^4.0.0",
     "hastscript": "^7.2.0",
     "js-yaml": "^4.1.0",
@@ -38,14 +38,14 @@
     "rehype-slug": "^5.1.0",
     "remark-math": "3.0.0",
     "svelte": "^3.56.0",
-    "svelte-check": "^3.1.0",
+    "svelte-check": "^3.1.4",
     "svelte-multiselect": "^8.5.0",
-    "svelte-preprocess": "^5.0.1",
-    "svelte-toc": "^0.5.2",
-    "svelte-zoo": "^0.3.4",
-    "svelte2tsx": "^0.6.3",
+    "svelte-preprocess": "^5.0.2",
+    "svelte-toc": "^0.5.3",
+    "svelte-zoo": "^0.4.3",
+    "svelte2tsx": "^0.6.9",
     "tslib": "^2.5.0",
-    "typescript": "^4.9.5",
+    "typescript": "5.0.1-rc",
     "vite": "^4.1.4"
   },
   "prettier": {
diff --git a/site/src/app.css b/site/src/app.css
@@ -24,6 +24,9 @@
   --sms-focus-border: 0.1px solid white;
   --sms-active-color: cornflowerblue;
 }
+html {
+  scroll-behavior: smooth;
+}
 body {
   background: var(--night);
   font-family: -apple-system, BlinkMacSystemFont, Roboto, sans-serif;
diff --git a/site/src/routes/+layout.svelte b/site/src/routes/+layout.svelte
@@ -5,7 +5,7 @@
   import { repository } from '$site/package.json'
   import { CmdPalette } from 'svelte-multiselect'
   import Toc from 'svelte-toc'
-  import { GitHubCorner } from 'svelte-zoo'
+  import { GitHubCorner, PrevNext } from 'svelte-zoo'
   import '../app.css'
 
   const routes = Object.keys(import.meta.glob(`./*/+page.{svx,svelte,md}`)).map(
@@ -16,10 +16,6 @@
     $page.url.pathname === `/api` ? `h1, ` : ``
   }h2, h3, h4):not(.toc-exclude)`
 
-  $: current_route_idx = routes.findIndex((route) => route === $page.url.pathname)
-  // get prev/next route with wrap-around
-  $: next_route = routes[(current_route_idx + 1) % routes.length]
-  $: prev_route = routes[(current_route_idx - 1 + routes.length) % routes.length]
   $: description = {
     '/': `Benchmarking machine learning energy models for materials discovery.`,
     '/about-the-data': `Details about provenance, chemistry and energies in the benchmark's train and test set.`,
@@ -64,10 +60,10 @@
 
   <slot />
 
-  <section>
-    <a href={prev_route} class="link">&laquo; {prev_route}</a>
-    <a href={next_route} class="link">{next_route} &raquo;</a>
-  </section>
+  <PrevNext items={routes} current={$page.url.pathname} let:item={href}>
+    <a {href} class="link" slot="next">{href} &raquo;</a>
+    <a {href} class="link" slot="prev">&laquo; {href}</a>
+  </PrevNext>
 </main>
 
 <Footer />
diff --git a/site/static/prism-vsc-dark-plus.css b/site/static/prism-vsc-dark-plus.css
@@ -27,14 +27,6 @@ pre[class*='language-'] {
   hyphens: none;
 }
 
-pre[class*='language-']::-moz-selection,
-pre[class*='language-'] ::-moz-selection,
-code[class*='language-']::-moz-selection,
-code[class*='language-'] ::-moz-selection {
-  text-shadow: none;
-  background: rgba(29, 59, 83, 0.99);
-}
-
 pre[class*='language-']::selection,
 pre[class*='language-'] ::selection,
 code[class*='language-']::selection,

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ class DataFiles(Files):`
`63`	`63`	`"wbm/2022-10-19-wbm-computed-structure-entries.json.bz2"`
`64`	`64`	`)`
`65`	`65`	`wbm_initial_structures = "wbm/2022-10-19-wbm-init-structs.json.bz2"`
`66`		`- wbm_computed_structure_entries_plus_init_structs = (`
	`66`	`+ wbm_cses_plus_init_structs = (`
`67`	`67`	`"wbm/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"`
`68`	`68`	`)`
`69`	`69`	`wbm_summary = "wbm/2022-10-19-wbm-summary.csv"`
Original file line number	Diff line number	Diff line change
`@@ -754,8 +754,8 @@ def cumulative_precision_recall(`
`754`	`754`	`align="left",`
`755`	`755`	`)`
`756`	`756`	`fig.layout.legend.title = ""`
`757`		`- fig.update_xaxes(showticklabels=True, title="")`
`758`		`- fig.update_yaxes(showticklabels=True, title="")`
	`757`	`+ fig.update_xaxes(showticklabels=True, title="", matches=None)`
	`758`	`+ fig.update_yaxes(showticklabels=True, title="", matches=None)`
`759`	`759`
`760`	`760`	`return fig, df_cum`
`761`	`761`