Fix pymatviz `bin_df_cols` imports (moved from `pymatviz.powerups` to `pymatviz.utils`) + ruff fixes + bump site deps

janosh · janosh · commit a3b43624b5bb · 2024-04-16T10:58:30.000+02:00
diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml
@@ -11,10 +11,11 @@ jobs:
   build:
     uses: janosh/workflows/.github/workflows/nodejs-gh-pages.yml@main
     with:
+      install-cmd: npm install --force
       python-version: "3.11"
       working-directory: site
       pre-build: |
         pip install lazydocs
         # lazydocs needs package deps to be installed
-        pip install -e ..
+        pip install --editable ..
         python ../scripts/make_api_docs.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,14 +1,14 @@
 ci:
   autoupdate_schedule: quarterly
-  skip: [pyright]
+  skip: [pyright, eslint]
 
 default_stages: [commit]
 
 default_install_hook_types: [pre-commit, commit-msg]
 
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.4
+    rev: v0.3.7
     hooks:
       - id: ruff
         args: [--fix]
@@ -20,7 +20,7 @@ repos:
       - id: format-ipy-cells
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: check-case-conflict
       - id: check-symlinks
@@ -57,31 +57,29 @@ repos:
         exclude: ^(site/src/figs/.+\.svelte|data/wbm/20.+\..+|site/src/(routes|figs).+\.(yaml|json)|changelog.md)$
 
   - repo: https://github.com/pre-commit/mirrors-eslint
-    rev: v9.0.0-rc.0
+    rev: v9.0.0
     hooks:
       - id: eslint
         types: [file]
-        args: [--fix]
+        args: [--fix, --config, site/eslint.config.js]
         files: \.(js|ts|svelte)$
         additional_dependencies:
           - eslint
+          - eslint-plugin-svelte
           - svelte
           - typescript
-          - eslint-plugin-svelte
-          - "@typescript-eslint/eslint-plugin"
-          - "@typescript-eslint/parser"
-          - svelte-eslint-parser
+          - typescript-eslint
 
   - repo: https://github.com/python-jsonschema/check-jsonschema
-    rev: 0.28.0
+    rev: 0.28.2
     hooks:
       - id: check-jsonschema
         files: ^models/(.+)/\1.*\.yml$
         args: [--schemafile, tests/model-schema.yml]
       - id: check-github-actions
 
   - repo: https://github.com/RobertCraigie/pyright-python
-    rev: v1.1.356
+    rev: v1.1.358
     hooks:
       - id: pyright
         args: [--level, error]
diff --git a/data/mp/build_phase_diagram.py b/data/mp/build_phase_diagram.py
@@ -45,7 +45,7 @@
 # drop the structure, just load ComputedEntry, makes the PPD faster to build and load
 mp_computed_entries = [ComputedEntry.from_dict(dct) for dct in tqdm(df.entry)]
 
-print(f"{len(mp_computed_entries) = :,} on {today}")
+print(f"{len(mp_computed_entries)=:,} on {today}")
 # len(mp_computed_entries) = 146,323 on 2022-09-16
 # len(mp_computed_entries) = 154,719 on 2023-02-07
 
diff --git a/data/mp/eda_mp_trj.py b/data/mp/eda_mp_trj.py
@@ -71,11 +71,14 @@
 
 
 # %%
-info_to_id = lambda info: f"{info[Key.task_id]}-{info['calc_id']}-{info['ionic_step']}"
+def info_dict_to_id(info: dict[str, int | str]) -> str:
+    """Construct a unique frame ID from the atoms info dict."""
+    return f"{info[Key.task_id]}-{info['calc_id']}-{info['ionic_step']}"
+
 
 df_mp_trj = pd.DataFrame(
     {
-        info_to_id(atoms.info): atoms.info
+        info_dict_to_id(atoms.info): atoms.info
         | {key: atoms.arrays.get(key) for key in ("forces", "magmoms")}
         | {"formula": str(atoms.symbols), Key.site_nums: atoms.symbols}
         for atoms_list in tqdm(mp_trj_atoms.values(), total=len(mp_trj_atoms))
@@ -101,8 +104,8 @@
 # %%
 def tile_count_anno(hist_vals: list[Any]) -> dict[str, Any]:
     """Annotate each periodic table tile with the number of values in its histogram."""
-    facecolor = cmap(norm(np.sum(len(hist_vals)))) if hist_vals else "none"
-    bbox = dict(facecolor=facecolor, alpha=0.4, pad=2, edgecolor="none")
+    face_color = cmap(norm(np.sum(len(hist_vals)))) if hist_vals else "none"
+    bbox = dict(facecolor=face_color, alpha=0.4, pad=2, edgecolor="none")
     return dict(text=si_fmt(len(hist_vals), ".0f"), bbox=bbox)
 
 
@@ -116,7 +119,7 @@ def tile_count_anno(hist_vals: list[Any]) -> dict[str, Any]:
     # project magmoms onto symbols in dict
     df_mp_trj_elem_magmom = pd.DataFrame(
         [
-            dict(zip(elems, magmoms))
+            dict(zip(elems, magmoms, strict=False))
             for elems, magmoms in df_mp_trj.set_index(Key.site_nums)[Key.magmoms]
             .dropna()
             .items()
@@ -159,7 +162,7 @@ def tile_count_anno(hist_vals: list[Any]) -> dict[str, Any]:
 if srs_mp_trj_elem_forces is None:
     df_mp_trj_elem_forces = pd.DataFrame(
         [
-            dict(zip(elems, np.abs(forces).mean(axis=1)))
+            dict(zip(elems, np.abs(forces).mean(axis=1), strict=False))
             for elems, forces in df_mp_trj.set_index(Key.site_nums)[Key.forces].items()
         ]
     )
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -42,7 +42,7 @@
     docs = mpr.thermo.search(fields=fields, thermo_types=["GGA_GGA+U"])
 
 assert fields == set(docs[0]), f"missing fields: {fields - set(docs[0])}"
-print(f"{today}: {len(docs) = :,}")
+print(f"{today}: {len(docs)=:,}")
 # 2022-08-13: len(docs) = 146,323
 # 2023-01-10: len(docs) = 154,718
 
diff --git a/data/mp/get_mp_traj.py b/data/mp/get_mp_traj.py
@@ -54,7 +54,7 @@
         key=lambda doc: int(doc[Key.task_id].split("-")[1]),
     )
 
-    print(f"{today}: {len(task_docs) = :,}")
+    print(f"{today}: {len(task_docs)=:,}")
 
     df_tasks = pd.DataFrame(task_docs).drop(columns=["_id"]).set_index(Key.task_id)
     df_tasks.task_type.value_counts(dropna=False).plot.pie()
diff --git a/data/wbm/eda_wbm.py b/data/wbm/eda_wbm.py
@@ -59,6 +59,7 @@
 
 
 # %% print prevalence of stable structures in full WBM and uniq-prototypes only
+print(f"{STABILITY_THRESHOLD=}")
 for df, label in (
     (df_wbm, "full WBM"),
     (df_wbm.query(Key.uniq_proto), "WBM unique prototypes"),
@@ -67,6 +68,10 @@
     stable_rate = n_stable / len(df)
     print(f"{label}: {stable_rate=:.1%} ({n_stable:,} out of {len(df):,})")
 
+# on 2024-04-15: STABILITY_THRESHOLD=0
+# full WBM: stable_rate=16.7% (42,825 out of 256,963)
+# WBM unique prototypes: stable_rate=15.3% (32,942 out of 215,488)
+
 
 # %%
 for dataset, count_mode, elem_counts in all_counts:
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -39,7 +39,7 @@
 colorway = ("lightseagreen", "orange", "lightsalmon", "dodgerblue")
 clf_labels = ("True Positive", "False Negative", "False Positive", "True Negative")
 clf_colors = ("lightseagreen", "orange", "lightsalmon", "dodgerblue")
-clf_color_map = dict(zip(clf_labels, clf_colors))
+clf_color_map = dict(zip(clf_labels, clf_colors, strict=True))
 
 
 def hist_classified_stable_vs_hull_dist(
diff --git a/matbench_discovery/structure.py b/matbench_discovery/structure.py
@@ -45,7 +45,7 @@ def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
     plt.axvline(mean, color="gray", linestyle="dashed", linewidth=1)
     # annotate the mean line
     plt.annotate(
-        f"{mean = :.2f}",
+        f"{mean=:.2f}",
         xy=(mean, 1),
         # use ax coords for y
         xycoords=("data", "axes fraction"),
diff --git a/models/bowsr/join_bowsr_results.py b/models/bowsr/join_bowsr_results.py
@@ -38,9 +38,7 @@
 df_wbm = pd.read_csv(DATA_FILES.wbm_summary).set_index(Key.mat_id)
 
 
-print(
-    f"{len(df_bowsr) - len(df_wbm) = :,} missing ({len(df_bowsr):,} - {len(df_wbm):,})"
-)
+print(f"{len(df_bowsr) - len(df_wbm)=:,} missing ({len(df_bowsr):,} - {len(df_wbm):,})")
 
 
 # %% sanity check: since Bowsr uses MEGNet as energy model final BOWSR energy and Megnet
diff --git a/models/chgnet/analyze_chgnet.py b/models/chgnet/analyze_chgnet.py
@@ -79,11 +79,9 @@
 struct_col = Key.init_struct
 
 fig.suptitle(f"{n_struct} {struct_col} {title}", fontsize=16, fontweight="bold", y=1.05)
-for idx, (ax, row) in enumerate(
-    zip(axs.flat, df_cse.loc[df_diff.index].itertuples()), 1
-):
+for idx, row in enumerate(df_cse.loc[df_diff.index].itertuples(), 1):
     struct = Structure.from_dict(getattr(row, struct_col))
-    plot_structure_2d(struct, ax=ax)
+    ax = plot_structure_2d(struct, ax=axs.flat[idx - 1])
     _, spg_num = struct.get_space_group_info()
     formula = struct.composition.reduced_formula
     ax.set_title(f"{idx}. {formula} (spg={spg_num})\n{row.Index}", fontweight="bold")
diff --git a/models/chgnet/ctk_trajectory_viewer.py b/models/chgnet/ctk_trajectory_viewer.py
@@ -187,8 +187,8 @@ def update_structure(step: int) -> tuple[Structure, go.Figure]:
             init_struct.lattice = lattice
             if len(init_struct) != len(coords):
                 raise ValueError(f"{len(init_struct)} != {len(coords)}")
-            for site, coord in zip(init_struct, coords):
-                site.coords = coord
+            for idx, site in enumerate(init_struct):
+                site.coords = coords[idx]
 
             spg = init_struct.get_space_group_info()
             title = f"{material_id} - Spacegroup = {spg}"
diff --git a/models/m3gnet/pre_vs_post_m3gnet_relaxation.py b/models/m3gnet/pre_vs_post_m3gnet_relaxation.py
@@ -162,17 +162,17 @@
 )
 
 wbm_pbc_diffs_mean = df_m3gnet_is2re.wbm_pbc_diffs.mean()
-print(f"{wbm_pbc_diffs_mean = :.3}")
+print(f"{wbm_pbc_diffs_mean=:.3}")
 
 m3gnet_pbc_diffs_mean = df_m3gnet_is2re.m3gnet_pbc_diffs.mean()
-print(f"{m3gnet_pbc_diffs_mean = :.3}")
+print(f"{m3gnet_pbc_diffs_mean=:.3}")
 
 m3gnet_to_final_wbm_pbc_diffs_mean = (
     df_m3gnet_is2re.m3gnet_to_final_wbm_pbc_diffs.mean()
 )
-print(f"{m3gnet_to_final_wbm_pbc_diffs_mean = :.3}")
+print(f"{m3gnet_to_final_wbm_pbc_diffs_mean=:.3}")
 
-print(f"{wbm_pbc_diffs_mean / m3gnet_pbc_diffs_mean = :.3}")
+print(f"{wbm_pbc_diffs_mean / m3gnet_pbc_diffs_mean=:.3}")
 
 
 # %%
diff --git a/models/wrenformer/analyze_wrenformer.py b/models/wrenformer/analyze_wrenformer.py
@@ -4,10 +4,12 @@
 import numpy as np
 import pandas as pd
 from aviary.wren.utils import get_isopointal_proto_from_aflow
+from IPython.display import display
 from pymatviz import spacegroup_hist, spacegroup_sunburst
 from pymatviz.io import df_to_html_table, df_to_pdf, save_fig
-from pymatviz.powerups import add_identity_line, bin_df_cols
+from pymatviz.powerups import add_identity_line
 from pymatviz.ptable import ptable_heatmap_plotly
+from pymatviz.utils import bin_df_cols
 
 from matbench_discovery import PDF_FIGS, SITE_FIGS, Model
 from matbench_discovery.data import DATA_FILES, df_wbm
@@ -20,6 +22,7 @@
 
 # %%
 model = Model.wrenformer
+model_low = model.lower()
 max_each_true = 1
 min_each_pred = 1
 df_each_pred[Key.each_true] = df_preds[Key.each_true]
@@ -42,9 +45,11 @@
 
 
 # %%
-ax = spacegroup_hist(df_bad[Key.spacegroup])
-ax.set_title(f"Spacegroup hist for {title}", y=1.15)
-save_fig(ax, f"{PDF_FIGS}/spacegroup-hist-{model.lower()}-failures.pdf")
+fig = spacegroup_hist(df_bad[Key.spacegroup])
+fig.layout.title.update(text=f"Spacegroup hist for {title}", y=0.96)
+fig.layout.margin.update(l=0, r=0, t=80, b=0)
+save_fig(fig, f"{PDF_FIGS}/spacegroup-hist-{model.lower()}-failures.pdf")
+fig.show()
 
 
 # %%
@@ -68,29 +73,32 @@
 df_proto_counts[proto_col] = df_proto_counts[proto_col].str.replace("_", "-")
 
 styler = df_proto_counts.head(10).style.background_gradient(cmap="viridis")
-
-df_to_html_table(styler, f"{SITE_FIGS}/proto-counts-{model}-failures.svelte")
-df_to_pdf(styler, f"{PDF_FIGS}/proto-counts-{model}-failures.pdf")
+styler.set_caption(f"Top 10 {proto_col} in {len(df_bad)} {model} failures")
+display(styler)
+df_to_html_table(styler, f"{SITE_FIGS}/proto-counts-{model_low}-failures.svelte")
+df_to_pdf(styler, f"{PDF_FIGS}/proto-counts-{model_low}-failures.pdf")
 
 
 # %%
-fig = spacegroup_sunburst(df_bad[Key.spacegroup], width=350, height=350)
+fig = spacegroup_sunburst(
+    df_bad[Key.spacegroup], width=350, height=350, show_counts="percent"
+)
 # fig.layout.title.update(text=f"Spacegroup sunburst for {title}", x=0.5, font_size=14)
 fig.layout.margin.update(l=1, r=1, t=1, b=1)
 fig.show()
 
 
 # %%
-save_fig(fig, f"{PDF_FIGS}/spacegroup-sunburst-{model.lower()}-failures.pdf")
-save_fig(fig, f"{SITE_FIGS}/spacegroup-sunburst-{model}-failures.svelte")
+save_fig(fig, f"{PDF_FIGS}/spacegroup-sunburst-{model_low}-failures.pdf")
+save_fig(fig, f"{SITE_FIGS}/spacegroup-sunburst-{model_low}-failures.svelte")
 
 
 # %%
 fig = ptable_heatmap_plotly(df_bad[Key.formula])
 fig.layout.title = f"Elements in {title}"
 fig.layout.margin = dict(l=0, r=0, t=50, b=0)
 fig.show()
-save_fig(fig, f"{PDF_FIGS}/elements-{model.lower()}-failures.pdf")
+save_fig(fig, f"{PDF_FIGS}/elements-{model_low}-failures.pdf")
 
 
 # %%
diff --git a/scripts/analyze_model_failure_cases.py b/scripts/analyze_model_failure_cases.py
@@ -41,29 +41,27 @@
     n_structs = len(axs.flat)
     struct_col = {"initial": Key.init_struct, "final": Key.cse}[init_or_final]
 
-    errs = {
+    errors = {
         "best": df_each_err[Key.each_err_models].nsmallest(n_structs),
         "worst": df_each_err[Key.each_err_models].nlargest(n_structs),
     }[good_or_bad]
     title = (
-        f"{good_or_bad.title()} {len(errs)} {init_or_final} structures (across "
+        f"{good_or_bad.title()} {len(errors)} {init_or_final} structures (across "
         f"{len(list(df_each_pred))} models)\nErrors in (ev/atom)"
     )
     fig.suptitle(title, fontsize=20, fontweight="bold", y=1.05)
 
-    for idx, (ax, (mat_id, error)) in enumerate(zip(axs.flat, errs.items()), 1):
+    for idx, (mat_id, error) in enumerate(errors.items(), 1):
         struct = df_cse[struct_col].loc[mat_id]
         if "structure" in struct:
             struct = struct["structure"]
         struct = Structure.from_dict(struct)
-        plot_structure_2d(struct, ax=ax)
+        ax = plot_structure_2d(struct, ax=axs.flat[idx - 1])
         _, spg_num = struct.get_space_group_info()
         formula = struct.composition.reduced_formula
-        ax.set_title(
-            f"{idx}. {formula} (spg={spg_num})\n{mat_id} {error=:.2f}",
-            fontweight="bold",
-        )
-    out_path = f"{PDF_FIGS}/{good_or_bad}-{len(errs)}-structures-{init_or_final}.webp"
+        ax_title = f"{idx}. {formula} (spg={spg_num})\n{mat_id} {error=:.2f}"
+        ax.set_title(ax_title, fontweight="bold")
+    out_path = f"{PDF_FIGS}/{good_or_bad}-{len(errors)}-structures-{init_or_final}.webp"
     # fig.savefig(out_path, dpi=300)
 
 
@@ -73,7 +71,7 @@
 for idx, model in enumerate((Key.each_err_models, *df_metrics)):
     large_errors = df_each_err[model].abs().nlargest(n_structs)
     small_errors = df_each_err[model].abs().nsmallest(n_structs)
-    for label, errors in zip(("min", "max"), (large_errors, small_errors)):
+    for label, errors in (("min", large_errors), ("max", small_errors)):
         fig.add_histogram(
             x=df_wbm.loc[errors.index][fp_diff_col].values,
             name=f"{model} err<sub>{label}</sub>",
@@ -339,7 +337,7 @@
 y_label = "E<sub>above hull</sub> error (eV/atom)"
 n_structs = 1000
 
-for label, which in zip(("min", "max"), ("nlargest", "nsmallest")):
+for label, which in (("min", "nlargest"), ("max", "nsmallest")):
     fig = go.Figure()
     for model in df_metrics:
         errors = getattr(df_each_err[model].abs(), which)(n_structs)
diff --git a/scripts/model_figs/parity_energy_models.py b/scripts/model_figs/parity_energy_models.py
@@ -10,7 +10,8 @@
 import numpy as np
 import plotly.express as px
 from pymatviz.io import save_fig
-from pymatviz.powerups import add_identity_line, bin_df_cols
+from pymatviz.powerups import add_identity_line
+from pymatviz.utils import bin_df_cols
 
 from matbench_discovery import PDF_FIGS, SITE_FIGS
 from matbench_discovery.enums import Key, TestSubset
diff --git a/scripts/model_figs/rolling_mae_vs_hull_dist_wbm_batches.py b/scripts/model_figs/rolling_mae_vs_hull_dist_wbm_batches.py
@@ -69,7 +69,7 @@
     df_each_step = df_each_pred[df_each_pred.index.str.startswith(f"wbm-{idx}-")]
 
     title = f"Batch {idx} ({len(df_step.filter(like='e_').dropna()):,})"
-    assert 1e4 < len(df_step) < 1e5, print(f"{len(df_step) = :,}")
+    assert 1e4 < len(df_step) < 1e5, print(f"{len(df_step)=:,}")
     assert (df_step.index == df_each_step.index).all()
 
     ax, df_err, df_std = rolling_mae_vs_hull_dist(
diff --git a/site/.eslintrc.yml b/site/.eslintrc.yml
diff --git a/site/eslint.config.js b/site/eslint.config.js
diff --git a/site/package.json b/site/package.json
diff --git a/site/src/figs/spacegroup-sunburst-wrenformer-failures.svelte b/site/src/figs/spacegroup-sunburst-wrenformer-failures.svelte
diff --git a/tests/test_slurm.py b/tests/test_slurm.py

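Rendered view of the `data/mp/get_mp_traj.py` hunk (the same change shown in the unified diff above):

Original file line number	Diff line number	Diff line change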
`@@ -54,7 +54,7 @@`
`54`	`54`	`key=lambda doc: int(doc[Key.task_id].split("-")[1]),`
`55`	`55`	`)`
`56`	`56`
`57`		`- print(f"{today}: {len(task_docs) = :,}")`
	`57`	`+ print(f"{today}: {len(task_docs)=:,}")`
`58`	`58`
`59`	`59`	`df_tasks = pd.DataFrame(task_docs).drop(columns=["_id"]).set_index(Key.task_id)`
`60`	`60`	`df_tasks.task_type.value_counts(dropna=False).plot.pie()`