janosh
diff --git a/data/mp/build_phase_diagram.py b/data/mp/build_phase_diagram.py
+8 -7
diff --git a/data/mp/eda_mp_trj.py b/data/mp/eda_mp_trj.py
+36 -71
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
+18 -17
diff --git a/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py b/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
+3 -2
diff --git a/data/wbm/compile_wbm_test_set.py b/data/wbm/compile_wbm_test_set.py
+14 -15
diff --git a/data/wbm/eda_wbm.py b/data/wbm/eda_wbm.py
+2 -2
@@ -14,6 +14,7 @@
 from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from pymatgen.ext.matproj import MPRester
+from pymatviz.io import save_fig
 from tqdm import tqdm
 
 from matbench_discovery import MP_DIR, ROOT, today
@@ -29,21 +30,21 @@
 
 # save all ComputedStructureEntries to disk
 # mp-15590 appears twice so we drop_duplicates()
-df = pd.DataFrame(all_mp_computed_structure_entries, columns=["entry"])
-df.index.name = Key.mat_id
-df.index = [e.entry_id for e in df.entry]
-df.reset_index().to_json(
+df_mp_cse = pd.DataFrame(all_mp_computed_structure_entries, columns=["entry"])
+df_mp_cse.index.name = Key.mat_id
+df_mp_cse.index = [e.entry_id for e in df_mp_cse.entry]
+df_mp_cse.reset_index().to_json(
     f"{module_dir}/{today}-mp-computed-structure-entries.json.gz",
     default_handler=lambda x: x.as_dict(),
 )
 
 
 # %%
 data_path = f"{module_dir}/2023-02-07-mp-computed-structure-entries.json.gz"
-df = pd.read_json(data_path).set_index(Key.mat_id)
+df_mp_cse = pd.read_json(data_path).set_index(Key.mat_id)
 
 # drop the structure, just load ComputedEntry, makes the PPD faster to build and load
-mp_computed_entries = [ComputedEntry.from_dict(dct) for dct in tqdm(df.entry)]
+mp_computed_entries = [ComputedEntry.from_dict(dct) for dct in tqdm(df_mp_cse.entry)]
 
 print(f"{len(mp_computed_entries)=:,} on {today}")
 # len(mp_computed_entries) = 146,323 on 2022-09-16
@@ -118,4 +119,4 @@
     xlabel="MP Formation Energy (eV/atom)",
     ylabel="Our Formation Energy (eV/atom)",
 )
-ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)
+save_fig(ax, f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)
@@ -15,7 +15,13 @@
 import plotly.express as px
 from matplotlib.colors import SymLogNorm
 from pymatgen.core import Composition, Element
-from pymatviz import count_elements, ptable_heatmap, ptable_heatmap_ratio, ptable_hists
+from pymatviz import (
+    count_elements,
+    plot_histogram,
+    ptable_heatmap,
+    ptable_heatmap_ratio,
+    ptable_hists,
+)
 from pymatviz.io import save_fig
 from pymatviz.utils import si_fmt
 from tqdm import tqdm
@@ -321,91 +327,50 @@ def tile_count_anno(hist_vals: list[Any]) -> dict[str, Any]:
 
 
 # %% plot formation energy per atom distribution
+# pdf_kwds defined to use the same figure size for all plots
+fig = plot_histogram(df_mp_trj[Key.e_form], bins=300)
+# fig.update_yaxes(type="log")
+fig.layout.xaxis.title = "E<sub>form</sub> (eV/atom)"
 count_col = "Number of Structures"
-axes_kwds = dict(linewidth=1, ticks="outside")
-pdf_kwds = dict(width=500, height=300)
-
-x_col, y_col = "E<sub>form</sub> (eV/atom)", count_col
-df_e_form = locals().get("df_e_form")
-
-if df_e_form is None:  # only compute once for speed
-    e_form_hist = np.histogram(df_mp_trj[Key.e_form], bins=300)
-    df_e_form = pd.DataFrame(e_form_hist, index=[y_col, x_col]).T.round(3)
-
-fig = px.bar(df_e_form, x=x_col, y=count_col, log_y=True)
-
-bin_width = df_e_form[x_col].diff().iloc[-1] * 1.2
-fig.update_traces(width=bin_width, marker_line_width=0)
-fig.layout.xaxis.update(**axes_kwds)
-fig.layout.yaxis.update(**axes_kwds)
-fig.layout.margin = dict(l=5, r=5, b=5, t=5)
+fig.layout.yaxis.title = count_col
 fig.show()
-save_fig(fig, f"{PDF_FIGS}/mp-trj-e-form-hist.pdf", **pdf_kwds)
-save_fig(fig, f"{SITE_FIGS}/mp-trj-e-form-hist.svelte")
-
-
-# %% plot forces distribution
-# use numpy to pre-compute histogram
-x_col, y_col = "|Forces| (eV/Å)", count_col
-df_forces = locals().get("df_forces")
 
-if df_forces is None:  # only compute once for speed
-    forces_hist = np.histogram(
-        df_mp_trj[Key.forces].explode().explode().abs(), bins=300
-    )
-    df_forces = pd.DataFrame(forces_hist, index=[y_col, x_col]).T.round(3)
+pdf_kwds = dict(width=500, height=300)
+# save_fig(fig, f"{PDF_FIGS}/mp-trj-e-form-hist.pdf", **pdf_kwds)
+# save_fig(fig, f"{SITE_FIGS}/mp-trj-e-form-hist.svelte")
 
-fig = px.bar(df_forces, x=x_col, y=count_col, log_y=True)
 
-bin_width = df_forces[x_col].diff().iloc[-1] * 1.2
-fig.update_traces(width=bin_width, marker_line_width=0)
-fig.layout.xaxis.update(**axes_kwds)
-fig.layout.yaxis.update(**axes_kwds)
-fig.layout.margin = dict(l=5, r=5, b=5, t=5)
+# %% plot forces distribution
+fig = plot_histogram(df_mp_trj[Key.forces].explode().explode().abs(), bins=300)
+fig.layout.xaxis.title = "|Forces| (eV/Å)"
+fig.layout.yaxis.title = count_col
+fig.update_yaxes(type="log")
 fig.show()
-save_fig(fig, f"{PDF_FIGS}/mp-trj-forces-hist.pdf", **pdf_kwds)
-save_fig(fig, f"{SITE_FIGS}/mp-trj-forces-hist.svelte")
 
+# save_fig(fig, f"{PDF_FIGS}/mp-trj-forces-hist.pdf", **pdf_kwds)
+# save_fig(fig, f"{SITE_FIGS}/mp-trj-forces-hist.svelte")
 
-# %% plot hydrostatic stress distribution
-x_col, y_col = "1/3 Tr(σ) (eV/Å³)", count_col  # noqa: RUF001
-df_stresses = locals().get("df_stresses")
-
-if df_stresses is None:  # only compute once for speed
-    stresses_hist = np.histogram(df_mp_trj[Key.stress_trace], bins=300)
-    df_stresses = pd.DataFrame(stresses_hist, index=[y_col, x_col]).T.round(3)
 
-fig = px.bar(df_stresses, x=x_col, y=y_col, log_y=True)
-
-bin_width = (df_stresses[x_col].diff().mean()) * 1.2
-fig.update_traces(width=bin_width, marker_line_width=0)
-fig.layout.xaxis.update(**axes_kwds)
-fig.layout.yaxis.update(**axes_kwds)
-fig.layout.margin = dict(l=5, r=5, b=5, t=5)
+# %% plot hydrostatic stress distribution
+fig = plot_histogram(df_mp_trj[Key.stress_trace], bins=300)
+fig.layout.xaxis.title = "1/3 Tr(σ) (eV/Å³)"  # noqa: RUF001
+fig.layout.yaxis.title = count_col
+fig.update_yaxes(type="log")
 fig.show()
 
-save_fig(fig, f"{PDF_FIGS}/mp-trj-stresses-hist.pdf", **pdf_kwds)
-save_fig(fig, f"{SITE_FIGS}/mp-trj-stresses-hist.svelte")
+# save_fig(fig, f"{PDF_FIGS}/mp-trj-stresses-hist.pdf", **pdf_kwds)
+# save_fig(fig, f"{SITE_FIGS}/mp-trj-stresses-hist.svelte")
 
 
 # %% plot magmoms distribution
-x_col, y_col = "Magmoms (μ<sub>B</sub>)", count_col
-df_magmoms = locals().get("df_magmoms")
-
-if df_magmoms is None:  # only compute once for speed
-    magmoms_hist = np.histogram(df_mp_trj[Key.magmoms].dropna().explode(), bins=300)
-    df_magmoms = pd.DataFrame(magmoms_hist, index=[y_col, x_col]).T.round(3)
-
-fig = px.bar(df_magmoms, x=x_col, y=y_col, log_y=True)
-
-bin_width = df_magmoms[x_col].diff().iloc[-1] * 1.2
-fig.update_traces(width=bin_width, marker_line_width=0)
-fig.layout.xaxis.update(**axes_kwds)
-fig.layout.yaxis.update(**axes_kwds)
-fig.layout.margin = dict(l=5, r=5, b=5, t=5)
+fig = plot_histogram(df_mp_trj[Key.magmoms].dropna().explode(), bins=300)
+fig.layout.xaxis.title = "Magmoms (μB)"
+fig.layout.yaxis.title = count_col
+fig.update_yaxes(type="log")
 fig.show()
-save_fig(fig, f"{PDF_FIGS}/mp-trj-magmoms-hist.pdf", **pdf_kwds)
-save_fig(fig, f"{SITE_FIGS}/mp-trj-magmoms-hist.svelte")
+
+# save_fig(fig, f"{PDF_FIGS}/mp-trj-magmoms-hist.pdf", **pdf_kwds)
+# save_fig(fig, f"{SITE_FIGS}/mp-trj-magmoms-hist.svelte")
 
 
 # %%
 
@@ -11,6 +11,7 @@
 from aviary.wren.utils import get_aflow_label_from_spglib
 from mp_api.client import MPRester
 from pymatgen.core import Structure
+from pymatviz.io import save_fig  # noqa: F401
 from pymatviz.powerups import annotate_metrics
 from tqdm import tqdm
 
@@ -47,13 +48,13 @@
 
 
 # %%
-df = pd.DataFrame(docs).set_index(Key.mat_id)
-df = df.rename(columns={"formula_pretty": Key.formula, "nsites": Key.n_sites})
+df_mp = pd.DataFrame(docs).set_index(Key.mat_id)
+df_mp = df_mp.rename(columns={"formula_pretty": Key.formula, "nsites": Key.n_sites})
 
-df_spg = pd.json_normalize(df.pop("symmetry"))[["number", "symbol"]]
-df["spacegroup_symbol"] = df_spg.symbol.to_numpy()
+df_spg = pd.json_normalize(df_mp.pop("symmetry"))[["number", "symbol"]]
+df_mp["spacegroup_symbol"] = df_spg.symbol.to_numpy()
 
-df.energy_type.value_counts().plot.pie(backend="plotly", autopct="%1.1f%%")
+df_mp.energy_type.value_counts().plot.pie(backend="plotly", autopct="%1.1f%%")
 # GGA: 72.2%, GGA+U: 27.8%
 
 
@@ -69,39 +70,39 @@
 ]
 # make sure symmetry detection succeeded for all structures
 assert df_cse[Key.wyckoff].str.startswith("invalid").sum() == 0
-df[Key.wyckoff] = df_cse[Key.wyckoff]
+df_mp[Key.wyckoff] = df_cse[Key.wyckoff]
 
-spg_nums = df[Key.wyckoff].str.split("_").str[2].astype(int)
+spg_nums = df_mp[Key.wyckoff].str.split("_").str[2].astype(int)
 # make sure all our spacegroup numbers match MP's
 assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()
 
-df.to_csv(DATA_FILES.mp_energies)
+df_mp.to_csv(DATA_FILES.mp_energies)
 # df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(Key.mat_id)
 
 
 # %% reproduce fig. 1b from https://arxiv.org/abs/2001.10591 (as data consistency check)
-ax = df.plot.scatter(
+ax = df_mp.plot.scatter(
     x=Key.form_energy,
     y="decomposition_enthalpy",
     alpha=0.1,
     xlim=[-5, 1],
     ylim=[-1, 1],
-    color=(df.decomposition_enthalpy > STABILITY_THRESHOLD).map(
+    color=(df_mp.decomposition_enthalpy > STABILITY_THRESHOLD).map(
         {True: "red", False: "blue"}
     ),
-    title=f"{today} - {len(df):,} MP entries",
+    title=f"{today} - {len(df_mp):,} MP entries",
 )
 
-annotate_metrics(df.formation_energy_per_atom, df.decomposition_enthalpy)
+annotate_metrics(df_mp.formation_energy_per_atom, df_mp.decomposition_enthalpy)
 # result on 2023-01-10: plots match. no correlation between formation energy and
 # decomposition enthalpy. R^2 = -1.571, MAE = 1.604
-# ax.figure.savefig(f"{module_dir}/mp-decomp-enth-vs-e-form.webp", dpi=300)
+# save_fig(ax, f"{module_dir}/mp-decomp-enth-vs-e-form.webp", dpi=300)
 
 
 # %% scatter plot energy above convex hull vs decomposition enthalpy
 # https://berkeleytheory.slack.com/archives/C16RE1TUN/p1673887564955539
-mask_above_line = df.energy_above_hull - df.decomposition_enthalpy.clip(0) > 0.1
-ax = df.plot.scatter(
+mask_above_line = df_mp.energy_above_hull - df_mp.decomposition_enthalpy.clip(0) > 0.1
+ax = df_mp.plot.scatter(
     x="decomposition_enthalpy",
     y="energy_above_hull",
     color=mask_above_line.map({True: "red", False: "blue"}),
@@ -110,7 +111,7 @@
 # most points lie on line y=x for x > 0 and y = 0 for x < 0.
 n_above_line = sum(mask_above_line)
 ax.set(
-    title=f"{n_above_line:,} / {len(df):,} = {n_above_line / len(df):.1%} "
+    title=f"{n_above_line:,} / {len(df_mp):,} = {n_above_line / len(df_mp):.1%} "
     "MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
 )
-# ax.figure.savefig(f"{module_dir}/mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
+# save_fig(ax, f"{module_dir}/mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
@@ -13,6 +13,7 @@
     MaterialsProjectCompatibility,
 )
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+from pymatviz.io import save_fig
 from tqdm import tqdm
 
 from matbench_discovery import ROOT, today
@@ -93,7 +94,7 @@
 
 ax.axline((0, 0), slope=1, color="gray", linestyle="dashed", zorder=-1)
 
-ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-corrections-outliers.pdf")
+save_fig(ax, f"{ROOT}/tmp/{today}-ce-vs-cse-corrections-outliers.pdf")
 
 
 # %%
@@ -114,7 +115,7 @@
 # insight: all materials for which ComputedEntry and ComputedStructureEntry give
 # different formation energies are oxides or sulfides for which MP 2020 compat takes
 # into account structural information to make more accurate corrections.
-ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-e-form-outliers.pdf")
+save_fig(ax, f"{ROOT}/tmp/{today}-ce-vs-cse-e-form-outliers.pdf")
 
 
 # %% below code resulted in
 
@@ -102,27 +102,31 @@
         continue
 
     print(f"{step=}")
-    df = pd.read_json(json_path).T
+    df_wbm_step = pd.read_json(json_path).T
 
     # we hash index only for speed
     # could use joblib.hash(df) to hash whole df but it's slow
-    checksum = pd.util.hash_pandas_object(df.index).sum()
+    checksum = pd.util.hash_pandas_object(df_wbm_step.index).sum()
     expected = wbm_structs_index_checksums[step - 1]
     assert checksum == expected, (
         f"bad df.index checksum for {step=}, {expected=}, got {checksum=}\n"
         f"\n{json_path=}"
     )
 
     if step == 3:
-        df = df.drop(index=[f"step_3_{wbm_id}" for wbm_id in bad_struct_ids])
+        df_wbm_step = df_wbm_step.drop(
+            index=[f"step_3_{wbm_id}" for wbm_id in bad_struct_ids]
+        )
         # re-index after dropping bad structures to get same indices as summary file
         # where IDs are consecutive, i.e. step_3_70801 is followed by step_3_70802,
         # not step_3_70804, etc.
         # df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]
 
     step_len = step_lens[step - 1]
-    assert len(df) == step_len, f"bad len for {step=}: {len(df)} != {step_len}"
-    dfs_wbm_structs[step] = df
+    assert (
+        len(df_wbm_step) == step_len
+    ), f"bad len for {step=}: {len(df_wbm_step)} != {step_len}"
+    dfs_wbm_structs[step] = df_wbm_step
 
 
 # NOTE step 5 is missing 2 initial structures, see nan_init_structs_ids below
@@ -212,11 +216,11 @@ def increment_wbm_material_id(wbm_id: str) -> str:
         print(f"{json_path=} already loaded.")
         continue
 
-    df = pd.read_json(json_path)
+    df_wbm_step = pd.read_json(json_path)
 
     step_len = step_lens[step - 1]
-    dfs_wbm_cses[step] = df
-    assert len(df) == step_len, f"{step=}: {len(df)} != {step_len}"
+    dfs_wbm_cses[step] = df_wbm_step
+    assert len(df_wbm_step) == step_len, f"{step=}: {len(df_wbm_step)} != {step_len}"
 
 
 # %%
@@ -589,14 +593,9 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 try:
     from aviary.wren.utils import get_aflow_label_from_spglib
 
-    # add Aflow-style Wyckoff labels for initial and relaxed structures
-    for key in (Key.init_wyckoff, Key.wyckoff):
-        if key not in df_wbm:
-            df_summary[key] = None
-
     # from initial structures
     for idx in tqdm(df_wbm.index):
-        if not pd.isna(df_summary.loc[idx, Key.init_wyckoff]):
+        if not pd.isna(df_summary.loc[idx].get(Key.init_wyckoff)):
             continue  # Aflow label already computed
         try:
             struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
@@ -606,7 +605,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 
     # from relaxed structures
     for idx in tqdm(df_wbm.index):
-        if not pd.isna(df_summary.loc[idx, Key.wyckoff]):
+        if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
             continue
 
         try:
 
@@ -24,7 +24,7 @@
 from matbench_discovery import PDF_FIGS, ROOT, SITE_FIGS, STABILITY_THRESHOLD
 from matbench_discovery import plots as plots
 from matbench_discovery.data import DATA_FILES, df_wbm
-from matbench_discovery.energy import mp_elem_reference_entries
+from matbench_discovery.energy import mp_elem_ref_entries
 from matbench_discovery.enums import Key, Model
 from matbench_discovery.preds import df_each_err
 
@@ -237,7 +237,7 @@
         "Name": entry.composition.elements[0].long_name,
         "Material ID": entry.entry_id.replace("-GGA", ""),
     }
-    for key, entry in mp_elem_reference_entries.items()
+    for key, entry in mp_elem_ref_entries.items()
 ]
 df_ref = pd.DataFrame(mp_ref_data).sort_values(atom_num_col)