janosh
diff --git a/‎.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion b/‎.pre-commit-config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎data/wbm/analysis.py
Lines changed: 29 additions & 31 deletions b/‎data/wbm/analysis.py
Lines changed: 29 additions & 31 deletions
diff --git a/‎data/wbm/fetch_process_wbm_dataset.py
Lines changed: 18 additions & 4 deletions b/‎data/wbm/fetch_process_wbm_dataset.py
Lines changed: 18 additions & 4 deletions
diff --git a/‎data/wbm/2022-12-07-hist-e-form-per-atom.svg renamed to ‎data/wbm/figs/2022-12-07-hist-e-form-per-atom.svg b/‎data/wbm/2022-12-07-hist-e-form-per-atom.svg renamed to ‎data/wbm/figs/2022-12-07-hist-e-form-per-atom.svg
diff --git a/‎data/wbm/figs/2023-01-08-mp-elements.svg
Lines changed: 1 addition & 0 deletions b/‎data/wbm/figs/2023-01-08-mp-elements.svg
Lines changed: 1 addition & 0 deletions
diff --git a/‎data/wbm/figs/2023-01-08-wbm-elements.svg
Lines changed: 1 addition & 0 deletions b/‎data/wbm/figs/2023-01-08-wbm-elements.svg
Lines changed: 1 addition & 0 deletions
diff --git a/‎data/wbm/readme.md
Lines changed: 6 additions & 6 deletions b/‎data/wbm/readme.md
Lines changed: 6 additions & 6 deletions
diff --git a/‎matbench_discovery/__init__.py
Lines changed: 2 additions & 4 deletions b/‎matbench_discovery/__init__.py
Lines changed: 2 additions & 4 deletions
diff --git a/‎matbench_discovery/data.py
Lines changed: 5 additions & 0 deletions b/‎matbench_discovery/data.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎matbench_discovery/plots.py
Lines changed: 2 additions & 2 deletions b/‎matbench_discovery/plots.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎models/bowsr/join_bowsr_results.py
Lines changed: 14 additions & 3 deletions b/‎models/bowsr/join_bowsr_results.py
Lines changed: 14 additions & 3 deletions
diff --git a/‎models/bowsr/test_bowsr.py
Lines changed: 5 additions & 6 deletions b/‎models/bowsr/test_bowsr.py
Lines changed: 5 additions & 6 deletions
diff --git a/‎models/m3gnet/test_m3gnet.py
Lines changed: 1 addition & 2 deletions b/‎models/m3gnet/test_m3gnet.py
Lines changed: 1 addition & 2 deletions
diff --git a/‎readme.md
Lines changed: 2 additions & 2 deletions b/‎readme.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎scripts/cumulative_clf_metrics.py
Lines changed: 12 additions & 7 deletions b/‎scripts/cumulative_clf_metrics.py
Lines changed: 12 additions & 7 deletions
@@ -72,7 +72,7 @@ repos:
           - prettier
           - prettier-plugin-svelte
           - svelte
-        exclude: ^(site/static/.+\.svelte|data/wbm/20.+\..+|site/src/routes/.+\.(yml|yaml|json))$
+        exclude: ^(site/src/figs/.+\.svelte|data/wbm/20.+\..+|site/src/routes/.+\.(yml|yaml|json))$
 
   - repo: https://github.com/pre-commit/mirrors-eslint
     rev: v8.31.0
 
@@ -5,7 +5,8 @@
 from pymatviz import count_elements, ptable_heatmap_plotly
 from pymatviz.utils import save_fig
 
-from matbench_discovery import ROOT, today
+from matbench_discovery import FIGS, today
+from matbench_discovery.data import df_wbm
 
 module_dir = os.path.dirname(__file__)
 
@@ -15,71 +16,68 @@
 
 
 # %%
-df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
-    "material_id"
-)
-elem_counts = count_elements(df_summary.formula).astype(int)
+wbm_elem_counts = count_elements(df_wbm.formula).astype(int)
 
-elem_counts.to_json(
-    f"{ROOT}/site/src/routes/about-the-test-set/{today}-wbm-element-counts.json"
-)
+# wbm_elem_counts.to_json(
+#     f"{ROOT}/site/src/routes/about-the-test-set/{today}-wbm-element-counts.json"
+# )
 
 
 # %%
-fig = ptable_heatmap_plotly(
-    elem_counts,
+wbm_fig = ptable_heatmap_plotly(
+    wbm_elem_counts.drop("Xe"),
     log=True,
-    colorscale="YlGnBu",
+    colorscale="RdBu",
     hover_props=dict(atomic_number="atomic number"),
-    hover_data=elem_counts,
-    font_size="1vw",
+    hover_data=wbm_elem_counts,
 )
 
 title = "WBM Elements"
-fig.update_layout(
+wbm_fig.update_layout(
     title=dict(text=title, x=0.35, y=0.9, font_size=20),
     xaxis=dict(fixedrange=True),
     yaxis=dict(fixedrange=True),
     paper_bgcolor="rgba(0,0,0,0)",
 )
-fig.show()
+wbm_fig.show()
 
 
 # %%
-fig.write_image(f"{module_dir}/{today}-wbm-elements.svg", width=1000, height=500)
-save_fig(fig, f"{module_dir}/{today}-wbm-elements.svelte")
+wbm_fig.write_image(
+    f"{module_dir}/figs/{today}-wbm-elements.svg", width=1000, height=500
+)
+save_fig(wbm_fig, f"{FIGS}/{today}-wbm-elements.svelte")
 
 
 # %% load MP training set
 df = pd.read_json(f"{module_dir}/../mp/2022-08-13-mp-energies.json.gz")
-elem_counts = count_elements(df.formula_pretty).astype(int)
+mp_elem_counts = count_elements(df.formula_pretty).astype(int)
 
-elem_counts.to_json(
-    f"{ROOT}/site/src/routes/about-the-test-set/{today}-mp-element-counts.json"
-)
-elem_counts.describe()
+# mp_elem_counts.to_json(
+#     f"{ROOT}/site/src/routes/about-the-test-set/{today}-mp-element-counts.json"
+# )
+mp_elem_counts.describe()
 
 
 # %%
-fig = ptable_heatmap_plotly(
-    elem_counts[elem_counts > 1],
+mp_fig = ptable_heatmap_plotly(
+    mp_elem_counts[mp_elem_counts > 1],
     log=True,
-    colorscale="YlGnBu",
+    colorscale="RdBu",
     hover_props=dict(atomic_number="atomic number"),
-    hover_data=elem_counts,
-    font_size="1vw",
+    hover_data=mp_elem_counts,
 )
 
 title = "MP Elements"
-fig.update_layout(
+mp_fig.update_layout(
     title=dict(text=title, x=0.35, y=0.9, font_size=20),
     xaxis=dict(fixedrange=True),
     yaxis=dict(fixedrange=True),
     paper_bgcolor="rgba(0,0,0,0)",
 )
-fig.show()
+mp_fig.show()
 
 
 # %%
-fig.write_image(f"{module_dir}/{today}-mp-elements.svg", width=1000, height=500)
-save_fig(fig, f"{module_dir}/{today}-mp-elements.svelte")
+mp_fig.write_image(f"{module_dir}/figs/{today}-mp-elements.svg", width=1000, height=500)
+# save_fig(mp_fig, f"{FIGS}/{today}-mp-elements.svelte")
@@ -433,7 +433,9 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 n_too_unstable = sum(df_summary.e_form_per_atom_wbm > e_form_cutoff)
 print(f"{n_too_unstable = }")  # n_too_unstable = 22
 
-fig = df_summary.hist(x="e_form_per_atom_wbm", backend="plotly", log_y=True)
+fig = df_summary.hist(
+    x="e_form_per_atom_wbm", backend="plotly", log_y=True, range_x=[-5.5, 5.5]
+)
 fig.add_vline(x=e_form_cutoff, line=dict(width=2, dash="dash", color="green"))
 fig.add_vline(x=-e_form_cutoff, line=dict(width=2, dash="dash", color="green"))
 fig.add_annotation(
@@ -443,15 +445,27 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 )
 x_axis_title = "WBM uncorrected formation energy (eV/atom)"
 fig.update_layout(xaxis_title=x_axis_title, margin=dict(l=10, r=10, t=40, b=10))
+# disable zooming on the y-axis
+fig.update_yaxes(fixedrange=True)
+fig.show(
+    config=dict(
+        modeBarButtonsToRemove=["lasso2d", "select2d", "autoScale2d", "toImage"],
+        displaylogo=False,
+    )
+)
 
 
 # %%
 # no need to store all 250k x values in the plot (leads to a 1.7 MB file); keeping
 # every 10th point is enough to see the distribution
-fig.data[0].x = fig.data[0].x[::10]
+if not fig.data[0].compressed:
+    fig.data[0].compressed = True
+    # keep only every 10th data point, round to 3 decimal places to reduce file size
+    fig.data[0].x = [round(x, 3) for x in fig.data[0].x[::10]]
+
 # recommended to upload SVG to vecta.io/nano afterwards for compression
-img_path = f"{module_dir}/{today}-hist-e-form-per-atom"
-save_fig(fig, f"{img_path}.svg", width=800, height=300)
+img_path = f"{module_dir}/2022-12-07-hist-e-form-per-atom"
+# save_fig(fig, f"{img_path}.svg", width=800, height=300)
 save_fig(fig, f"{img_path}.svelte")
 
 
 
@@ -23,9 +23,9 @@ The full set of processing steps used to curate the WBM test set from the raw da
 - remove 6 pathological structures (with 0 volume)
 - remove formation energy outliers below -5 and above 5 eV/atom (502 and 22 crystals respectively out of 257,487 total, including an anomaly of 500 structures at exactly -10 eV/atom)
 
-  <caption>WBM Formation energy distribution. 524 materials outside green dashed lines were discarded.</caption>
+  <caption>WBM Formation energy distribution. 524 materials outside green dashed lines were discarded.<br />(zoom out on this plot to see discarded samples)</caption>
   <slot name="hist-e-form-per-atom">
-    <img src="./2022-12-07-hist-e-form-per-atom.svg" alt="WBM formation energy histogram indicating outlier cutoffs">
+    <img src="./figs/2022-12-07-hist-e-form-per-atom.svg" alt="WBM formation energy histogram indicating outlier cutoffs">
   </slot>
 
 - apply the [`MaterialsProject2020Compatibility`](https://pymatgen.org/pymatgen.entries.compatibility.html#pymatgen.entries.compatibility.MaterialsProject2020Compatibility) energy correction scheme to the formation energies
@@ -75,13 +75,13 @@ materialscloud:2021.68 includes a readme file with a description of the dataset,
 ## 📊 &thinsp; Plots
 
 <slot name="wbm-elements-heatmap">
-  <img src="./2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
+  <img src="./figs/2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
 </slot>
-<caption>Test set element counts consisting of 256,963 WBM <code>ComputedStructureEntries</code></caption>
+<caption>Element counts for test set consisting of 256,963 WBM <code>ComputedStructureEntries</code></caption>
 
 By comparison, the training set of MP ComputedStructureEntries has this element distribution.
 
 <slot name="mp-elements-heatmap">
-  <img src="./2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
+  <img src="./figs/2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
 </slot>
-<caption>Training set element counts consisting of 146,323 MP <code>ComputedStructureEntries</code></caption>
+<caption>Element counts for training set consisting of 146,323 MP <code>ComputedStructureEntries</code></caption>
@@ -1,14 +1,12 @@
 """Global variables used all across the matbench_discovery package."""
 
-from __future__ import annotations
-
 import os
 import sys
 from datetime import datetime
 
 ROOT = os.path.dirname(os.path.dirname(__file__))  # repository root
-FIGS = f"{ROOT}/site/static/figs"  # directory to store figures
-PAPER = f"{ROOT}/site/src/routes/paper/figs"  # directory to store figures
+FIGS = f"{ROOT}/site/src/figs"  # directory to store interactive figures
+STATIC = f"{ROOT}/site/static/figs"  # directory to store static figures
 # whether a currently running slurm job is in debug mode
 DEBUG = "DEBUG" in os.environ or (
     "slurm-submit" not in sys.argv and "SLURM_JOB_ID" not in os.environ
 
@@ -224,19 +224,24 @@ def load_df_wbm_with_preds(
         model_key = model_name.lower().replace(" ", "_")
         if f"e_form_per_atom_{model_key}" in df:
             df_out[model_name] = df[f"e_form_per_atom_{model_key}"]
+
         elif len(pred_cols := df.filter(like="_pred_ens").columns) > 0:
             assert len(pred_cols) == 1
             df_out[model_name] = df[pred_cols[0]]
             if len(std_cols := df.filter(like="_std_ens").columns) > 0:
                 df_out[f"{model_name}_std"] = df[std_cols[0]]
+
         elif len(pred_cols := df.filter(like=r"_pred_").columns) > 1:
             # make sure we average the expected number of ensemble member predictions
             assert len(pred_cols) == 10, f"{len(pred_cols) = }, expected 10"
             df_out[model_name] = df[pred_cols].mean(axis=1)
+
         elif "e_form_per_atom_voronoi_rf" in df:  # new voronoi
             df_out[model_name] = df.e_form_per_atom_voronoi_rf
+
         elif "e_form_pred" in df:  # old voronoi
             df_out[model_name] = df.e_form_pred
+
         else:
             raise ValueError(
                 f"No pred col for {model_name=}, available cols={list(df)}"
 
@@ -61,7 +61,7 @@
     margin=dict(l=30, r=20, t=60, b=20),
     paper_bgcolor="rgba(0,0,0,0)",
     # plot_bgcolor="rgba(0,0,0,0)",
-    font_size=15,
+    font_size=13,
 )
 pio.templates["global"] = dict(layout=global_layout)
 pio.templates.default = "plotly_dark+global"
@@ -181,7 +181,7 @@ def hist_classified_stable_vs_hull_dist(
             # add moving average of the accuracy computed within given window
             # as a function of e_above_hull shown as blue line (right axis)
             ax_acc = ax.twinx()
-            ax_acc.set_ylabel("Accuracy", color="darkblue")
+            ax_acc.set_ylabel("Rolling Accuracy", color="darkblue")
             ax_acc.tick_params(labelcolor="darkblue")
             ax_acc.set(ylim=(0, 1))
 
 
@@ -17,8 +17,9 @@
 # %%
 module_dir = os.path.dirname(__file__)
 task_type = "IS2RE"
-date = "2022-11-22"
-glob_pattern = f"{date}-bowsr-megnet-wbm-{task_type}/*.json.gz"
+date = "2023-01-20"
+energy_model = "megnet"
+glob_pattern = f"{date}-bowsr-{energy_model}-wbm-{task_type}/*.json.gz"
 file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
 print(f"Found {len(file_paths):,} files for {glob_pattern = }")
 
@@ -43,7 +44,17 @@
 df_wbm = pd.read_csv(data_path).set_index("material_id")
 
 
-print(f"{len(df_bowsr):,} - {len(df_wbm):,} = {len(df_bowsr) - len(df_wbm) = :,}")
+print(
+    f"{len(df_bowsr) - len(df_wbm) = :,} missing ({len(df_bowsr):,} - {len(df_wbm):,})"
+)
+
+
+# %% sanity check: since BOWSR uses MEGNet as its energy model, the final BOWSR
+# energy and the MEGNet formation energy should be the same
+pymatviz.density_scatter(
+    x=df_bowsr.e_form_per_atom_bowsr_megnet,
+    y=df_bowsr[f"energy_bowsr_{energy_model}"],
+)
 
 
 # %%
 
@@ -28,9 +28,6 @@
 
 task_type = "IS2RE"  # "RS2RE"
 module_dir = os.path.dirname(__file__)
-# --mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s)
-#     Some of your processes may have been killed by the cgroup out-of-memory handler.
-slurm_mem_per_node = 12000
 # set large job array size for fast testing/debugging
 slurm_array_task_count = 500
 # see https://stackoverflow.com/a/55431306 for how to change array throttling
@@ -45,12 +42,14 @@
 slurm_vars = slurm_submit(
     job_name=job_name,
     out_dir=out_dir,
-    partition="icelake-himem",
+    partition="skylake",
     account="LEE-SL3-CPU",
     time="12:0:0",
     # --time 2h is probably enough but best be safe.
     array=f"1-{slurm_array_task_count}%{slurm_max_parallel}",
-    slurm_flags=("--mem", str(slurm_mem_per_node)),
+    # --mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s)
+    #     Some of your processes may have been killed by the cgroup out-of-memory handler.
+    slurm_flags=("--mem", str(12_000)),
     # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
     # https://stackoverflow.com/a/40982782
     pre_cmd="TF_CPP_MIN_LOG_LEVEL=2",
@@ -141,7 +140,7 @@
                 structure_bowsr
             ),
             "structure_bowsr": structure_bowsr,
-            "energy_bowsr": energy_bowsr,
+            f"energy_bowsr_{energy_model}": energy_bowsr,
         }
 
         relax_results[material_id] = results
 
@@ -29,7 +29,6 @@
 module_dir = os.path.dirname(__file__)
 # set large job array size for fast testing/debugging
 slurm_array_task_count = 100
-slurm_mem_per_node = 12000
 job_name = f"m3gnet-wbm-{task_type}{'-debug' if DEBUG else ''}"
 out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
 
@@ -40,7 +39,7 @@
     account="LEE-SL3-CPU",
     time="3:0:0",
     array=f"1-{slurm_array_task_count}",
-    slurm_flags=("--mem", str(slurm_mem_per_node)),
+    slurm_flags=("--mem", str(12_000)),
     # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
     # https://stackoverflow.com/a/40982782
     pre_cmd="TF_CPP_MIN_LOG_LEVEL=2",
 
@@ -13,9 +13,9 @@ Matbench Discovery
 
 </h4>
 
-Matbench Discovery is an [interactive leaderboard](https://matbench-discovery.janosh.dev/figures) and associated [PyPI package](https://pypi.org/project/matbench-discovery) for benchmarking ML energy models on a task designed to closely emulate a real-world computational materials discovery workflow. In it, these models take on the role of a triaging step prior to DFT to determine how to allocate limited compute budget for structure relaxations.
+Matbench Discovery is an [interactive leaderboard](https://matbench-discovery.janosh.dev/figures) and associated [PyPI package](https://pypi.org/project/matbench-discovery) for benchmarking ML energy models on a task designed to closely emulate a real-world computational materials discovery workflow. In it, these models take on the role of a triaging step prior to DFT to decide how to allocate limited compute budget for structure relaxations.
 
-We welcome contributions that add new models to the leaderboard through [GitHub PRs](https://github.com/janosh/matbench-discovery/pulls). See the [usage and contributing guide](https://janosh.github.io/matbench-discovery/how-to-contribute).
+We welcome contributions that add new models to the leaderboard through [GitHub PRs](https://github.com/janosh/matbench-discovery/pulls). See the [usage and contributing guide](https://janosh.github.io/matbench-discovery/how-to-contribute) for details.
 
 Several new energy models specifically designed to handle unrelaxed structures were published in 2021/22
 
 
@@ -2,7 +2,7 @@
 import pandas as pd
 from pymatviz.utils import save_fig
 
-from matbench_discovery import FIGS, today
+from matbench_discovery import STATIC, today
 from matbench_discovery.data import load_df_wbm_with_preds
 from matbench_discovery.plots import cumulative_precision_recall
 
@@ -12,8 +12,8 @@
 
 # %%
 models = (
-    # Wren, CGCNN IS2RE, CGCNN RS2RE
-    "Voronoi RF, Wrenformer, MEGNet, M3GNet, BOWSR MEGNet, CGCNN, CGCNN debug"
+    # Wren, CGCNN IS2RE, CGCNN RS2RE, CGCNN
+    "Voronoi RF, Wrenformer, MEGNet, M3GNet, BOWSR MEGNet"
 ).split(", ")
 
 df_wbm = load_df_wbm_with_preds(models=models).round(3)
@@ -37,17 +37,21 @@
     show_optimal=True,
 )
 
-title = f"{today} - Cumulative Precision, Recall and F1 Score for Stable Materials"
+title = f"{today} - Cumulative Precision, Recall, F1 scores for classifying stable materials"
 # xlabel_cumulative = "Materials predicted stable sorted by hull distance"
 if backend == "matplotlib":
     fig.suptitle(title)
     # fig.text(0.5, -0.08, xlabel_cumulative, ha="center", fontdict={"size": 16})
 elif backend == "plotly":
-    fig.update_layout(title=title)
+    # place legend in lower right corner
+    fig.update_layout(
+        title=title,
+        legend=dict(yanchor="bottom", y=0.02, xanchor="right", x=1),
+    )
     fig.update_xaxes(matches=None, showticklabels=True)
     fig.update_yaxes(matches=None, showticklabels=True)
 
-fig.show(config=dict(responsive=True))
+fig.show()
 
 
 # %%
@@ -57,6 +61,7 @@
     assert isinstance(trace.y[0], float)
     trace.y = [round(y, 3) for y in trace.y]
 
-img_path = f"{FIGS}/{today}-cumulative-clf-metrics"
+img_path = f"{STATIC}/{today}-cumulative-clf-metrics"
 # save_fig(fig, f"{img_path}.pdf")
 save_fig(fig, f"{img_path}.svelte")
+# save_fig(fig, f"{img_path}.png", scale=3)