change scatter_hull_dist_models colorscale (turbo->agsunset) to improve outlier visibility on dark bg

janosh · janosh · commit 467d77746c1f · 2023-10-31T18:58:36.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -42,7 +42,7 @@ repos:
         stages: [commit, commit-msg]
         exclude_types: [csv, json, svg]
         exclude: ^(.+references.yaml|site/src/figs/.+)$
-        args: [--ignore-words-list, "nd,te,fpr"]
+        args: [--ignore-words-list, "nd,te,fpr", --check-filenames]
 
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v3.0.3
diff --git a/models/chgnet/analyze_chgnet.py b/models/chgnet/analyze_chgnet.py
@@ -23,15 +23,14 @@
 
 
 # %%
-df_chgnet = pd.read_csv(PRED_FILES.CHGNet)
-df_chgnet = df_chgnet.set_index(id_col).add_suffix("_2000")
-df_chgnet_500 = pd.read_csv(PRED_FILES.CHGNet.replace("-06", "-04"))
-df_chgnet_500 = df_chgnet_500.set_index(id_col).add_suffix("_500")
-df_chgnet[list(df_chgnet_500)] = df_chgnet_500
+df_chgnet = df_chgnet_v030 = pd.read_csv(PRED_FILES.CHGNet)
+df_chgnet_v020 = pd.read_csv(
+    f"{module_dir}/2023-03-06-chgnet-0.2.0-wbm-IS2RE.csv.gz", index_col=id_col
+)
 df_chgnet["formula"] = df_wbm.formula
 
-e_form_2000 = "e_form_per_atom_chgnet_2000"
-e_form_500 = "e_form_per_atom_chgnet_500"
+e_form_2000 = "e_form_per_atom_chgnet_relax_steps_2000"
+e_form_500 = "e_form_per_atom_chgnet_relax_steps_500"
 
 min_e_diff = 0.1
 # structures with smaller energy after longer relaxation need many steps
diff --git a/models/chgnet/metadata.yml b/models/chgnet/metadata.yml
@@ -13,6 +13,10 @@ authors:
   - name: KyuJung Jun
     affiliation: UC Berkeley
     orcid: https://orcid.org/0000-0003-1974-028X
+  - name: Janosh Riebesell
+    affiliation: University of Cambridge, Lawrence Berkeley National Laboratory
+    email: janosh@lbl.gov
+    orcid: https://orcid.org/0000-0001-5233-3462
   - name: Kevin Han
     affiliation: UC Berkeley
     orcid: https://orcid.org/0000-0002-4028-2108
diff --git a/models/chgnet/test_chgnet.py b/models/chgnet/test_chgnet.py
@@ -82,6 +82,7 @@
     max_steps=max_steps,
     fmax=fmax,
     device=device,
+    trainable_params=chgnet.n_params,
 )
 
 run_name = f"{job_name}-{slurm_array_task_id}"
diff --git a/models/m3gnet/test_m3gnet.py b/models/m3gnet/test_m3gnet.py
@@ -29,8 +29,8 @@
 
 task_type = "IS2RE"  # "RS2RE"
 module_dir = os.path.dirname(__file__)
-# direct: cluster sampling, ms: manual sampling
-model_type: Literal["orig", "direct", "ms"] = "ms"
+# direct: DIRECT cluster sampling, ms: manual sampling
+model_type: Literal["orig", "direct", "manual-sampling"] = "orig"
 # set large job array size for smaller data splits and faster testing/debugging
 slurm_array_task_count = 100
 job_name = f"m3gnet-{model_type}-wbm-{task_type}"
@@ -74,26 +74,28 @@
     pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
 )[slurm_array_task_id - 1]
 
+checkpoint = None
+if model_type == "direct":
+    checkpoint = f"{ROOT}/models/m3gnet/2023-05-26-DI-DFTstrictF10-TTRS-128U-442E"
+if model_type == "ms":
+    checkpoint = f"{ROOT}/models/m3gnet/2023-05-26-MS-DFTstrictF10-128U-154E"
+relax_results: dict[str, dict[str, Any]] = {}
+m3gnet = Relaxer(potential=checkpoint)  # load pre-trained M3GNet model
+
 run_params = dict(
     data_path=data_path,
     versions={dep: version(dep) for dep in ("m3gnet", "numpy")},
     task_type=task_type,
     df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
+    trainable_params=sum(param.numel() for param in m3gnet.parameters()),
 )
 
 run_name = f"{job_name}-{slurm_array_task_id}"
 wandb.init(project="matbench-discovery", name=run_name, config=run_params)
 
 
 # %%
-checkpoint = None
-if model_type == "direct":
-    checkpoint = f"{ROOT}/models/m3gnet/2023-05-26-DI-DFTstrictF10-TTRS-128U-442E"
-if model_type == "ms":
-    checkpoint = f"{ROOT}/models/m3gnet/2023-05-26-MS-DFTstrictF10-128U-154E"
-m3gnet = Relaxer(potential=checkpoint)  # load pre-trained M3GNet model
-relax_results: dict[str, dict[str, Any]] = {}
 input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
 if task_type == "RS2RE":
diff --git a/models/mace/test_mace.py b/models/mace/test_mace.py
@@ -12,6 +12,7 @@
 from ase.constraints import ExpCellFilter
 from ase.optimize import FIRE, LBFGS
 from mace.calculators.mace import MACECalculator
+from mace.tools import count_parameters
 from pymatgen.core import Structure
 from pymatgen.core.trajectory import Trajectory
 from pymatgen.io.ase import AseAtomsAdaptor
@@ -25,6 +26,8 @@
 __author__ = "Janosh Riebesell"
 __date__ = "2023-03-01"
 
+
+# %%
 task_type = "IS2RE"  # "RS2RE"
 module_dir = os.path.dirname(__file__)
 # set large job array size for smaller data splits and faster testing/debugging
@@ -40,15 +43,17 @@
     # MACE trained by Yuan Chiang on CHGNet training set
     "2023-08-14-mace-yuan-mptrj-04",
     "2023-09-03-mace-yuan-mptrj-slower-14-lr-13_run-3",
+    "2023-10-29-mace-pbenner-mptrj-no-conditional-loss",
 ][-1]
 
 slurm_vars = slurm_submit(
     job_name=job_name,
     out_dir=out_dir,
     account="matgen",
-    time="11:55:0",
+    time="4:55:0",
     array=f"1-{slurm_array_task_count}",
-    slurm_flags="--qos regular --constraint gpu --gpus 1",
+    # slurm_flags="--qos shared --constraint gpu --gpus 1",
+    slurm_flags="--qos shared --constraint cpu --mem 16G",
 )
 
 
@@ -72,11 +77,14 @@
 max_steps = 500
 force_max = 0.05  # Run until the forces are smaller than this in eV/A
 checkpoint = f"{ROOT}/models/mace/checkpoints/{model_name}.model"
+mace_calc = MACECalculator(checkpoint, device=device)
 
 df_in: pd.DataFrame = np.array_split(
     pd.read_json(data_path).set_index(id_col), slurm_array_task_count
 )[slurm_array_task_id - 1]
 
+
+# %%
 run_params = dict(
     data_path=data_path,
     versions={dep: version(dep) for dep in ("mace", "numpy", "torch")},
@@ -89,14 +97,14 @@
     force_max=force_max,
     ase_optimizer=ase_optimizer,
     device=device,
+    trainable_params=count_parameters(mace_calc.models[0]),
 )
 
 run_name = f"{job_name}-{slurm_array_task_id}"
 wandb.init(project="matbench-discovery", name=run_name, config=run_params)
 
 
 # %%
-mace_calc = MACECalculator(checkpoint, device=device, default_dtype="float32")
 relax_results: dict[str, dict[str, Any]] = {}
 input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
diff --git a/models/wrenformer/analyze_wrenformer.py b/models/wrenformer/analyze_wrenformer.py
@@ -6,7 +6,7 @@
 import pandas as pd
 from aviary.wren.utils import get_isopointal_proto_from_aflow
 from pymatviz import spacegroup_hist, spacegroup_sunburst
-from pymatviz.io import df_to_pdf, df_to_svelte_table, save_fig
+from pymatviz.io import df_to_html_table, df_to_pdf, save_fig
 from pymatviz.ptable import ptable_heatmap_plotly
 from pymatviz.utils import add_identity_line, bin_df_cols
 
@@ -68,7 +68,7 @@
 
 styler = df_proto_counts.head(10).style.background_gradient(cmap="viridis")
 
-df_to_svelte_table(styler, f"{SITE_FIGS}/proto-counts-{model}-failures.svelte")
+df_to_html_table(styler, f"{SITE_FIGS}/proto-counts-{model}-failures.svelte")
 df_to_pdf(styler, f"{PDF_FIGS}/proto-counts-{model}-failures.pdf")
 
 
diff --git a/readme.md b/readme.md
@@ -1,32 +1,3 @@
-<script>
-  import { onMount } from 'svelte'
-  import all_stats from './site/src/routes/models/model-stats.json'
-
-  let best = Object.entries(all_stats).reduce(
-    (acc, [model, stats]) => {
-      if (stats.F1 > acc.F1) {
-        return { model, ...stats }
-      }
-      return acc
-    },
-    { model: `CHGNet`, F1: 0.6 }
-  )
-
-  let best_report // HTMLDivElement
-  onMount(async () => {
-    if (best_report && best) {
-      best_report.style.display = `block`
-
-      const { default: metadata } = await import(
-        `$root/models/${best.model.toLowerCase()}/metadata.yml`
-      )
-
-      best = { ...best, ...metadata }
-      console.log(`best`, best)
-    }
-  })
-</script>
-
 <h1 align="center">
   <img src="https://github.com/janosh/matbench-discovery/raw/main/site/static/favicon.svg" alt="Logo" width="60px"><br>
   Matbench Discovery
@@ -48,11 +19,7 @@ Matbench Discovery is an [interactive leaderboard](https://janosh.github.io/matb
 
 So far, we've tested 8 models covering multiple methodologies ranging from random forests with structure fingerprints to graph neural networks, from one-shot predictors to iterative Bayesian optimizers and interatomic potential relaxers.
 
-<div bind:this={best_report} style="display: none;">
-
-We find [{best.model}]({best?.repo}) ([paper]({best?.doi})) to achieve the highest F1 score of {best.F1}, $R^2$ of {best.R2} and a discovery acceleration factor (DAF) of {best.DAF} (meaning a ~{Number(best.DAF).toFixed(0)}x higher rate of stable structures compared to dummy selection in our already enriched search space).
-
-</div>
+<slot name="best-report" />
 
 Our results show that ML models have become robust enough to deploy them as triaging steps to more effectively allocate compute in high-throughput DFT relaxations. This work provides valuable insights for anyone looking to build large-scale materials databases.
 
diff --git a/scripts/model_figs/make_metrics_tables.py b/scripts/model_figs/make_metrics_tables.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 import pandas as pd
-from pymatviz.io import df_to_pdf, df_to_svelte_table
+from pymatviz.io import df_to_html_table, df_to_pdf
 from sklearn.dummy import DummyClassifier
 
 from matbench_discovery import PDF_FIGS, SITE_FIGS
@@ -156,7 +156,7 @@
     table::-webkit-scrollbar {
         display: none;  /* Safari and Chrome */
     }"""
-    df_to_svelte_table(
+    df_to_html_table(
         styler,
         f"{SITE_FIGS}/metrics-table{label}.svelte",
         inline_props="class='roomy'",
diff --git a/scripts/model_figs/scatter_hull_dist_models.py b/scripts/model_figs/scatter_hull_dist_models.py
@@ -150,7 +150,8 @@
     range_x=(domain := (-4, 7)),
     range_y=domain,
     category_orders={facet_col: legend_order},
-    color_continuous_scale="turbo",  # "thermal"
+    # pick from https://plotly.com/python/builtin-colorscales
+    color_continuous_scale="agsunset",
 )
 
 # manually set colorbar ticks and labels (needed after log1p transform)
diff --git a/site/src/figs/each-scatter-models-5x2.svelte b/site/src/figs/each-scatter-models-5x2.svelte
diff --git a/site/src/routes/+page.svelte b/site/src/routes/+page.svelte
@@ -1,8 +1,40 @@
 <script lang="ts">
   import MetricsTable from '$figs/metrics-table.svelte'
+  import type { ModelData } from '$lib'
   import Readme from '$root/readme.md'
+  import { onMount } from 'svelte'
+  import all_stats from './models/model-stats.json'
+
+  let best_model = Object.entries(all_stats).reduce((current, [model_name, stats]) => {
+    if (!current?.F1 || stats.F1 > current.F1) {
+      return { model_name, ...stats }
+    }
+    return current
+  }, {}) as ModelData
+
+  const metadata = import.meta.glob(`$root/models/**/metadata.yml`, {
+    eager: true,
+    import: `default`,
+  }) as Record<string, ModelData | ModelData[]>
+
+  onMount(async () => {
+    if (best_model) {
+      const md = metadata[`../models/${best_model.model_name.toLowerCase()}/metadata.yml`]
+      best_model = { ...best_model, ...md }
+    }
+  })
 </script>
 
 <Readme>
+  <div slot="best-report">
+    {#if best_model}
+      {@const { model_name, F1, R2, DAF, repo, doi } = best_model}
+      We find <a href={repo}>{model_name}</a> (<a href={doi}>paper</a>) to achieve the
+      highest F1 score of {F1}, R<sup>2</sup> of {R2}
+      and a discovery acceleration factor (DAF) of {DAF}
+      (meaning a ~{Number(DAF).toFixed(0)}x higher rate of stable structures compared to
+      dummy selection in our already enriched search space).
+    {/if}
+  </div>
   <MetricsTable slot="metrics-table" />
 </Readme>

Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@`
`82`	`82`	`max_steps=max_steps,`
`83`	`83`	`fmax=fmax,`
`84`	`84`	`device=device,`
	`85`	`+ trainable_params=chgnet.n_params,`
`85`	`86`	`)`
`86`	`87`
`87`	`88`	`run_name = f"{job_name}-{slurm_array_task_id}"`
Original file line number	Diff line number	Diff line change
`@@ -150,7 +150,8 @@`
`150`	`150`	`range_x=(domain := (-4, 7)),`
`151`	`151`	`range_y=domain,`
`152`	`152`	`category_orders={facet_col: legend_order},`
`153`		`- color_continuous_scale="turbo", # "thermal"`
	`153`	`+ # pick from https://plotly.com/python/builtin-colorscales`
	`154`	`+ color_continuous_scale="agsunset",`
`154`	`155`	`)`
`155`	`156`
`156`	`157`	`# manually set colorbar ticks and labels (needed after log1p transform)`