readme.md (+2 -2)
@@ -13,9 +13,9 @@
</h4>
- > TL;DR: We benchmark ML models on crystal stability prediction from unrelaxed structures finding interatomic potentials in particular to be a valuable addition to high-throughput discovery pipelines.
+ > TL;DR: We benchmark ML models on crystal stability prediction from unrelaxed structures, finding universal interatomic potentials (UIP) like [M3GNet](https://github.com/materialsvirtuallab/m3gnet) and [CHGNet](https://github.com/CederGroupHub/chgnet) to be highly accurate, robust across chemistries and ready for production use in high-throughput discovery pipelines.
- Matbench Discovery is an [interactive leaderboard](https://janosh.github.io/matbench-discovery/models) and associated [PyPI package](https://pypi.org/project/matbench-discovery) which together make it easy to benchmark ML energy models on a task designed to closely simulate a high-throughput discovery campaign for new stable inorganic crystals.
+ Matbench Discovery is an [interactive leaderboard](https://janosh.github.io/matbench-discovery/models) and associated [PyPI package](https://pypi.org/project/matbench-discovery) which together make it easy to rank ML energy models on a task designed to closely simulate a high-throughput discovery campaign for new stable inorganic crystals.
So far, we've tested 8 models covering multiple methodologies ranging from random forests with structure fingerprints to graph neural networks, from one-shot predictors to iterative Bayesian optimizers and interatomic potential-based relaxers. We find [CHGNet](https://github.com/CederGroupHub/chgnet) ([paper](https://doi.org/10.48550/arXiv.2302.14231)) to achieve the highest F1 score of 0.59, $R^2$ of 0.61 and a discovery acceleration factor (DAF) of 3.06 (meaning a 3x higher rate of stable structures compared to dummy selection in our already enriched search space). We believe our results show that ML models have become robust enough to deploy them as triaging steps to more effectively allocate compute in high-throughput DFT relaxations. This work provides valuable insights for anyone looking to build large-scale materials databases.
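As a minimal illustration of how a hit-rate ratio like the DAF can be computed from hull-distance predictions (a sketch only; the function name and conventions below are our assumptions, not the matbench-discovery API):

```python
import numpy as np

def discovery_acceleration_factor(e_above_hull_true, e_above_hull_pred, stability_threshold=0.0):
    """Ratio of the model's hit rate among predicted-stable materials (precision)
    to the prevalence of stable materials in the full candidate pool (dummy hit rate)."""
    actual_stable = np.asarray(e_above_hull_true) <= stability_threshold
    predicted_stable = np.asarray(e_above_hull_pred) <= stability_threshold

    precision = actual_stable[predicted_stable].mean()  # hit rate among model picks
    prevalence = actual_stable.mean()  # hit rate of random (dummy) selection
    return precision / prevalence
```

With the test set's roughly 16.7% prevalence of stable structures, a DAF of 3.06 corresponds to a precision of about 0.167 × 3.06 ≈ 0.51.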
site/src/routes/contribute/+page.md (+1 -1)
@@ -185,4 +185,4 @@ This information can be useful for others looking to reproduce your results or c
## 😵💫   Troubleshooting
- Having problems using or contributing to the project? Please [open an issue on GitHub]({repo}/issues) or [start a discussion]({repo}/discussions) for open-ended conversations. Happy to help! 😊
+ Having problems? Please [open an issue on GitHub]({repo}/issues). We're happy to help! 😊
site/src/routes/preprint/+page.md (+2 -4)
@@ -14,7 +14,7 @@
<summary>
- We present a new machine learning (ML) evaluation framework for materials stability predictions named `Matbench Discovery`. Our work highlights the need to focus on metrics that directly measure stability hit rate in prospective discovery campaigns as opposed to analyzing models based on predictive accuracy alone. Our evaluation task is designed to closely simulate the deployment of ML energy models in a high-throughput search for stable inorganic crystals. To shed light on the question which type of ML performs best at materials discovery, we explore a wide variety of models covering multiple methodologies. Our selection ranges from random forests to GNNs, from one-shot predictors over iterative Bayesian optimizers to universal interatomic potentials (UIP) that closely emulate DFT. We find [CHGNet](https://github.com/CederGroupHub/chgnet) to achieve the highest F1 score of 0.59, $R^2$ of 0.61 and a discovery acceleration factor (DAF) of 3.06 (3x more stable structures found compared to dummy selection in our already enriched search space). We also identify a sharp disconnect between commonly used global regression metrics and more task-relevant classification metrics. In particular, CGCNN and MEGNet are worse than dummy regressors, but substantially better than dummy classifiers, demonstrating the importance of focusing on the most salient performance indicators for the task at hand. Our results provide valuable insights for maintainers of high throughput materials databases to start using these models as triaging steps for effectively allocating DFT relaxations.
+ We present a new machine learning (ML) evaluation framework for materials stability predictions named `Matbench Discovery`. Our task closely simulates the deployment of ML energy models in a high-throughput search for stable inorganic crystals. It is accompanied by an interactive leaderboard and a Python package for easy ingestion of our training/test sets into future model submissions. To answer the question of which ML methodology performs best at materials discovery, we explore a wide variety of models. Our initial selection ranges from random forests to GNNs, from one-shot predictors to iterative Bayesian optimizers and universal interatomic potentials (UIP) that closely emulate DFT. We find UIPs to be in a class of their own, achieving the highest F1 scores and discovery acceleration factors (DAF) of more than 3, i.e. 3x more stable structures found compared to dummy selection in our already enriched search space. We also identify a sharp disconnect between commonly used regression metrics and more task-relevant classification metrics. CGCNN and MEGNet are worse than dummy regressors, but substantially better than dummy classifiers, suggesting that the field overemphasizes the wrong performance indicators. Our results highlight the need to optimize metrics that measure true stability hit rate improvements and provide valuable insights for maintainers of high-throughput materials databases by demonstrating that these models have matured enough to play a vital role as pre-filtering steps to effectively allocate compute budget for DFT relaxations.
</summary>
@@ -212,9 +212,7 @@ Many GNN message-passing functions incorporate a soft attention coefficient desi
## Discussion
- From @fig:metrics-table we see several models achieve a DAF > 2.5 in this realistic benchmark scenario with the SOTA model CHGNet even reaching 3.06.
-
- @Fig:metrics-table shows that several models achieve a DAF (discovery acceptance factor) greater than 2 in a realistic benchmark scenario. The CHGNet model, which is currently the best-performing model, achieves a DAF of 3.06. These results demonstrate the effectiveness of using machine learning-based triage in high-throughput computational materials discovery applications. This indicates that it is worthwhile investing time and resources in integrating these methods into future discovery efforts.
+ @Fig:metrics-table shows that several models achieve a DAF (discovery acceleration factor) > 2.5 in this realistic benchmark scenario. CHGNet, the current SOTA, even achieves a DAF of 3.06. These results demonstrate the effectiveness of machine learning-based triage in high-throughput computational materials discovery. We believe it is now worthwhile to invest time and resources in integrating these methods into future discovery efforts.
Consequently, the benefits of deploying ML-based triage in high-throughput computational materials discovery applications have matured to the point where they likely warrant the time and setup required to incorporate them into future discovery efforts.
However, there are many aspects on which further progress is necessary. For example, models still make large numbers of false positive predictions for materials over 50 meV above the convex hull, which are much less likely to be synthesizable, greatly reducing the DAF.
> @label:fig:metrics-table-first-10k An actual discovery campaign is unlikely to validate every single stable prediction coming from a model as we did in the [metrics table](/preprint#fig:metrics-table). Presumably it will rank model predictions from most to least stable and go down that list as far as its time and compute budget permits. Assuming that increases in compute resources will allow future discovery campaigns to grow in scope, we believe 10 k model validations to be a reasonable cutoff. To simulate this scenario, we calculated classification and regression metrics for the 10 k test set materials predicted to be most stable by each model.<br>
- > We again show dummy performance in the bottom row. Note that each model is now evaluated on a different slice of the data, this is still dummy performance across the whole dataset. CHGNet and M3GNet achieve a very impressive 83% and 80% precision, respectively. In concrete terms, this means in a prospective discovery campaign that sets out to validate 10 k model predictions from a search pool of 257 k crystals that are chemically dissimilar from the training set and of which 16.7 % are stable, CHGNet and M3GNet would deliver 4 stable structures for every 5 that are validated.
+ > We again show dummy performance in the bottom row. Note that each model is now evaluated on a different slice of the data, but this is still dummy performance across the whole dataset. CHGNet and M3GNet achieve a very impressive 83% and 80% precision, respectively. In concrete terms, this means in a discovery campaign that validates 10 k model predictions from a search pool of 257 k crystals which are chemically dissimilar from the training set and of which 16.7 % are stable, CHGNet and M3GNet would deliver 4 stable structures for every 5 predictions validated.
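As a rough sanity check of the numbers in this caption (illustrative arithmetic only; the variable names are ours, not benchmark code):

```python
prevalence = 0.167       # fraction of the 257 k WBM candidate pool that is stable
n_validated = 10_000     # assumed DFT validation budget
precision_chgnet = 0.83  # CHGNet precision on its 10 k most stable predictions

expected_hits = precision_chgnet * n_validated  # ~8_300 stable structures found
dummy_hits = prevalence * n_validated           # ~1_670 expected from random selection
acceleration = precision_chgnet / prevalence    # ~5x better than dummy selection on this slice
```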
## ROC Curves
@@ -100,7 +100,7 @@ Given its strong performance on batch 1, it is possible that given sufficiently
{/if}
> @label:fig:scatter-largest-errors-models-mean-vs-each-true The 200 structures with the largest error averaged over all models vs their DFT hull distance, colored by model disagreement (as measured by the standard deviation in hull distance predictions from different models) and sized by the number of training structures containing the least prevalent element (e.g. if a scatter point had composition FeO, MP has 6.6k structures containing Fe and 82k containing O so its size would be set to 6.6k; see the sizing sketch below). Thus smaller points have less training support. This plot suggests all models are biased to predict low energy and perhaps fail to capture certain physics resulting in highly unstable structures. This is unsurprising considering MP training data mainly consists of low energy structures.<br>
- > It is also possible that some of blue points with large error yet good agreement among models are in fact accurate ML predictions for a DFT relaxation gone wrong.
+ > It is also possible that some of the blue points with large error yet good agreement among models are in fact accurate ML predictions for a DFT relaxation gone wrong.
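The point-sizing rule from the caption above (size = training-structure count of the least prevalent element in each composition) could be reproduced roughly as follows; the `elem_counts` mapping and the use of pymatgen are our assumptions for illustration, not the actual plotting code:

```python
from pymatgen.core import Composition

# hypothetical mapping: element symbol -> number of MP training structures containing it
elem_counts = {"Fe": 6_600, "O": 82_000}

def point_size(formula: str) -> int:
    """Training-structure count of the least prevalent element in the composition."""
    symbols = {el.symbol for el in Composition(formula).elements}
    return min(elem_counts[symbol] for symbol in symbols)

print(point_size("FeO"))  # 6600, matching the caption's FeO example
```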
## MEGNet formation energies from UIP-relaxed structures
@@ -127,8 +127,8 @@ We highlight this here to refute the suggestion that training on raw DFT energie
- > @label:fig:spacegroup-prevalence-wrenformer-failures The left spacegroup sunburst shows spacegroup 71 is by far the dominant number among the 941 Wrenformer failure cases where $E_\text{above hull,DFT} < 1$ and $E_\text{above hull,Wrenformer} > 1$ (points inside the shaded rectangle). On the right side for comparison is the spacegroup sunburst for the entire WBM test set.
+ > @label:fig:spacegroup-prevalence-wrenformer-failures The left spacegroup sunburst shows spacegroup 71 is by far the dominant lattice symmetry among the 941 Wrenformer failure cases where $E_\text{above hull,DFT} < 1$ and $E_\text{above hull,Wrenformer} > 1$ (points inside the shaded rectangle). On the right side for comparison is the spacegroup sunburst for the entire WBM test set.
- Looking at the occurrence counts of isopointal prototypes in the shaded rectangle and comparing them with the occurrence of those same prototypes in the MP training data counts, we find almost no support for failing structure prototypes. This suggests the reason Wrenformer fails so spectacularly on these structures is that it cannot deal with structure prototypes it has not seen at least several hundred examples of in its training data. Hence Wrenformer may not be useful for discovering new prototypes.
+ Looking at the occurrence counts of isopointal prototypes in the shaded rectangle and comparing them with the occurrence of those same prototypes in the MP training data, we find almost no training support for the failing structure prototypes. This suggests the reason Wrenformer fails so spectacularly on these structures is that it cannot handle structure prototypes of which it has not seen at least several hundred examples in its training data. Hence Wrenformer may be unsuitable for discovering new prototypes.
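A minimal sketch of the kind of occurrence-count comparison described here (the prototype labels and variable names below are made up for illustration; real labels would come from the isopointal prototype classification used in the paper):

```python
from collections import Counter

# hypothetical prototype labels for the Wrenformer failure cases and the MP training set
failure_prototypes = ["AB2_oI6_71", "AB2_oI6_71", "ABC2_oI8_71"]
training_prototypes = ["AB2_cF12_225", "ABC3_cP5_221", "AB2_oI6_71"]

failure_counts = Counter(failure_prototypes)
training_counts = Counter(training_prototypes)

# training support for each failing prototype: how many MP structures share its prototype?
support = {proto: training_counts.get(proto, 0) for proto in failure_counts}
low_support = [proto for proto, n_train in support.items() if n_train < 100]
print(low_support)  # prototypes with (almost) no training examples
```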