Skip to content

Commit afe6ce9

Browse files
committed
color hull distance scatter plots by density
set ModelCards background color from interpolatePuOr based on the currently selected sorting metric (purple: best, orange: worst)
1 parent 3aad858 commit afe6ce9

File tree

11 files changed

+164
-37
lines changed

11 files changed

+164
-37
lines changed

.github/workflows/test-scripts.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
matrix:
1616
script:
1717
- scripts/model_figs/make_metrics_tables.py
18-
- scripts/analyze_element_errors.py
18+
- scripts/model_figs/per_element_errors.py
1919
steps:
2020
- name: Check out repository
2121
uses: actions/checkout@v3

scripts/model_figs/per_element_errors.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,11 @@
5050
# %% compute number of samples per element in training set
5151
# counting element occurrences not weighted by composition, assuming models don't learn
5252
# much more about iron and oxygen from Fe2O3 than from FeO
53-
df_elem_err = pd.read_json(
54-
f"{ROOT}/site/src/routes/about-the-data/mp-element-counts-occurrence.json",
55-
typ="series",
56-
)
53+
counts_path = f"{ROOT}/site/src/routes/about-the-data/mp-element-counts-occurrence.json"
54+
df_elem_err = pd.read_json(counts_path, typ="series")
5755
train_count_col = "MP Occurrences"
5856
df_elem_err = df_elem_err.reset_index(name=train_count_col).set_index("index")
57+
df_elem_err.index.name = "symbol"
5958

6059

6160
# %%

scripts/model_figs/scatter_e_above_hull_models.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import numpy as np
1111
import plotly.express as px
12+
import scipy.stats
1213
from pymatviz.utils import add_identity_line, bin_df_cols, save_fig
1314

1415
from matbench_discovery import FIGS, PDF_FIGS
@@ -119,26 +120,34 @@
119120

120121

121122
# %% plot all models in separate subplots
122-
domain = (-4, 7)
123123
n_cols = 2
124124
n_rows = math.ceil(len(models) / n_cols)
125125

126+
127+
def get_density(xs: np.ndarray, ys: np.ndarray) -> np.ndarray:
128+
"""Get kernel density estimate for each (x, y) point."""
129+
return scipy.stats.gaussian_kde([xs, ys])([xs, ys])
130+
131+
132+
# scatter plot of DFT vs predicted hull distance
126133
fig = px.scatter(
127134
df_bin,
128135
x=each_true_col,
129136
y=each_pred_col,
130137
facet_col=facet_col,
131138
facet_col_wrap=n_cols,
139+
color=get_density(df_bin[each_true_col], df_bin[each_pred_col]),
132140
facet_col_spacing=0.02,
133141
facet_row_spacing=0.04,
134142
hover_data=hover_cols,
135143
hover_name=df_preds.index.name,
136-
color=clf_col,
144+
# color=clf_col,
137145
color_discrete_map=clf_color_map,
138146
# opacity=0.4,
139-
range_x=domain,
147+
range_x=(domain := (-4, 7)),
140148
range_y=domain,
141149
category_orders={facet_col: legend_order},
150+
color_continuous_scale="turbo",
142151
)
143152

144153
x_title = fig.layout.xaxis.title.text # used in annotations below
@@ -147,7 +156,7 @@
147156
# iterate over subplots and set new title
148157
for idx, anno in enumerate(fig.layout.annotations, 1):
149158
traces = [t for t in fig.data if t.xaxis == f"x{idx if idx > 1 else ''}"]
150-
assert len(traces) in (0, 4), f"Plots be empty or have 4 traces, got {len(traces)=}"
159+
# assert len(traces) in (0, 4), f"Plots must have 0 or 4 traces, got {len(traces)=}"
151160

152161
model = anno.text.split("=", 1)[1]
153162
assert model in df_preds, f"Unexpected {model=} not in {list(df_preds)=}"
@@ -219,9 +228,10 @@
219228
textangle=-90,
220229
**axis_titles,
221230
)
222-
fig.layout.height = 1000
231+
fig.layout.height = 200 * n_rows
232+
fig.layout.coloraxis.showscale = False
223233
# fig.layout.width = 1100
224-
fig.layout.margin.update(l=40, r=10, t=10, b=50)
234+
fig.layout.margin.update(l=40, r=10, t=30, b=60)
225235
fig.update_xaxes(matches=None)
226236
fig.update_yaxes(matches=None)
227237
fig.show()

site/package.json

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"@sveltejs/vite-plugin-svelte": "^2.4.3",
2525
"@typescript-eslint/eslint-plugin": "^6.2.0",
2626
"@typescript-eslint/parser": "^6.2.0",
27+
"d3-scale-chromatic": "^3.0.0",
2728
"elementari": "^0.2.2",
2829
"eslint": "^8.45.0",
2930
"eslint-plugin-svelte": "^2.32.4",

site/src/figs/each-scatter-models-5x2.svelte

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

site/src/lib/index.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ export type ModelStats = {
3636
missing_preds: number
3737
missing_percent: number
3838
Accuracy: number
39-
'Run Time (h)': string
39+
'Run Time (h)': number
4040
TPR: number
4141
TNR: number
4242
DAF: number

site/src/routes/+layout.svelte

+5-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,11 @@
8282

8383
<slot />
8484

85-
<PrevNext items={routes} current="/{url?.split(`/`)[1]}" style="margin-top: 4em;">
85+
<PrevNext
86+
items={routes}
87+
current="/{url?.split(`/`)[1]}"
88+
style="margin: 4em auto 1em; max-width: 60em;"
89+
>
8690
<a slot="next" let:item={href} {href} class="link">{href} &raquo;</a>
8791
<a slot="prev" let:item={href} {href} class="link">&laquo; {href}</a>
8892
</PrevNext>

site/src/routes/about-the-data/+page.svelte

+5-4
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@
2222
let active_mp_elem: ChemicalElement
2323
let active_wbm_elem: ChemicalElement
2424
const count_mode_ops = [`occurrence`, `composition`]
25-
let count_mode = [count_mode_ops[0]]
25+
let count_mode = count_mode_ops[0]
2626
27-
$: mp_elem_counts = elem_counts[`./mp-element-counts-${count_mode[0]}.json`]
28-
$: wbm_elem_counts = elem_counts[`./wbm-element-counts-${count_mode[0]}.json`]
27+
$: mp_elem_counts = elem_counts[`./mp-element-counts-${count_mode}.json`]
28+
$: wbm_elem_counts = elem_counts[`./wbm-element-counts-${count_mode}.json`]
2929
3030
export const snapshot: Snapshot = {
3131
capture: () => ({ color_scale, log, count_mode }),
@@ -65,7 +65,8 @@
6565
>
6666
<Select
6767
id="count-mode"
68-
bind:selected={count_mode}
68+
selected={[count_mode]}
69+
bind:value={count_mode}
6970
options={count_mode_ops}
7071
minSelect={1}
7172
maxSelect={1}

site/src/routes/models/+page.svelte

+24-10
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import type { ModelStatLabel, ModelStats } from '$lib'
33
import { ModelCard } from '$lib'
44
import Icon from '@iconify/svelte'
5+
import { interpolatePuOr } from 'd3-scale-chromatic'
56
import { RadioButtons, Tooltip } from 'svelte-zoo'
67
import { flip } from 'svelte/animate'
78
import { fade } from 'svelte/transition'
@@ -13,9 +14,9 @@
1314
let sort_by: keyof ModelStats | 'model_name' = `F1`
1415
let show_details: boolean = false
1516
let order: 'asc' | 'desc' = `desc`
16-
let show_n_best: number = 8 // show only best models
17+
let show_n_best: number = data.models.length // show only best models
1718
const min_models: number = 2
18-
$: sort_factor = { asc: -1, desc: 1 }[order]
19+
const lower_is_better = [`RMSE`, `MAE`, `Run Time (h)`]
1920
2021
$: models = data.models.sort((model_1, model_2) => {
2122
const [val_1, val_2] = [model_1[sort_by], model_2[sort_by]]
@@ -24,7 +25,7 @@
2425
} else if (typeof val_1 == `number`) {
2526
return sort_factor * (val_2 - val_1)
2627
} else {
27-
console.error(`Sorting by key ${sort_by} gives unknown type: ${typeof val_1}`)
28+
throw `Sorting by key ${sort_by} gives unknown type: ${typeof val_1}`
2829
}
2930
})
3031
const stats: ModelStatLabel[] = [
@@ -44,6 +45,16 @@
4445
capture: () => ({ show_details, sort_by, order, show_n_best }),
4546
restore: (values) => ({ show_details, sort_by, order, show_n_best } = values),
4647
}
48+
49+
$: sort_factor = { asc: -1, desc: 1 }[order]
50+
$: min_val = Math.min(...models.map((model) => model[sort_by] as number))
51+
$: max_val = Math.max(...models.map((model) => model[sort_by] as number))
52+
$: if (lower_is_better.includes(sort_by)) [min_val, max_val] = [max_val, min_val]
53+
$: order = lower_is_better.includes(sort_by) ? `asc` : `desc`
54+
55+
function bg_color(val: number, min: number, max: number) {
56+
return interpolatePuOr(1 - (val - min) / (max - min)).replace(`)`, `, 0.3)`)
57+
}
4758
</script>
4859

4960
<div style="margin: 3vw;">
@@ -74,27 +85,30 @@
7485
</ul>
7586

7687
<ol>
77-
{#each models.slice(0, Math.max(min_models, show_n_best)) as data (data.model_name)}
88+
{#each models.slice(0, Math.max(min_models, show_n_best)) as model (model.model_name)}
7889
<li
7990
animate:flip={{ duration: 400 }}
8091
in:fade={{ delay: 100 }}
8192
out:fade={{ delay: 100 }}
93+
style="background-color: {bg_color(model[sort_by], min_val, max_val)};"
8294
>
83-
<ModelCard {data} {stats} {sort_by} bind:show_details />
84-
{#if data.training_set}
95+
<ModelCard data={model} {stats} {sort_by} bind:show_details />
96+
{#if model.training_set}
8597
<!-- maybe show this text in a tooltip: This model was not trained on the
86-
canonical training set. It's results should not be seen as a one-to-one
87-
comparison to the other models but rather proof of concept of what is possible. -->
98+
canonical training set. It's results should not be seen as a one-to-one
99+
comparison to the other models but rather proof of concept of what is possible. -->
88100
<strong class="train-set">
89101
<Icon icon="ion:ios-warning" inline />
90-
Custom training set: {data.training_set}
102+
Custom training set: {model.training_set}
91103
</strong>
92104
{/if}
93105
</li>
94106
{/each}
95107
</ol>
96108

97-
<h2 style="margin-top: 6em;">Per-Element Model Error Heatmaps</h2>
109+
<h2 style="margin: 4em auto 1em; text-align: center;">
110+
Per-Element Model Error Heatmaps
111+
</h2>
98112

99113
<ElementErrorsPtableHeatmap />
100114
</div>

site/src/routes/models/element-errors-ptable-heatmap.svelte

+7-5
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,13 @@
4242
}
4343
</script>
4444

45-
This periodic table is shaded by the MAE for the model-predicted convex hull distance for
46-
each element. The errors for every structure in the test set are projected onto the
47-
fraction of each element in the composition and averaged over all structures. The error is
48-
the absolute difference per atom between predicted and actual energy distance to the
49-
convex hull.
45+
<p style="max-width: 45em; margin: auto;">
46+
This periodic table is shaded by the MAE for the model-predicted convex hull distance
47+
for each element. The errors for every structure in the test set are projected onto the
48+
fraction of each element in the composition and averaged over all structures. The error
49+
is the absolute difference per atom between predicted and actual energy distance to the
50+
convex hull.
51+
</p>
5052

5153
<MultiSelect bind:selected={current_model} options={models} maxSelect={1} minSelect={1} />
5254

site/src/routes/preprint/references.yaml

+100-4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,63 @@ references:
4747
type: chapter
4848
URL: https://doi.org/10.1007/978-94-011-4653-1_21
4949

50+
- id: allen_learning_2023
51+
abstract: >-
52+
The development of machine learning models has led to an abundance of
53+
datasets containing quantum mechanical (QM) calculations for molecular and
54+
material systems. However, traditional training methods for machine learning
55+
models are unable to leverage the plethora of data available as they require
56+
that each dataset be generated using the same QM method. Taking machine
57+
learning interatomic potentials (MLIPs) as an example, we show that
58+
meta-learning techniques, a recent advancement from the machine learning
59+
community, can be used to fit multiple levels of QM theory in the same
60+
training process. Meta-learning changes the training procedure to learn a
61+
representation that can be easily re-trained to new tasks with small amounts
62+
of data. We then demonstrate that meta-learning enables simultaneously
63+
training to multiple large organic molecule datasets. As a proof of concept,
64+
we examine the performance of a MLIP refit to a small drug-like molecule and
65+
show that pre-training potentials to multiple levels of theory with
66+
meta-learning improves performance. This difference in performance can be
67+
seen both in the reduced error and in the improved smoothness of the
68+
potential energy surface produced. We therefore show that meta-learning can
69+
utilize existing datasets with inconsistent QM levels of theory to produce
70+
models that are better at specializing to new datasets. This opens new
71+
routes for creating pre-trained, foundational models for interatomic
72+
potentials.
73+
accessed:
74+
- year: 2023
75+
month: 7
76+
day: 30
77+
author:
78+
- family: Allen
79+
given: Alice E. A.
80+
- family: Lubbers
81+
given: Nicholas
82+
- family: Matin
83+
given: Sakib
84+
- family: Smith
85+
given: Justin
86+
- family: Messerly
87+
given: Richard
88+
- family: Tretiak
89+
given: Sergei
90+
- family: Barros
91+
given: Kipton
92+
citation-key: allen_learning_2023
93+
issued:
94+
- year: 2023
95+
month: 7
96+
day: 8
97+
number: arXiv:2307.04012
98+
publisher: arXiv
99+
source: arXiv.org
100+
title: >-
101+
Learning Together: Towards foundational models for machine learning
102+
interatomic potentials with meta-learning
103+
title-short: Learning Together
104+
type: article
105+
URL: http://arxiv.org/abs/2307.04012
106+
50107
- id: aykol_rational_2021
51108
abstract: >-
52109
The rational solid-state synthesis of inorganic compounds is formulated as
@@ -1152,7 +1209,7 @@ references:
11521209
URL: https://www.nature.com/articles/s41524-022-00891-8
11531210
volume: '8'
11541211

1155-
- id: glawe_optimal_2016a
1212+
- id: glawe_optimal_2016
11561213
abstract: >-
11571214
Starting from the experimental data contained in the inorganic crystal
11581215
structure database, we use a statistical analysis to determine the
@@ -1177,7 +1234,7 @@ references:
11771234
given: E. K. U.
11781235
- family: Marques
11791236
given: Miguel A. L.
1180-
citation-key: glawe_optimal_2016a
1237+
citation-key: glawe_optimal_2016
11811238
container-title: New Journal of Physics
11821239
container-title-short: New J. Phys.
11831240
DOI: 10.1088/1367-2630/18/9/093011
@@ -1905,7 +1962,7 @@ references:
19051962
URL: https://www.nature.com/articles/nature17439
19061963
volume: '533'
19071964

1908-
- id: rupp_fast_2012a
1965+
- id: rupp_fast_2012
19091966
abstract: >-
19101967
We introduce a machine learning model to predict atomization energies of a
19111968
diverse set of organic molecules, based on nuclear charges and atomic
@@ -1930,7 +1987,7 @@ references:
19301987
- family: Lilienfeld
19311988
given: O. Anatole
19321989
non-dropping-particle: von
1933-
citation-key: rupp_fast_2012a
1990+
citation-key: rupp_fast_2012
19341991
container-title: Physical Review Letters
19351992
container-title-short: Phys. Rev. Lett.
19361993
DOI: 10.1103/PhysRevLett.108.058301
@@ -2371,6 +2428,45 @@ references:
23712428
type: article-journal
23722429
URL: http://arxiv.org/abs/1706.03762
23732430

2431+
- id: vonlilienfeld_retrospective_2020
2432+
abstract: >-
2433+
Over the last decade, we have witnessed the emergence of ever more machine
2434+
learning applications in all aspects of the chemical sciences. Here, we
2435+
highlight specific achievements of machine learning models in the field of
2436+
computational chemistry by considering selected studies of electronic
2437+
structure, interatomic potentials, and chemical compound space in
2438+
chronological order.
2439+
accessed:
2440+
- year: 2023
2441+
month: 7
2442+
day: 29
2443+
author:
2444+
- family: Lilienfeld
2445+
given: O. Anatole
2446+
non-dropping-particle: von
2447+
- family: Burke
2448+
given: Kieron
2449+
citation-key: vonlilienfeld_retrospective_2020
2450+
container-title: Nature Communications
2451+
container-title-short: Nat Commun
2452+
DOI: 10.1038/s41467-020-18556-9
2453+
ISSN: 2041-1723
2454+
issue: '1'
2455+
issued:
2456+
- year: 2020
2457+
month: 9
2458+
day: 29
2459+
language: en
2460+
license: 2020 The Author(s)
2461+
number: '1'
2462+
page: '4895'
2463+
publisher: Nature Publishing Group
2464+
source: www.nature.com
2465+
title: Retrospective on a decade of machine learning for chemical discovery
2466+
type: article-journal
2467+
URL: https://www.nature.com/articles/s41467-020-18556-9
2468+
volume: '11'
2469+
23742470
- id: wang_predicting_2021
23752471
abstract: >-
23762472
We propose an efficient high-throughput scheme for the discovery of stable

0 commit comments

Comments
 (0)