Skip to content

Commit afe6ce9

Browse files
committed
color hull distance scatter plots by density
set ModelCards background color from interpolatePuOr based on the currently selected sorting metric (purple: best, orange: worst)
1 parent 3aad858 commit afe6ce9

File tree

11 files changed

+164
-37
lines changed

11 files changed

+164
-37
lines changed

.github/workflows/test-scripts.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
matrix:
1616
script:
1717
- scripts/model_figs/make_metrics_tables.py
18-
- scripts/analyze_element_errors.py
18+
- scripts/model_figs/per_element_errors.py
1919
steps:
2020
- name: Check out repository
2121
uses: actions/checkout@v3

scripts/model_figs/per_element_errors.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,11 @@
5050
# %% compute number of samples per element in training set
5151
# counting element occurrences not weighted by composition, assuming models don't learn
5252
# much more about iron and oxygen from Fe2O3 than from FeO
53-
df_elem_err = pd.read_json(
54-
f"{ROOT}/site/src/routes/about-the-data/mp-element-counts-occurrence.json",
55-
typ="series",
56-
)
53+
counts_path = f"{ROOT}/site/src/routes/about-the-data/mp-element-counts-occurrence.json"
54+
df_elem_err = pd.read_json(counts_path, typ="series")
5755
train_count_col = "MP Occurrences"
5856
df_elem_err = df_elem_err.reset_index(name=train_count_col).set_index("index")
57+
df_elem_err.index.name = "symbol"
5958

6059

6160
# %%

scripts/model_figs/scatter_e_above_hull_models.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import numpy as np
1111
import plotly.express as px
12+
import scipy.stats
1213
from pymatviz.utils import add_identity_line, bin_df_cols, save_fig
1314

1415
from matbench_discovery import FIGS, PDF_FIGS
@@ -119,26 +120,34 @@
119120

120121

121122
# %% plot all models in separate subplots
122-
domain = (-4, 7)
123123
n_cols = 2
124124
n_rows = math.ceil(len(models) / n_cols)
125125

126+
127+
def get_density(xs: np.ndarray, ys: np.ndarray) -> np.ndarray:
128+
"""Get kernel density estimate for each (x, y) point."""
129+
return scipy.stats.gaussian_kde([xs, ys])([xs, ys])
130+
131+
132+
# scatter plot of DFT vs predicted hull distance
126133
fig = px.scatter(
127134
df_bin,
128135
x=each_true_col,
129136
y=each_pred_col,
130137
facet_col=facet_col,
131138
facet_col_wrap=n_cols,
139+
color=get_density(df_bin[each_true_col], df_bin[each_pred_col]),
132140
facet_col_spacing=0.02,
133141
facet_row_spacing=0.04,
134142
hover_data=hover_cols,
135143
hover_name=df_preds.index.name,
136-
color=clf_col,
144+
# color=clf_col,
137145
color_discrete_map=clf_color_map,
138146
# opacity=0.4,
139-
range_x=domain,
147+
range_x=(domain := (-4, 7)),
140148
range_y=domain,
141149
category_orders={facet_col: legend_order},
150+
color_continuous_scale="turbo",
142151
)
143152

144153
x_title = fig.layout.xaxis.title.text # used in annotations below
@@ -147,7 +156,7 @@
147156
# iterate over subplots and set new title
148157
for idx, anno in enumerate(fig.layout.annotations, 1):
149158
traces = [t for t in fig.data if t.xaxis == f"x{idx if idx > 1 else ''}"]
150-
assert len(traces) in (0, 4), f"Plots be empty or have 4 traces, got {len(traces)=}"
159+
# assert len(traces) in (0, 4), f"Plots must have 0 or 4 traces, got {len(traces)=}"
151160

152161
model = anno.text.split("=", 1)[1]
153162
assert model in df_preds, f"Unexpected {model=} not in {list(df_preds)=}"
@@ -219,9 +228,10 @@
219228
textangle=-90,
220229
**axis_titles,
221230
)
222-
fig.layout.height = 1000
231+
fig.layout.height = 200 * n_rows
232+
fig.layout.coloraxis.showscale = False
223233
# fig.layout.width = 1100
224-
fig.layout.margin.update(l=40, r=10, t=10, b=50)
234+
fig.layout.margin.update(l=40, r=10, t=30, b=60)
225235
fig.update_xaxes(matches=None)
226236
fig.update_yaxes(matches=None)
227237
fig.show()

site/package.json

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
"@sveltejs/vite-plugin-svelte": "^2.4.3",
2525
"@typescript-eslint/eslint-plugin": "^6.2.0",
2626
"@typescript-eslint/parser": "^6.2.0",
27+
"d3-scale-chromatic": "^3.0.0",
2728
"elementari": "^0.2.2",
2829
"eslint": "^8.45.0",
2930
"eslint-plugin-svelte": "^2.32.4",

site/src/figs/each-scatter-models-5x2.svelte

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

site/src/lib/index.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ export type ModelStats = {
3636
missing_preds: number
3737
missing_percent: number
3838
Accuracy: number
39-
'Run Time (h)': string
39+
'Run Time (h)': number
4040
TPR: number
4141
TNR: number
4242
DAF: number

site/src/routes/+layout.svelte

+5-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,11 @@
8282

8383
<slot />
8484

85-
<PrevNext items={routes} current="/{url?.split(`/`)[1]}" style="margin-top: 4em;">
85+
<PrevNext
86+
items={routes}
87+
current="/{url?.split(`/`)[1]}"
88+
style="margin: 4em auto 1em; max-width: 60em;"
89+
>
8690
<a slot="next" let:item={href} {href} class="link">{href} &raquo;</a>
8791
<a slot="prev" let:item={href} {href} class="link">&laquo; {href}</a>
8892
</PrevNext>

site/src/routes/about-the-data/+page.svelte

+5-4
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@
2222
let active_mp_elem: ChemicalElement
2323
let active_wbm_elem: ChemicalElement
2424
const count_mode_ops = [`occurrence`, `composition`]
25-
let count_mode = [count_mode_ops[0]]
25+
let count_mode = count_mode_ops[0]
2626
27-
$: mp_elem_counts = elem_counts[`./mp-element-counts-${count_mode[0]}.json`]
28-
$: wbm_elem_counts = elem_counts[`./wbm-element-counts-${count_mode[0]}.json`]
27+
$: mp_elem_counts = elem_counts[`./mp-element-counts-${count_mode}.json`]
28+
$: wbm_elem_counts = elem_counts[`./wbm-element-counts-${count_mode}.json`]
2929
3030
export const snapshot: Snapshot = {
3131
capture: () => ({ color_scale, log, count_mode }),
@@ -65,7 +65,8 @@
6565
>
6666
<Select
6767
id="count-mode"
68-
bind:selected={count_mode}
68+
selected={[count_mode]}
69+
bind:value={count_mode}
6970
options={count_mode_ops}
7071
minSelect={1}
7172
maxSelect={1}

site/src/routes/models/+page.svelte

+24-10
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import type { ModelStatLabel, ModelStats } from '$lib'
33
import { ModelCard } from '$lib'
44
import Icon from '@iconify/svelte'
5+
import { interpolatePuOr } from 'd3-scale-chromatic'
56
import { RadioButtons, Tooltip } from 'svelte-zoo'
67
import { flip } from 'svelte/animate'
78
import { fade } from 'svelte/transition'
@@ -13,9 +14,9 @@
1314
let sort_by: keyof ModelStats | 'model_name' = `F1`
1415
let show_details: boolean = false
1516
let order: 'asc' | 'desc' = `desc`
16-
let show_n_best: number = 8 // show only best models
17+
let show_n_best: number = data.models.length // show only best models
1718
const min_models: number = 2
18-
$: sort_factor = { asc: -1, desc: 1 }[order]
19+
const lower_is_better = [`RMSE`, `MAE`, `Run Time (h)`]
1920
2021
$: models = data.models.sort((model_1, model_2) => {
2122
const [val_1, val_2] = [model_1[sort_by], model_2[sort_by]]
@@ -24,7 +25,7 @@
2425
} else if (typeof val_1 == `number`) {
2526
return sort_factor * (val_2 - val_1)
2627
} else {
27-
console.error(`Sorting by key ${sort_by} gives unknown type: ${typeof val_1}`)
28+
throw `Sorting by key ${sort_by} gives unknown type: ${typeof val_1}`
2829
}
2930
})
3031
const stats: ModelStatLabel[] = [
@@ -44,6 +45,16 @@
4445
capture: () => ({ show_details, sort_by, order, show_n_best }),
4546
restore: (values) => ({ show_details, sort_by, order, show_n_best } = values),
4647
}
48+
49+
$: sort_factor = { asc: -1, desc: 1 }[order]
50+
$: min_val = Math.min(...models.map((model) => model[sort_by] as number))
51+
$: max_val = Math.max(...models.map((model) => model[sort_by] as number))
52+
$: if (lower_is_better.includes(sort_by)) [min_val, max_val] = [max_val, min_val]
53+
$: order = lower_is_better.includes(sort_by) ? `asc` : `desc`
54+
55+
function bg_color(val: number, min: number, max: number) {
56+
return interpolatePuOr(1 - (val - min) / (max - min)).replace(`)`, `, 0.3)`)
57+
}
4758
</script>
4859

4960
<div style="margin: 3vw;">
@@ -74,27 +85,30 @@
7485
</ul>
7586

7687
<ol>
77-
{#each models.slice(0, Math.max(min_models, show_n_best)) as data (data.model_name)}
88+
{#each models.slice(0, Math.max(min_models, show_n_best)) as model (model.model_name)}
7889
<li
7990
animate:flip={{ duration: 400 }}
8091
in:fade={{ delay: 100 }}
8192
out:fade={{ delay: 100 }}
93+
style="background-color: {bg_color(model[sort_by], min_val, max_val)};"
8294
>
83-
<ModelCard {data} {stats} {sort_by} bind:show_details />
84-
{#if data.training_set}
95+
<ModelCard data={model} {stats} {sort_by} bind:show_details />
96+
{#if model.training_set}
8597
<!-- maybe show this text in a tooltip: This model was not trained on the
86-
canonical training set. It's results should not be seen as a one-to-one
87-
comparison to the other models but rather proof of concept of what is possible. -->
98+
canonical training set. It's results should not be seen as a one-to-one
99+
comparison to the other models but rather proof of concept of what is possible. -->
88100
<strong class="train-set">
89101
<Icon icon="ion:ios-warning" inline />
90-
Custom training set: {data.training_set}
102+
Custom training set: {model.training_set}
91103
</strong>
92104
{/if}
93105
</li>
94106
{/each}
95107
</ol>
96108

97-
<h2 style="margin-top: 6em;">Per-Element Model Error Heatmaps</h2>
109+
<h2 style="margin: 4em auto 1em; text-align: center;">
110+
Per-Element Model Error Heatmaps
111+
</h2>
98112

99113
<ElementErrorsPtableHeatmap />
100114
</div>

site/src/routes/models/element-errors-ptable-heatmap.svelte

+7-5
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,13 @@
4242
}
4343
</script>
4444

45-
This periodic table is shaded by the MAE for the model-predicted convex hull distance for
46-
each element. The errors for every structure in the test set are projected onto the
47-
fraction of each element in the composition and averaged over all structures. The error is
48-
the absolute difference per atom between predicted and actual energy distance to the
49-
convex hull.
45+
<p style="max-width: 45em; margin: auto;">
46+
This periodic table is shaded by the MAE for the model-predicted convex hull distance
47+
for each element. The errors for every structure in the test set are projected onto the
48+
fraction of each element in the composition and averaged over all structures. The error
49+
is the absolute difference per atom between predicted and actual energy distance to the
50+
convex hull.
51+
</p>
5052

5153
<MultiSelect bind:selected={current_model} options={models} maxSelect={1} minSelect={1} />
5254

site/src/routes/preprint/references.yaml

+100-4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,63 @@ references:
4747
type: chapter
4848
URL: https://doi.org/10.1007/978-94-011-4653-1_21
4949

50+
- id: allen_learning_2023
51+
abstract: >-
52+
The development of machine learning models has led to an abundance of
53+
datasets containing quantum mechanical (QM) calculations for molecular and
54+
material systems. However, traditional training methods for machine learning
55+
models are unable to leverage the plethora of data available as they require
56+
that each dataset be generated using the same QM method. Taking machine
57+
learning interatomic potentials (MLIPs) as an example, we show that
58+
meta-learning techniques, a recent advancement from the machine learning
59+
community, can be used to fit multiple levels of QM theory in the same
60+
training process. Meta-learning changes the training procedure to learn a
61+
representation that can be easily re-trained to new tasks with small amounts
62+
of data. We then demonstrate that meta-learning enables simultaneously
63+
training to multiple large organic molecule datasets. As a proof of concept,
64+
we examine the performance of a MLIP refit to a small drug-like molecule and
65+
show that pre-training potentials to multiple levels of theory with
66+
meta-learning improves performance. This difference in performance can be
67+
seen both in the reduced error and in the improved smoothness of the
68+
potential energy surface produced. We therefore show that meta-learning can
69+
utilize existing datasets with inconsistent QM levels of theory to produce
70+
models that are better at specializing to new datasets. This opens new
71+
routes for creating pre-trained, foundational models for interatomic
72+
potentials.
73+
accessed:
74+
- year: 2023
75+
month: 7
76+
day: 30
77+
author:
78+
- family: Allen
79+
given: Alice E. A.
80+
- family: Lubbers
81+
given: Nicholas
82+
- family: Matin
83+
given: Sakib
84+
- family: Smith
85+
given: Justin
86+
- family: Messerly
87+
given: Richard
88+
- family: Tretiak
89+
given: Sergei
90+
- family: Barros
91+
given: Kipton
92+
citation-key: allen_learning_2023
93+
issued:
94+
- year: 2023
95+
month: 7
96+
day: 8
97+
number: arXiv:2307.04012
98+
publisher: arXiv
99+
source: arXiv.org
100+
title: >-
101+
Learning Together: Towards foundational models for machine learning
102+
interatomic potentials with meta-learning
103+
title-short: Learning Together
104+
type: article
105+
URL: http://arxiv.org/abs/2307.04012
106+
50107
- id: aykol_rational_2021
51108
abstract: >-
52109
The rational solid-state synthesis of inorganic compounds is formulated as
@@ -1152,7 +1209,7 @@ references:
11521209
URL: https://www.nature.com/articles/s41524-022-00891-8
11531210
volume: '8'
11541211

1155-
- id: glawe_optimal_2016a
1212+
- id: glawe_optimal_2016
11561213
abstract: >-
11571214
Starting from the experimental data contained in the inorganic crystal
11581215
structure database, we use a statistical analysis to determine the
@@ -1177,7 +1234,7 @@ references:
11771234
given: E. K. U.
11781235
- family: Marques
11791236
given: Miguel A. L.
1180-
citation-key: glawe_optimal_2016a
1237+
citation-key: glawe_optimal_2016
11811238
container-title: New Journal of Physics
11821239
container-title-short: New J. Phys.
11831240
DOI: 10.1088/1367-2630/18/9/093011
@@ -1905,7 +1962,7 @@ references:
19051962
URL: https://www.nature.com/articles/nature17439
19061963
volume: '533'
19071964

1908-
- id: rupp_fast_2012a
1965+
- id: rupp_fast_2012
19091966
abstract: >-
19101967
We introduce a machine learning model to predict atomization energies of a
19111968
diverse set of organic molecules, based on nuclear charges and atomic
@@ -1930,7 +1987,7 @@ references:
19301987
- family: Lilienfeld
19311988
given: O. Anatole
19321989
non-dropping-particle: von
1933-
citation-key: rupp_fast_2012a
1990+
citation-key: rupp_fast_2012
19341991
container-title: Physical Review Letters
19351992
container-title-short: Phys. Rev. Lett.
19361993
DOI: 10.1103/PhysRevLett.108.058301
@@ -2371,6 +2428,45 @@ references:
23712428
type: article-journal
23722429
URL: http://arxiv.org/abs/1706.03762
23732430

2431+
- id: vonlilienfeld_retrospective_2020
2432+
abstract: >-
2433+
Over the last decade, we have witnessed the emergence of ever more machine
2434+
learning applications in all aspects of the chemical sciences. Here, we
2435+
highlight specific achievements of machine learning models in the field of
2436+
computational chemistry by considering selected studies of electronic
2437+
structure, interatomic potentials, and chemical compound space in
2438+
chronological order.
2439+
accessed:
2440+
- year: 2023
2441+
month: 7
2442+
day: 29
2443+
author:
2444+
- family: Lilienfeld
2445+
given: O. Anatole
2446+
non-dropping-particle: von
2447+
- family: Burke
2448+
given: Kieron
2449+
citation-key: vonlilienfeld_retrospective_2020
2450+
container-title: Nature Communications
2451+
container-title-short: Nat Commun
2452+
DOI: 10.1038/s41467-020-18556-9
2453+
ISSN: 2041-1723
2454+
issue: '1'
2455+
issued:
2456+
- year: 2020
2457+
month: 9
2458+
day: 29
2459+
language: en
2460+
license: 2020 The Author(s)
2461+
number: '1'
2462+
page: '4895'
2463+
publisher: Nature Publishing Group
2464+
source: www.nature.com
2465+
title: Retrospective on a decade of machine learning for chemical discovery
2466+
type: article-journal
2467+
URL: https://www.nature.com/articles/s41467-020-18556-9
2468+
volume: '11'
2469+
23742470
- id: wang_predicting_2021
23752471
abstract: >-
23762472
We propose an efficient high-throughput scheme for the discovery of stable

0 commit comments

Comments
 (0)