Skip to content

Commit 9376630

Browse files
committed
add 2023-03-16-all-mp-tasks.zip figshare url
add up/down arrow to metrics table column indicating whether higher or lower is better; tweak global table CSS; fix metrics-tables not sortable by clicking column headers
1 parent e901031 commit 9376630

File tree

10 files changed

+64
-29
lines changed

10 files changed

+64
-29
lines changed

data/figshare/1.0.0.json

+4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
"mp_trj_extxyz_by_yuan": [
4848
"https://figshare.com/ndownloader/files/43302033",
4949
"2023-11-22-mp-trj-extxyz-by-yuan.zip"
50+
],
51+
"all_mp_tasks": [
52+
"https://figshare.com/ndownloader/files/43350447",
53+
"2023-03-16-all-mp-tasks.zip"
5054
]
5155
},
5256
"article": "https://figshare.com/articles/dataset/22715158",

data/mp/get_mp_traj.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""Download all MP ionic steps using direct read-access to the mp_core DB.
22
33
Gzipped JSON is ~15GB.
4-
On a good connection, takes about 15 min per batch * 140 batches = 35 h.
4+
On a good connection, takes about 15 min per batch * 140 batches = 35 h to download
5+
all 1.6M task docs.
56
"""
67

78

@@ -71,7 +72,7 @@
7172

7273
os.makedirs(f"{module_dir}/mp-tasks", exist_ok=True)
7374
# Iterate over task_ids in batches
74-
desc = "Loading MP task docs"
75+
desc = "Fetching MP task docs..."
7576
pbar = trange(0, len(task_ids), batch_size, desc=desc, unit_scale=batch_size)
7677
for start_idx in pbar:
7778
# Define start and end indices for batch
@@ -114,7 +115,7 @@
114115
# %% use gzip CLI to check all files for archive corruption
115116
for path in tqdm(glob(f"{module_dir}/mp-tasks/*.json.gz")):
116117
try:
117-
subprocess.run(["gzip", "-t", path], check=True)
118+
subprocess.run(["gzip", "--test", path], check=True)
118119
except subprocess.CalledProcessError as exc:
119120
print(f"{path} raised {exc.stderr}")
120-
# os.remove(path)
121+
# os.remove(path) # delete corrupted file

matbench_discovery/data.py

+2
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,8 @@ def _on_not_found(self, key: str, msg: str) -> None: # type: ignore[override]
251251
mace_checkpoint = "2023-08-14-mace-yuan-trained-mptrj-04.model"
252252

253253
mp_trj_extxyz = "mp/2023-11-22-mp-trj-extxyz-by-yuan.zip"
254+
# snapshot of every task (calculation) in MP as of 2023-03-16 (14 GB)
255+
all_mp_tasks = "mp/2023-03-16-all-mp-tasks.zip"
254256

255257
mace_checkpoint1 = "2023-08-14-mace-2M-yuan-mptrj-04.model"
256258
mace_checkpoint2 = "2023-10-29-mace-16M-pbenner-mptrj-no-conditional-loss"

matbench_discovery/plots.py

+2
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,8 @@ def cumulative_metrics(
606606

607607
# largest number of materials predicted stable by any model, determines x-axis range
608608
n_max_pred_stable = (df_preds < stability_threshold).sum().max()
609+
# use log2-spaced sampling to get higher sampling density at equal file size for
610+
# start of the discovery campaign where model performance fluctuates more
609611
longest_xs = np.logspace(0, np.log2(n_max_pred_stable - 1), n_points, base=2)
610612
for metric in metrics:
611613
dfs[metric].index = longest_xs

scripts/metrics-which-is-better.json

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
{
22
"higher_is_better": [
3+
"Acc",
4+
"Accuracy",
5+
"AUC",
36
"DAF",
4-
"R2",
7+
"F1",
8+
"Prec",
59
"Precision",
10+
"R2",
611
"Recall",
7-
"F1",
8-
"Accuracy",
9-
"TPR",
12+
"TN",
1013
"TNR",
1114
"TP",
12-
"TN"
15+
"TPR"
1316
],
1417
"lower_is_better": ["MAE", "RMSE", "FPR", "FNR", "FP", "FN"]
1518
}

scripts/model_figs/make_metrics_tables.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,12 @@
136136
)
137137
df_filtered = df_table.T[show_cols] # only keep columns we want to show
138138

139+
# abbreviate long column names: Precision, Accuracy -> Prec, Acc
140+
df_filtered = df_filtered.rename(columns={"Precision": "Prec", "Accuracy": "Acc"})
141+
139142
if label == "-first-10k":
140143
# hide redundant metrics for first 10k preds (all TPR = 1, TNR = 0)
141-
df_filtered = df_filtered.drop(["TPR", "TNR"], axis=1)
144+
df_filtered = df_filtered.drop(["TPR", "TNR"], axis="columns")
142145

143146
styler = (
144147
df_filtered.style.format(
@@ -154,6 +157,13 @@
154157
cmap="viridis_r", subset=list(lower_is_better & {*df_filtered})
155158
)
156159
)
160+
arrow_suffix = dict.fromkeys(higher_is_better, " ↑") | dict.fromkeys(
161+
lower_is_better, " ↓"
162+
)
163+
styler.relabel_index(
164+
[f"{col}{arrow_suffix.get(col, '')}" for col in df_filtered],
165+
axis="columns",
166+
)
157167

158168
# export model metrics as styled HTML table and Svelte component
159169
# get index of MAE column

site/src/app.css

+2
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,9 @@ img {
122122

123123
table {
124124
display: block;
125+
width: max-content;
125126
max-width: 100%;
127+
margin: auto;
126128
overflow: scroll;
127129
border-collapse: collapse;
128130
}

site/src/figs/metrics-table-first-10k.svelte

+13-8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

site/src/figs/metrics-table.svelte

+15-10
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

site/src/routes/contribute/+page.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,13 @@
2525
wbm_summary:
2626
`Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.`,
2727
mp_trj_extxyz_by_yuan: `${mp_trj_link} converted to <code>ase</code>-compatible extended XYZ format and compressed (11.3 to 1.6 GB) by Yuan Chiang`,
28+
all_mp_tasks: `Complete copy of the MP database on 2023-03-16 (release <a href="https://docs.materialsproject.org/changes/database-versions#v2022.10.28">v2022.10.28</a>)`,
2829
}
2930
const desc_keys = Object.keys(descriptions).sort()
3031
const figshare_keys = Object.keys(figshare_urls.files).sort()
3132
const missing = figshare_keys.filter((key) => !desc_keys.includes(key))
3233
if (missing.length > 0) {
33-
throw `descriptions must contain all figshare_urls keys, missing=${missing}`
34+
throw`descriptions must contain all figshare_urls keys, missing=${missing}`
3435
}
3536
</script>
3637

0 commit comments

Comments
 (0)