add 2023-03-16-all-mp-tasks.zip figshare url

janosh · janosh · commit 937663048691 · 2023-11-29T19:06:49.000-08:00
add up/down arrow to metrics table columns indicating whether higher or lower is better
tweak global table CSS
fix metrics-tables not sortable by clicking column headers
diff --git a/data/figshare/1.0.0.json b/data/figshare/1.0.0.json
@@ -47,6 +47,10 @@
     "mp_trj_extxyz_by_yuan": [
       "https://figshare.com/ndownloader/files/43302033",
       "2023-11-22-mp-trj-extxyz-by-yuan.zip"
+    ],
+    "all_mp_tasks": [
+      "https://figshare.com/ndownloader/files/43350447",
+      "2023-03-16-all-mp-tasks.zip"
     ]
   },
   "article": "https://figshare.com/articles/dataset/22715158",
diff --git a/data/mp/get_mp_traj.py b/data/mp/get_mp_traj.py
@@ -1,7 +1,8 @@
 """Download all MP ionic steps using direct read-access to the mp_core DB.
 
 Gzipped JSON is ~15GB.
-On a good connection, takes about 15 min per batch * 140 batches = 35 h.
+On a good connection, takes about 15 min per batch * 140 batches = 35 h to download
+all 1.6M task docs.
 """
 
 
@@ -71,7 +72,7 @@
 
 os.makedirs(f"{module_dir}/mp-tasks", exist_ok=True)
 # Iterate over task_ids in batches
-desc = "Loading MP task docs"
+desc = "Fetching MP task docs..."
 pbar = trange(0, len(task_ids), batch_size, desc=desc, unit_scale=batch_size)
 for start_idx in pbar:
     # Define start and end indices for batch
@@ -114,7 +115,7 @@
 # %% use gzip CLI to check all files for archive corruption
 for path in tqdm(glob(f"{module_dir}/mp-tasks/*.json.gz")):
     try:
-        subprocess.run(["gzip", "-t", path], check=True)
+        subprocess.run(["gzip", "--test", path], check=True)
     except subprocess.CalledProcessError as exc:
         print(f"{path} raised {exc.stderr}")
-        # os.remove(path)
+        # os.remove(path)  # delete corrupted file
diff --git a/matbench_discovery/data.py b/matbench_discovery/data.py
@@ -251,6 +251,8 @@ def _on_not_found(self, key: str, msg: str) -> None:  # type: ignore[override]
     mace_checkpoint = "2023-08-14-mace-yuan-trained-mptrj-04.model"
 
     mp_trj_extxyz = "mp/2023-11-22-mp-trj-extxyz-by-yuan.zip"
+    # snapshot of every task (calculation) in MP as of 2023-03-16 (14 GB)
+    all_mp_tasks = "mp/2023-03-16-all-mp-tasks.zip"
 
     mace_checkpoint1 = "2023-08-14-mace-2M-yuan-mptrj-04.model"
     mace_checkpoint2 = "2023-10-29-mace-16M-pbenner-mptrj-no-conditional-loss"
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -606,6 +606,8 @@ def cumulative_metrics(
 
     # largest number of materials predicted stable by any model, determines x-axis range
     n_max_pred_stable = (df_preds < stability_threshold).sum().max()
+    # use log2-spaced sampling to get higher sampling density (at equal file size) at
+    # the start of the discovery campaign where model performance fluctuates more
     longest_xs = np.logspace(0, np.log2(n_max_pred_stable - 1), n_points, base=2)
     for metric in metrics:
         dfs[metric].index = longest_xs
diff --git a/scripts/metrics-which-is-better.json b/scripts/metrics-which-is-better.json
@@ -1,15 +1,18 @@
 {
   "higher_is_better": [
+    "Acc",
+    "Accuracy",
+    "AUC",
     "DAF",
-    "R2",
+    "F1",
+    "Prec",
     "Precision",
+    "R2",
     "Recall",
-    "F1",
-    "Accuracy",
-    "TPR",
+    "TN",
     "TNR",
     "TP",
-    "TN"
+    "TPR"
   ],
   "lower_is_better": ["MAE", "RMSE", "FPR", "FNR", "FP", "FN"]
 }
diff --git a/scripts/model_figs/make_metrics_tables.py b/scripts/model_figs/make_metrics_tables.py
@@ -136,9 +136,12 @@
             )
     df_filtered = df_table.T[show_cols]  # only keep columns we want to show
 
+    # abbreviate long column names: Precision, Accuracy -> Prec, Acc
+    df_filtered = df_filtered.rename(columns={"Precision": "Prec", "Accuracy": "Acc"})
+
     if label == "-first-10k":
         # hide redundant metrics for first 10k preds (all TPR = 1, TNR = 0)
-        df_filtered = df_filtered.drop(["TPR", "TNR"], axis=1)
+        df_filtered = df_filtered.drop(["TPR", "TNR"], axis="columns")
 
     styler = (
         df_filtered.style.format(
@@ -154,6 +157,13 @@
             cmap="viridis_r", subset=list(lower_is_better & {*df_filtered})
         )
     )
+    arrow_suffix = dict.fromkeys(higher_is_better, " ↑") | dict.fromkeys(
+        lower_is_better, " ↓"
+    )
+    styler.relabel_index(
+        [f"{col}{arrow_suffix.get(col, '')}" for col in df_filtered],
+        axis="columns",
+    )
 
     # export model metrics as styled HTML table and Svelte component
     # get index of MAE column
diff --git a/site/src/app.css b/site/src/app.css
@@ -122,7 +122,9 @@ img {
 
 table {
   display: block;
+  width: max-content;
   max-width: 100%;
+  margin: auto;
   overflow: scroll;
   border-collapse: collapse;
 }
diff --git a/site/src/figs/metrics-table-first-10k.svelte b/site/src/figs/metrics-table-first-10k.svelte
diff --git a/site/src/figs/metrics-table.svelte b/site/src/figs/metrics-table.svelte
diff --git a/site/src/routes/contribute/+page.md b/site/src/routes/contribute/+page.md
@@ -25,12 +25,13 @@
     wbm_summary:
       `Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.`,
     mp_trj_extxyz_by_yuan: `${mp_trj_link} converted to <code>ase</code>-compatible extended XYZ format and compressed (11.3 to 1.6 GB) by Yuan Chiang`,
+    all_mp_tasks: `Complete copy of the MP database on 2023-03-16 (release <a href="https://docs.materialsproject.org/changes/database-versions#v2022.10.28">v2022.10.28</a>)`,
   }
   const desc_keys = Object.keys(descriptions).sort()
   const figshare_keys = Object.keys(figshare_urls.files).sort()
   const missing = figshare_keys.filter((key) => !desc_keys.includes(key))
   if (missing.length > 0) {
-    throw `descriptions must contain all figshare_urls keys, missing=${missing}`
+    throw`descriptions must contain all figshare_urls keys, missing=${missing}`
   }
 </script>
 

Original file line number	Diff line number	Diff line change
`@@ -122,7 +122,9 @@ img {`
`122`	`122`
`123`	`123`	`table {`
`124`	`124`	`display: block;`
	`125`	`+ width: max-content;`
`125`	`126`	`max-width: 100%;`
	`127`	`+ margin: auto;`
`126`	`128`	`overflow: scroll;`
`127`	`129`	`border-collapse: collapse;`
`128`	`130`	`}`
Original file line number	Diff line number	Diff line change
`@@ -25,12 +25,13 @@`
`25`	`25`	`wbm_summary:`
`26`	`26`	`Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.`,
`27`	`27`	mp_trj_extxyz_by_yuan: `${mp_trj_link} converted to <code>ase</code>-compatible extended XYZ format and compressed (11.3 to 1.6 GB) by Yuan Chiang`,
	`28`	+ all_mp_tasks: `Complete copy of the MP database on 2023-03-16 (release <a href="https://docs.materialsproject.org/changes/database-versions#v2022.10.28">v2022.10.28</a>)`,
`28`	`29`	`}`
`29`	`30`	`const desc_keys = Object.keys(descriptions).sort()`
`30`	`31`	`const figshare_keys = Object.keys(figshare_urls.files).sort()`
`31`	`32`	`const missing = figshare_keys.filter((key) => !desc_keys.includes(key))`
`32`	`33`	`if (missing.length > 0) {`
`33`		- throw `descriptions must contain all figshare_urls keys, missing=${missing}`
	`34`	+ throw`descriptions must contain all figshare_urls keys, missing=${missing}`
`34`	`35`	`}`
`35`	`36`	`</script>`
`36`	`37`