pnpm add -D katex remark-math for equation support in docs

janosh · janosh · commit 18be9dcb95ca · 2023-06-19T20:29:22.000-07:00
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -5,6 +5,7 @@
 from aviary.utils import as_dict_handler
 from aviary.wren.utils import get_aflow_label_from_spglib
 from mp_api.client import MPRester
+from pymatviz import density_scatter
 from tqdm import tqdm
 
 from matbench_discovery import today
@@ -19,7 +20,6 @@
 __author__ = "Janosh Riebesell"
 __date__ = "2022-08-13"
 
-
 module_dir = os.path.dirname(__file__)
 
 
@@ -33,22 +33,52 @@
     "structure",
     "symmetry",
     "energy_above_hull",
+    "decomposition_enthalpy",
+    "energy_type",
 ]
+
 with MPRester(use_document_model=False) as mpr:
-    docs = mpr.summary.search(fields=fields)
+    docs = mpr.thermo.search(fields=fields, thermo_types=["GGA_GGA+U"])
 
 print(f"{today}: {len(docs) = :,}")
 # 2022-08-13: len(docs) = 146,323
+# 2023-01-10: len(docs) = 154,718
 
 
 # %%
 df = pd.DataFrame(docs).set_index("material_id")
 df.pop("_id")
 
-df["spacegroup_number"] = df.pop("symmetry").map(lambda x: x.number)
+df.energy_type.value_counts().plot.pie(backend="matplotlib", autopct="%1.1f%%")
+
+
+# %%
+df["spacegroup_number"] = df.pop("symmetry").map(lambda x: x["number"])
 
 df["wyckoff_spglib"] = [get_aflow_label_from_spglib(x) for x in tqdm(df.structure)]
 
 df.to_json(f"{module_dir}/{today}-mp-energies.json.gz", default_handler=as_dict_handler)
 
 # df = pd.read_json(f"{module_dir}/2022-08-13-mp-energies.json.gz")
+
+
+# %% reproduce fig. 1b from https://arxiv.org/abs/2001.10591 (as data consistency check)
+ax = df.plot.scatter(
+    x="formation_energy_per_atom",
+    y="decomposition_enthalpy",
+    alpha=0.1,
+    backend="matplotlib",
+    xlim=[-5, 1],
+    ylim=[-1, 1],
+    color=df.decomposition_enthalpy.map(lambda x: "red" if x > 0 else "blue"),
+    title=f"{today} - {len(df):,} MP entries",
+)
+# result on 2023-01-10: plots match. no correlation between formation energy and decomposition
+# enthalpy. R^2 = -1.571, MAE = 1.604
+ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.png", dpi=300)
+
+ax = density_scatter(
+    df.formation_energy_per_atom,
+    df.decomposition_enthalpy,
+)
+ax.set(xlim=[-5, 1], ylim=[-1, 1])
diff --git a/data/wbm/analysis.py b/data/wbm/analysis.py
diff --git a/data/wbm/readme.md b/data/wbm/readme.md
@@ -1,12 +1,12 @@
 # WBM Dataset
 
-The **WBM dataset** was published in [Predicting stable crystalline compounds using chemical similarity][wbm paper] (Nature Computational Materials, Jan 2021, [doi:10.1038/s41524-020-00481-6](http://doi.org/10.1038/s41524-020-00481-6)). The authors generated 257,487 structures through single-element substitutions on Materials Project (MP) source structures. The replacement element was chosen based on chemical similarity determined by a matrix data-mined from the [Inorganic Crystal Structure Database (ICSD)](https://icsd.products.fiz-karlsruhe.de).
+The **WBM dataset** was published in [Predicting stable crystalline compounds using chemical similarity][wbm paper] (npj Computational Materials, Jan 2021, [doi:10.1038/s41524-020-00481-6](http://doi.org/10.1038/s41524-020-00481-6)). The authors generated 257,487 structures through single-element substitutions on Materials Project (MP) source structures. The replacement element was chosen based on chemical similarity determined by a matrix data-mined from the [Inorganic Crystal Structure Database (ICSD)](https://icsd.products.fiz-karlsruhe.de).
 
-The resulting novel structures were relaxed using MP-compatible VASP inputs (i.e. using `pymatgen`'s `MPRelaxSet`) and identical POTCARs in an attempt to create a database of Materials Project compatible novel crystals. Any degrade in model performance from training to test set should therefore largely be a result of extrapolation error rather than covariate shift in the underlying data.
+The resulting novel structures were relaxed using MP-compatible VASP inputs (i.e. using `pymatgen`'s `MPRelaxSet`) and identical POTCARs in an attempt to create a database of Materials Project compatible novel crystals. Any degradation in model performance from training to test set should therefore largely be a result of extrapolation error rather than covariate shift in the underlying data.
 
 The authors performed 5 rounds of elemental substitution in total, each time relaxing all generated structures and adding those found to lie on the convex hull back to the source pool. In total, ~20k or close to 10% were found to lie on the Materials Project convex hull.
 
-Since repeated substitutions should - on average - increase chemical dissimilarity, the 5 iterations of this data-generation process are a unique and compelling feature as it allows out-of distribution testing. We can check how model performance degrades when asked to predict on structures increasingly more dissimilar from the training set (which is restricted to the MP 2022 database release (or earlier) for all models in this benchmark).
+Since repeated substitutions should - on average - increase chemical dissimilarity, the 5 iterations of this data-generation process are a unique and compelling feature as they allow out-of-distribution testing. We can check how model performance degrades when asked to predict structures increasingly dissimilar from the training set (which is restricted to the MP 2022 database release (or earlier) for all models in this benchmark).
 
 ## 🆔 &thinsp; About the IDs
 
@@ -70,7 +70,7 @@ materialscloud:2021.68 includes a readme file with a description of the dataset,
 
 [wbm paper]: https://nature.com/articles/s41524-020-00481-6
 
-## 📊 &thinsp; Data Plots
+## 📊 &thinsp; Plots
 
 <caption>Heatmap of WBM training set element counts</caption>
 <slot name="wbm-elements-heatmap">
diff --git a/site/package.json b/site/package.json
@@ -25,13 +25,18 @@
     "@typescript-eslint/parser": "^5.48.0",
     "eslint": "^8.31.0",
     "eslint-plugin-svelte3": "^4.0.0",
+    "hast-util-from-string": "^2.0.0",
+    "hast-util-select": "^5.0.3",
+    "hast-util-to-string": "^2.0.0",
     "hastscript": "^7.2.0",
     "highlight.js": "^11.7.0",
+    "katex": "^0.16.4",
     "mdsvex": "^0.10.6",
     "prettier": "^2.8.2",
     "prettier-plugin-svelte": "^2.9.0",
     "rehype-autolink-headings": "^6.1.1",
     "rehype-slug": "^5.1.0",
+    "remark-math": "^3.0.0",
     "svelte": "^3.55.0",
     "svelte-check": "^3.0.1",
     "svelte-github-corner": "^0.2.0",
diff --git a/site/src/app.css b/site/src/app.css
@@ -7,7 +7,7 @@
   --toc-li-padding: 4pt 1ex;
   --toc-mobile-btn-color: white;
   --toc-desktop-nav-margin: 0 0 0 1em;
-  --toc-min-width: 20em;
+  --toc-min-width: 16em;
   --toc-active-bg: darkcyan;
 
   --ghc-color: var(--night);
diff --git a/site/src/app.html b/site/src/app.html
@@ -25,7 +25,13 @@
 
     <link rel="icon" href="/favicon.svg" />
     <link rel="stylesheet" href="/prism-vsc-dark-plus.css" />
+    <!-- interactive plots -->
     <script src="https://cdn.plot.ly/plotly-2.14.0.min.js"></script>
+    <!-- math display -->
+    <link
+      rel="stylesheet"
+      href="https://cdn.jsdelivr.net/npm/katex@0.16.4/dist/katex.min.css"
+    />
 
     %sveltekit.head%
   </head>
diff --git a/site/src/routes/how-to-contribute/+page.md b/site/src/routes/how-to-contribute/+page.md
@@ -2,7 +2,7 @@
 
 ## 🔨 &thinsp; Installation
 
-The recommended way to acquire the train and test data for this benchmark is through its Python package [available onPyPI](https://pypi.org/project/matbench-discovery):
+The recommended way to acquire the train and test data for this benchmark is through its Python package [available on PyPI](https://pypi.org/project/matbench-discovery):
 
 ```zsh
 pip install matbench-discovery
@@ -124,7 +124,7 @@ To deploy a new model on this benchmark and add it to our leaderboard, please cr
 
    Arbitrary other keys can be added as needed.
 
-Please see any of subdirectories in [`models/`](https://github.com/janosh/matbench-discovery/tree/main/models) for example submissions. More detailed step-by-step instructions below:
+Please see any of the subdirectories in [`models/`](https://github.com/janosh/matbench-discovery/tree/main/models) for example submissions. More detailed step-by-step instructions below:
 
 ### Step 1: Clone the repo
 
@@ -142,7 +142,7 @@ Create a new folder
 mkdir models/<model_name>
 ```
 
-and place the above listed files there. The file structure should look like this:
+and place the above-listed files there. The file structure should look like this:
 
 ```txt
 matbench-discovery-root
@@ -155,7 +155,7 @@ matbench-discovery-root
         └── train_<model_name>.py # optional
 ```
 
-You can include arbitrary other supporting files like metadata, model features (below 10MB to keep `git clone` time low) if they are needed to run the model or help others reproduce your results. For larger files, please upload to [Figshare](https://figshare.com) or similar and link them somewhere in your files.
+You can include arbitrary other supporting files like metadata and model features (below 10MB to keep `git clone` time low) if they are needed to run the model or help others reproduce your results. For larger files, please upload to [Figshare](https://figshare.com) or similar and link them somewhere in your files.
 
 ### Step 3: Create a PR to the [Matbench Discovery repo](https://github.com/janosh/matbench-discovery)
 
@@ -168,6 +168,6 @@ git commit -m 'add <model_name> to Matbench Discovery leaderboard'
 
 And you're done! Once tests pass and the PR is merged, your model will be added to the leaderboard! 🎉
 
-## Troubleshooting
+## 😵‍💫 &thinsp; Troubleshooting
 
 Having problems using or contributing to the project? Please [open an issue on GitHub](https://github.com/janosh/matbench-discovery/issues). We're happy to help!
diff --git a/site/svelte.config.js b/site/svelte.config.js
@@ -1,14 +1,31 @@
 import adapter from '@sveltejs/adapter-static'
+import { fromString } from 'hast-util-from-string'
+import { selectAll } from 'hast-util-select'
+import { toString } from 'hast-util-to-string'
 import { s } from 'hastscript'
+import katex from 'katex'
 import { mdsvex } from 'mdsvex'
-import linkHeadings from 'rehype-autolink-headings'
-import headingSlugs from 'rehype-slug'
+import link_headings from 'rehype-autolink-headings'
+import heading_slugs from 'rehype-slug'
+import math from 'remark-math'
 import preprocess from 'svelte-preprocess'
 
 const rehypePlugins = [
-  headingSlugs,
+  // from https://github.com/kwshi/rehype-katex-svelte
+  (options = {}) =>
+    (tree) => {
+      for (const node of selectAll(`.math-inline,.math-display`, tree)) {
+        const displayMode = node.properties?.className?.includes(`math-display`)
+        const rendered = katex.renderToString(toString(node), {
+          ...options,
+          displayMode,
+        })
+        fromString(node, `{@html ${JSON.stringify(rendered)}}`)
+      }
+    },
+  heading_slugs,
   [
-    linkHeadings,
+    link_headings,
     {
       behavior: `append`,
       test: [`h2`, `h3`, `h4`, `h5`, `h6`], // don't auto-link <h1>
@@ -30,6 +47,9 @@ export default {
     preprocess(),
     mdsvex({
       rehypePlugins,
+      // remark-math@3.0.0 pinned due to mdsvex, see
+      // https://github.com/kwshi/rehype-katex-svelte#usage
+      remarkPlugins: [math],
       extensions: [`.svx`, `.md`],
     }),
   ],