janosh
diff --git a/.pre-commit-config.yaml
Lines changed: 2 additions & 2 deletions b/.pre-commit-config.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/contributing.md
Lines changed: 1 addition & 1 deletion b/contributing.md
Lines changed: 1 addition & 1 deletion
diff --git a/data/training-sets.yml renamed to data/datasets.yml
Lines changed: 137 additions & 0 deletions b/data/training-sets.yml renamed to data/datasets.yml
Lines changed: 137 additions & 0 deletions
diff --git a/readme.md
Lines changed: 8 additions & 8 deletions b/readme.md
Lines changed: 8 additions & 8 deletions
diff --git a/scripts/evals/discovery.py
Lines changed: 4 additions & 4 deletions b/scripts/evals/discovery.py
Lines changed: 4 additions & 4 deletions
diff --git a/site/package.json
Lines changed: 13 additions & 10 deletions b/site/package.json
Lines changed: 13 additions & 10 deletions
diff --git a/site/src/app.d.ts
Lines changed: 3 additions & 3 deletions b/site/src/app.d.ts
Lines changed: 3 additions & 3 deletions
diff --git a/site/src/lib/HeatmapTable.svelte
Lines changed: 0 additions & 11 deletions b/site/src/lib/HeatmapTable.svelte
Lines changed: 0 additions & 11 deletions
@@ -79,8 +79,8 @@ repos:
         files: ^models/.+/.+\.yml$
         args: [--schemafile, tests/model-schema.yml]
       - id: check-jsonschema
-        files: ^data/training-sets\.yml$
-        args: [--schemafile, tests/training-set-schema.yml]
+        files: ^data/datasets\.yml$
+        args: [--schemafile, tests/dataset-schema.yml]
       - id: check-github-actions
 
   - repo: https://github.com/RobertCraigie/pyright-python
 
@@ -135,7 +135,7 @@ To submit a new model to this benchmark and add it to our leaderboard, please cr
      torch-geometric: 2.0.9
      ...
 
-   training_set: [MPtrj] # list of keys from data/training-sets.yml
+   training_set: [MPtrj] # list of keys from data/datasets.yml
 
    notes: # notes can have any key, be multiline and support markdown.
      description: This is how my model works...
 
@@ -91,6 +91,7 @@ GNoME:
   n_materials: 6_000_000
   open: false
   date_created: 2023-11-29 # https://github.com/google-deepmind/materials_discovery/commit/a701b9529
+  license: Apache 2.0
   params:
     method: DFT
     code: VASP
@@ -109,6 +110,7 @@ MatterSim:
   pressure_range: 0-1000 GPa
   open: false
   date_created: 2024-05-08
+  license: Unreleased
   params:
     method: DFT
     code: VASP
@@ -272,3 +274,138 @@ OpenLAM:
     This dataset integrates multidisciplinary DFT data sourced from Deep Modeling community (https://deepmodeling.com)
     and other open repositories to pre-train large atomic models (LAMs),
     while intentionally excluding overlap with WBM benchmark systems (e.g., Alex3D structures).
+
+OC20:
+  title: Open Catalyst 2020
+  url: https://opencatalystproject.org/leaderboard.html
+  download_url: https://fair-chem.github.io/core/datasets/oc20.html#per-adsorbate-trajectories
+  doi: https://doi.org/10.1021/acscatal.0c04525
+  n_structures: 133_934_018
+  open: true
+  date_created: 2020-10-01
+  license: CC BY 4.0
+  params:
+    method: DFT
+    code: VASP
+    functional: PBE+U
+    pseudopotentials: PBE
+  created_by:
+    - name: Open Catalyst Project
+      url: https://opencatalystproject.org
+  description: |
+    A dataset for catalysis research containing DFT relaxations of adsorbates on catalyst surfaces,
+    specifically designed for training ML models to predict adsorption energies and atomic forces.
+
+NOMAD:
+  title: NOMAD Repository
+  url: https://nomad-lab.eu/prod/v1/gui/search/entries/search/entries
+  doi: https://joss.theoj.org/papers/10.21105/joss.05388
+  n_structures: 19_111_098 # as of 2025-04-04
+  n_materials: 4_335_728
+  open: true
+  date_created: 2019-06-01
+  license: CC BY 4.0
+  params:
+    method: [DFT, ML]
+    code: Various
+    functional: Various
+    pseudopotentials: Various
+  created_by:
+    - name: NOMAD Team
+      url: https://nomad-lab.eu
+  description: |
+    A repository hosting over 19 million calculations across various computational materials science codes,
+    providing a rich source of DFT and molecular dynamics data for training ML models.
+
+AFLOW:
+  title: AFLOW Database
+  url: https://aflow.org
+  doi: https://doi.org/10.1016/j.commatsci.2012.02.005
+  n_structures: 3_530_330 # as of 2025-04-04
+  n_materials: 3_530_330 # unclear how AFLOW's material and structure counts differ, so both use the same value
+  open: true
+  date_created: 2012-06-01
+  license: Open
+  params:
+    method: DFT
+    code: VASP
+    functional: Various
+    pseudopotentials: Various
+  created_by:
+    - name: AFLOW Team
+      url: https://aflow.org
+  description: |
+    A database of over 3.5 million materials with calculated thermodynamic, electronic, and structural properties,
+    using standardized high-throughput DFT calculations.
+
+OQMD:
+  title: Open Quantum Materials Database
+  url: https://oqmd.org
+  doi: https://doi.org/10.1007/s11837-013-0755-4
+  download_url: https://static.oqmd.org/static/downloads/qmdb__v1_6__112023.sql.gz
+  n_materials: 1_226_781 # as of 2025-04-04
+  n_structures: 1_226_781
+  open: true
+  date_created: 2014-04-03
+  license: CC BY 4.0
+  params:
+    method: DFT
+    code: VASP
+    code_version: 5.3.2 # see settings page https://oqmd.org/documentation/vasp
+    functional: PBE
+    pseudopotentials: PBE
+    energy_cutoff: 520 eV
+    kpoint_density: 8000 KPPRA
+  created_by:
+    - name: OQMD Team
+      affiliation: Northwestern University
+      url: https://oqmd.org
+  description: |
+    A database of DFT-calculated thermodynamic and structural properties for over 1 million inorganic compounds,
+    focused on providing data for materials discovery and design. Calculations use a four-step relaxation scheme
+    with progressively increasing precision, with GGA+U for transition metals, lanthanides, and actinides in
+    compounds with oxygen. Spin-polarization is used for structures containing 3d or actinide elements.
+
+MatPES PBE:
+  title: MatPES v1.0 PBE
+  url: https://matpes.ai
+  download_url: https://s3.us-east-1.amazonaws.com/materialsproject-contribs/MatPES_2025_1/MatPES-PBE-2025.1.json.gz
+  n_structures: 434_712 # as of 2025-04-04
+  open: true
+  date_created: 2025-03-06
+  doi: https://doi.org/10.48550/arXiv.2503.04070
+  license: MIT
+  params:
+    method: DFT
+    code: VASP
+    code_version: 6.4.x
+    functional: PBE
+    pseudopotentials: PBE_64
+  created_by:
+    - name: MatPES Team
+      url: https://matpes.ai
+  description: |
+    A dataset containing 434,712 structures from ~300K molecular dynamics simulations,
+    providing potential energy surfaces that are valuable for training ML interatomic potentials.
+
+MatPES r2SCAN:
+  title: MatPES v1.0 r2SCAN
+  url: https://matpes.ai
+  download_url: https://s3.us-east-1.amazonaws.com/materialsproject-contribs/MatPES_2025_1/MatPES-R2SCAN-2025.1.json.gz
+  n_structures: 387_897 # as of 2025-04-04
+  open: true
+  date_created: 2025-03-06
+  doi: https://doi.org/10.48550/arXiv.2503.04070
+  license: MIT
+  params:
+    method: DFT
+    code: VASP
+    code_version: 6.4.x
+    functional: r2SCAN
+    pseudopotentials: PBE_64
+  created_by:
+    - name: MatPES Team
+      url: https://matpes.ai
+  description: |
+    A dataset containing 387,897 structures from ~300K molecular dynamics simulations,
+    providing potential energy surfaces that are valuable for training ML interatomic potentials.
@@ -1,7 +1,9 @@
-<h1 align="center" style="line-height: 0; margin-block: -1em 1em;">
-  <img src="https://github.com/janosh/matbench-discovery/raw/main/site/static/favicon.svg" alt="Logo" width="60px"><br>
-  Matbench Discovery
-</h1>
+<slot name="title">
+  <h1 align="center">
+    <img src="https://github.com/janosh/matbench-discovery/raw/main/site/static/favicon.svg" alt="Logo" width="60px"><br>
+    Matbench Discovery
+  </h1>
+</slot>
 
 <h4 align="center" class="toc-exclude" style="display: none;">
 
@@ -13,11 +15,9 @@
 
 </h4>
 
-<slot name="metrics_table" />
+Matbench Discovery is an [interactive leaderboard](https://janosh.github.io/matbench-discovery/models) and associated [PyPI package](https://pypi.org/project/matbench-discovery) which rank ML models on multiple tasks designed to simulate high-throughput discovery of new stable inorganic crystals, find their ground-state atomic positions and predict their thermal conductivity.
 
-Matbench Discovery is an [interactive leaderboard](https://janosh.github.io/matbench-discovery/models) and associated [PyPI package](https://pypi.org/project/matbench-discovery) which rank ML models on a task designed to simulate high-throughput discovery of new stable inorganic crystals as well as predicting some of their properties.
-
-We've tested <slot name="model_count" /> models covering multiple methodologies including graph neural network (GNN) interatomic potentials, GNN one-shot predictors, iterative Bayesian optimizers and random forests with shallow-learning structure fingerprints.
+We rank <slot name="model_count">20+</slot> models covering multiple methodologies including graph neural network (GNN) interatomic potentials, GNN one-shot predictors, iterative Bayesian optimizers and random forests with shallow-learning structure fingerprints.
 
 <slot name="best_report" />
 
 
@@ -80,8 +80,8 @@
     key for key, meta in MODEL_METADATA.items() if not model_is_compliant(meta)
 ]
 
-with open(f"{DATA_DIR}/training-sets.yml") as file:
-    TRAINING_SETS = yaml.safe_load(file)
+with open(f"{DATA_DIR}/datasets.yml") as file:
+    DATASETS = yaml.safe_load(file)
 
 # Add model metadata to df_metrics(_10k, _uniq_protos)
 models = discovery.df_metrics_uniq_protos.columns
@@ -131,10 +131,10 @@
             dataset_urls, dataset_tooltip_lines = {}, []
 
             for train_set in training_sets:
-                if isinstance(train_set, str) and train_set not in TRAINING_SETS:
+                if isinstance(train_set, str) and train_set not in DATASETS:
                     raise ValueError(f"Unknown training set {train_set=} for {model=}")
                 key = train_set if isinstance(train_set, str) else ""
-                dataset_info = TRAINING_SETS.get(key, train_set)
+                dataset_info = DATASETS.get(key, train_set)
                 n_structs = dataset_info["n_structures"]
                 n_materials = dataset_info.get("n_materials", n_structs)
 
 
@@ -21,14 +21,17 @@
     "@rollup/plugin-yaml": "^4.1.2",
     "@stylistic/eslint-plugin": "^4.2.0",
     "@sveltejs/adapter-static": "^3.0.8",
-    "@sveltejs/kit": "^2.20.2",
+    "@sveltejs/kit": "^2.20.4",
     "@sveltejs/vite-plugin-svelte": "^5.0.3",
+    "@types/js-yaml": "^4.0.9",
+    "@types/json-schema": "^7.0.15",
+    "@types/node": "^22.14.0",
     "d3-array": "^3.2.4",
     "d3-scale": "^4.0.2",
     "d3-scale-chromatic": "^3.1.0",
-    "elementari": "^0.3.0",
-    "eslint": "^9.23.0",
-    "eslint-plugin-svelte": "^3.3.3",
+    "elementari": "^0.3.3",
+    "eslint": "^9.24.0",
+    "eslint-plugin-svelte": "^3.5.1",
     "hastscript": "^9.0.1",
     "iconify-icon": "^2.3.0",
     "js-yaml": "^4.1.0",
@@ -44,20 +47,20 @@
     "rehype-stringify": "^10.0.1",
     "remark-math": "6.0.0",
     "remark-parse": "^11.0.0",
-    "remark-rehype": "^11.1.1",
-    "svelte": "^5.25.3",
+    "remark-rehype": "^11.1.2",
+    "svelte": "^5.25.7",
     "svelte-check": "^4.1.5",
     "svelte-multiselect": "11.0.0-rc.1",
     "svelte-preprocess": "^6.0.3",
     "svelte-toc": "^0.5.9",
     "svelte-zoo": "^0.4.17",
     "svelte2tsx": "^0.7.35",
     "tslib": "^2.8.1",
-    "typescript": "5.8.2",
-    "typescript-eslint": "^8.28.0",
+    "typescript": "5.8.3",
+    "typescript-eslint": "^8.29.0",
     "unified": "^11.0.5",
-    "vite": "^6.2.3",
-    "vitest": "^3.0.9"
+    "vite": "^6.2.5",
+    "vitest": "^3.1.1"
   },
   "prettier": {
     "semi": false,
 
@@ -13,10 +13,10 @@ declare module 'models/*.yml' {
   export default data
 } // model metadata files
 
-declare module 'data/training-sets.yml' {
-  const data: import('$lib').TrainingSet[]
+declare module '*/datasets.yml' {
+  const data: Record<string, import('$lib/types').Dataset>
   export default data
-} // training sets
+}
 
 declare module '*citation.cff' {
   const data: import('$lib').Citation
 
@@ -264,7 +264,6 @@
   .table-container::-webkit-scrollbar {
     display: none; /* Safari and Chrome */
   }
-
   th,
   td {
     padding: var(--heatmap-cell-padding, 1pt 3pt);
@@ -274,17 +273,14 @@
     overflow: hidden;
     text-overflow: ellipsis;
   }
-
   th {
     background: var(--heatmap-header-bg, var(--night));
     position: sticky;
     cursor: pointer;
   }
-
   th:hover {
     background: var(--heatmap-header-hover-bg, var(--night-lighter, #2a2a2a));
   }
-
   .sticky-col {
     position: sticky;
     left: 0;
@@ -294,20 +290,16 @@
   tr:nth-child(odd) td.sticky-col {
     background: var(--heatmap-row-odd-bg, rgb(15, 14, 14));
   }
-
   tbody tr:hover {
     filter: var(--heatmap-row-hover-filter, brightness(1.1));
   }
-
   td[data-sort-value] {
     cursor: default;
   }
-
   .group-header th {
     border-bottom: 1px solid black;
     text-align: center;
   }
-
   /* Styles for the table header with sort hint and controls */
   .table-header {
     display: flex;
@@ -319,19 +311,16 @@
     padding: 0.25rem 0;
     border-bottom: 1px solid rgba(255, 255, 255, 0.1);
   }
-
   .sort-hint {
     font-size: 0.85em;
     color: var(--text-muted, #aaa);
     margin: 0;
   }
-
   .controls-container {
     display: inline-flex;
     align-items: center;
     margin-left: auto;
   }
-
   .not-sortable {
     cursor: default;
   }
Original file line number	Diff line number	Diff line change
`@@ -264,7 +264,6 @@`
`264`	`264`	`.table-container::-webkit-scrollbar {`
`265`	`265`	`display: none; /* Safari and Chrome */`
`266`	`266`	`}`
`267`		`-`
`268`	`267`	`th,`
`269`	`268`	`td {`
`270`	`269`	`padding: var(--heatmap-cell-padding, 1pt 3pt);`
`@@ -274,17 +273,14 @@`
`274`	`273`	`overflow: hidden;`
`275`	`274`	`text-overflow: ellipsis;`
`276`	`275`	`}`
`277`		`-`
`278`	`276`	`th {`
`279`	`277`	`background: var(--heatmap-header-bg, var(--night));`
`280`	`278`	`position: sticky;`
`281`	`279`	`cursor: pointer;`
`282`	`280`	`}`
`283`		`-`
`284`	`281`	`th:hover {`
`285`	`282`	`background: var(--heatmap-header-hover-bg, var(--night-lighter, #2a2a2a));`
`286`	`283`	`}`
`287`		`-`
`288`	`284`	`.sticky-col {`
`289`	`285`	`position: sticky;`
`290`	`286`	`left: 0;`
`@@ -294,20 +290,16 @@`
`294`	`290`	`tr:nth-child(odd) td.sticky-col {`
`295`	`291`	`background: var(--heatmap-row-odd-bg, rgb(15, 14, 14));`
`296`	`292`	`}`
`297`		`-`
`298`	`293`	`tbody tr:hover {`
`299`	`294`	`filter: var(--heatmap-row-hover-filter, brightness(1.1));`
`300`	`295`	`}`
`301`		`-`
`302`	`296`	`td[data-sort-value] {`
`303`	`297`	`cursor: default;`
`304`	`298`	`}`
`305`		`-`
`306`	`299`	`.group-header th {`
`307`	`300`	`border-bottom: 1px solid black;`
`308`	`301`	`text-align: center;`
`309`	`302`	`}`
`310`		`-`
`311`	`303`	`/* Styles for the table header with sort hint and controls */`
`312`	`304`	`.table-header {`
`313`	`305`	`display: flex;`
`@@ -319,19 +311,16 @@`
`319`	`311`	`padding: 0.25rem 0;`
`320`	`312`	`border-bottom: 1px solid rgba(255, 255, 255, 0.1);`
`321`	`313`	`}`
`322`		`-`
`323`	`314`	`.sort-hint {`
`324`	`315`	`font-size: 0.85em;`
`325`	`316`	`color: var(--text-muted, #aaa);`
`326`	`317`	`margin: 0;`
`327`	`318`	`}`
`328`		`-`
`329`	`319`	`.controls-container {`
`330`	`320`	`display: inline-flex;`
`331`	`321`	`align-items: center;`
`332`	`322`	`margin-left: auto;`
`333`	`323`	`}`
`334`		`-`
`335`	`324`	`.not-sortable {`
`336`	`325`	`cursor: default;`
`337`	`326`	`}`