janosh
diff --git a/‎.github/pull_request_template.md
Lines changed: 1 addition & 1 deletion b/‎.github/pull_request_template.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml
Lines changed: 3 additions & 3 deletions b/‎.pre-commit-config.yaml
Lines changed: 3 additions & 3 deletions
diff --git a/‎contributing.md
Lines changed: 1 addition & 1 deletion b/‎contributing.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎data/mp/build_phase_diagram.py
Lines changed: 4 additions & 2 deletions b/‎data/mp/build_phase_diagram.py
Lines changed: 4 additions & 2 deletions
diff --git a/‎data/mp/get_mp_energies.py
Lines changed: 7 additions & 7 deletions b/‎data/mp/get_mp_energies.py
Lines changed: 7 additions & 7 deletions
diff --git a/‎data/pmg_structs_to_ase_extxyz.py
Lines changed: 4 additions & 2 deletions b/‎data/pmg_structs_to_ase_extxyz.py
Lines changed: 4 additions & 2 deletions
diff --git a/‎data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
Lines changed: 7 additions & 7 deletions b/‎data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
Lines changed: 7 additions & 7 deletions
diff --git a/‎data/wbm/compile_wbm_test_set.py
Lines changed: 0 additions & 13 deletions b/‎data/wbm/compile_wbm_test_set.py
Lines changed: 0 additions & 13 deletions
diff --git a/‎data/wbm/eda_wbm.py
Lines changed: 11 additions & 6 deletions b/‎data/wbm/eda_wbm.py
Lines changed: 11 additions & 6 deletions
diff --git a/‎matbench_discovery/data-files.yml
Lines changed: 4 additions & 10 deletions b/‎matbench_discovery/data-files.yml
Lines changed: 4 additions & 10 deletions
diff --git a/‎matbench_discovery/data.py
Lines changed: 1 addition & 1 deletion b/‎matbench_discovery/data.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎matbench_discovery/enums.py
Lines changed: 5 additions & 9 deletions b/‎matbench_discovery/enums.py
Lines changed: 5 additions & 9 deletions
diff --git a/‎matbench_discovery/metrics/geo_opt.py
Lines changed: 3 additions & 3 deletions b/‎matbench_discovery/metrics/geo_opt.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎matbench_discovery/remote/fetch.py
Lines changed: 1 addition & 1 deletion b/‎matbench_discovery/remote/fetch.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎matbench_discovery/remote/figshare.py
Lines changed: 2 additions & 2 deletions b/‎matbench_discovery/remote/figshare.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎matbench_discovery/structure/symmetry.py
Lines changed: 3 additions & 1 deletion b/‎matbench_discovery/structure/symmetry.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎models/alignn/test_alignn_discovery.py
Lines changed: 1 addition & 1 deletion b/‎models/alignn/test_alignn_discovery.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎models/alignn/train_alignn.py
Lines changed: 4 additions & 4 deletions b/‎models/alignn/train_alignn.py
Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@ Please check the following items before submitting your PR:
 - [ ] I have created a new folder and YAML metadata file `models/<arch_name>/<model_variant>.yml` for my submission. `arch_name` is the name of the architecture and `model_variant.yml` includes things like author details, training set names and important hyperparameters.
 - [ ] I have added my new model as a new attribute on the [`Model.<arch_name>` enum](https://github.com/janosh/matbench-discovery/blob/57d0d0c8a14cd317/matbench_discovery/enums.py#L274) in `enums.py`.
 - [ ] I have uploaded the energy/force/stress model prediction file for the WBM test set to Figshare or another cloud storage service (`<yyyy-mm-dd>-<model_variant>-preds.csv.gz`).
-- [ ] I have uploaded the model-relaxed structures file to Figshare or another cloud storage service (`<yyyy-mm-dd>-wbm-IS2RE-FIRE.json.gz`).
+- [ ] I have uploaded the model-relaxed structures file to Figshare or another cloud storage service in [JSON lines format](https://jsonlines.org) (`<yyyy-mm-dd>-wbm-IS2RE-FIRE.jsonl.gz`). JSON Lines allows fast loading of small numbers of structures with `pandas.read_json(lines=True, nrows=100)` for inspection.
 - [ ] I have uploaded the phonon predictions to Figshare or another cloud storage service (`<yyyy-mm-dd>-kappa-103-FIRE-<values-of-dist|fmax|symprec>.gz`).
 - [ ] I have included the urls to the Figshare files in the YAML metadata file (`models/<arch_name>/<model_variant>.yml`). If not using Figshare I have included the urls to the cloud storage service in the description of the PR.
 - [ ] I have included the test script (`test_<arch_name>_<task>.py` for `task` in `discovery`, `kappa`, `diatomics`) that generated the prediction files.
 
@@ -8,7 +8,7 @@ default_install_hook_types: [pre-commit, commit-msg]
 
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.0
+    rev: v0.11.2
     hooks:
       - id: ruff
         args: [--fix]
@@ -57,7 +57,7 @@ repos:
         exclude: ^(site/src/figs/.+\.svelte|data/wbm/20.+\..+|site/src/(routes|figs).+\.(yaml|json)|changelog.md)$
 
   - repo: https://github.com/pre-commit/mirrors-eslint
-    rev: v9.22.0
+    rev: v9.23.0
     hooks:
       - id: eslint
         types: [file]
@@ -84,7 +84,7 @@ repos:
       - id: check-github-actions
 
   - repo: https://github.com/RobertCraigie/pyright-python
-    rev: v1.1.396
+    rev: v1.1.397
     hooks:
       - id: pyright
         args: [--level, error]
@@ -17,7 +17,7 @@ To submit a new model to this benchmark and add it to our leaderboard, please cr
 
 1. You should share your model's predictions through a cloud storage service (we recommend [Figshare](https://figshare.com)) and include the download links in your PR description. Your cloud storage directory should contain files in a compressed format with the following naming convention: `<arch-name>/<model-variant>/<yyyy-mm-dd>-<eval-task>.{csv.gz|json.gz}`. For example, in the case of MACE-MP-0, the file paths would be:
 
-   - geometry optimization: `mace/mace-mp-0/2023-12-11-wbm-IS2RE-FIRE.json.gz`
+   - geometry optimization: `mace/mace-mp-0/2023-12-11-wbm-IS2RE-FIRE.jsonl.gz` (use [JSON Lines format](https://jsonlines.org) for fast loading of small numbers of structures with `pandas.read_json(lines=True, nrows=100)` for inspection)
    - discovery: `mace/mace-mp-0/2023-12-11-wbm-IS2RE.csv.gz`
    - phonons: `mace/mace-mp-0/2024-11-09-kappa-103-FIRE-dist=0.01-fmax=1e-4-symprec=1e-5.json.gz`
 
 
@@ -33,8 +33,10 @@
 df_mp_cse.index.name = Key.mat_id
 df_mp_cse.index = [e.entry_id for e in df_mp_cse.entry]
 df_mp_cse.reset_index().to_json(
-    f"{module_dir}/{today}-mp-computed-structure-entries.json.gz",
+    f"{module_dir}/{today}-mp-computed-structure-entries.jsonl.gz",
     default_handler=lambda x: x.as_dict(),
+    orient="records",
+    lines=True,
 )
 
 
@@ -74,7 +76,7 @@
 
 # %% build phase diagram with both MP entries + WBM entries
 wbm_cse_path = DataFiles.wbm_computed_structure_entries.path
-df_wbm = pd.read_json(wbm_cse_path).set_index(Key.mat_id)
+df_wbm = pd.read_json(wbm_cse_path, lines=True).set_index(Key.mat_id)
 
 # using ComputedStructureEntry vs ComputedEntry here is important as CSEs receive
 # more accurate energy corrections that take into account peroxide/superoxide nature
 
@@ -60,21 +60,21 @@
 
 
 # %%
-df_cse = pd.read_json(DataFiles.mp_computed_structure_entries.path).set_index(
+df_mp_cse = pd.read_json(DataFiles.mp_computed_structure_entries.path).set_index(
     Key.mat_id
 )
 
-df_cse[Key.structure] = [
+df_mp_cse[Key.structure] = [
     Structure.from_dict(cse[Key.structure])
-    for cse in tqdm(df_cse.entry, desc="Hydrating structures")
+    for cse in tqdm(df_mp_cse.entry, desc="Hydrating structures")
 ]
-df_cse[f"{Key.protostructure}_moyo"] = [
+df_mp_cse[f"{Key.protostructure}_moyo"] = [
     prototype.get_protostructure_label(struct)
-    for struct in tqdm(df_cse.structure, desc="Calculating proto-structure labels")
+    for struct in tqdm(df_mp_cse.structure, desc="Calculating proto-structure labels")
 ]
 # make sure symmetry detection succeeded for all structures
-assert df_cse[f"{Key.protostructure}_moyo"].str.startswith("invalid").sum() == 0
-df_mp[f"{Key.protostructure}_moyo"] = df_cse[f"{Key.protostructure}_moyo"]
+assert df_mp_cse[f"{Key.protostructure}_moyo"].str.startswith("invalid").sum() == 0
+df_mp[f"{Key.protostructure}_moyo"] = df_mp_cse[f"{Key.protostructure}_moyo"]
 
 spg_nums = df_mp[f"{Key.protostructure}_moyo"].str.split("_").str[2].astype(int)
 # make sure all our spacegroup numbers match MP's
 
@@ -53,7 +53,9 @@
 
 # %% convert WBM initial structures to ASE Atoms (no properties other than material ID
 # included in Atoms.info)
-df_wbm_init = pd.read_json(DataFiles.wbm_initial_structures.path).set_index(Key.mat_id)
+df_wbm_init = pd.read_json(DataFiles.wbm_initial_structures.path, lines=True).set_index(
+    Key.mat_id
+)
 
 wbm_init_atoms_list: list[Atoms] = []
 for mat_id, struct_dict in tqdm(df_wbm_init[Key.init_struct].items(), desc="WBM init"):
@@ -68,7 +70,7 @@
 # %% convert WBM ComputedStructureEntries to ASE Atoms (material ID and energy included
 # in Atoms.info)
 wbm_cse_path = DataFiles.wbm_computed_structure_entries.path
-df_wbm_cse = pd.read_json(wbm_cse_path).set_index(Key.mat_id)
+df_wbm_cse = pd.read_json(wbm_cse_path, lines=True).set_index(Key.mat_id)
 
 wbm_cse_atoms_list: list[Atoms] = []
 for mat_id, cse_dict in tqdm(
 
@@ -23,29 +23,29 @@
 from matbench_discovery.enums import DataFiles
 
 wbm_cse_path = DataFiles.wbm_computed_structure_entries.path
-df_cse = pd.read_json(wbm_cse_path).set_index(Key.mat_id)
+df_wbm_cse = pd.read_json(wbm_cse_path, lines=True).set_index(Key.mat_id)
 
 cses = [
     ComputedStructureEntry.from_dict(dct)
     for dct in tqdm(
-        df_cse[Key.computed_structure_entry],
+        df_wbm_cse[Key.computed_structure_entry],
         desc="Loading ComputedStructureEntries",
     )
 ]
 
 ces = [
     ComputedEntry.from_dict(dct)
     for dct in tqdm(
-        df_cse[Key.computed_structure_entry], desc="Loading ComputedEntries"
+        df_wbm_cse[Key.computed_structure_entry], desc="Loading ComputedEntries"
     )
 ]
 
 
 # %%
 processed = MaterialsProject2020Compatibility().process_entries(cses, verbose=True)
-assert len(processed) == len(df_cse)
+assert len(processed) == len(df_wbm_cse)
 processed = MaterialsProject2020Compatibility().process_entries(ces, verbose=True)
-assert len(processed) == len(df_cse)
+assert len(processed) == len(df_wbm_cse)
 
 df_wbm["e_form_per_atom_mp2020_from_ce"] = [
     get_e_form_per_atom(entry)
@@ -66,9 +66,9 @@
 
 # %%
 processed = MaterialsProjectCompatibility().process_entries(cses, verbose=True)
-assert len(processed) == len(df_cse)
+assert len(processed) == len(df_wbm_cse)
 processed = MaterialsProjectCompatibility().process_entries(ces, verbose=True)
-assert len(processed) == len(df_cse)
+assert len(processed) == len(df_wbm_cse)
 
 df_wbm["e_form_per_atom_legacy_from_ce"] = [
     get_e_form_per_atom(entry) for entry in tqdm(ces)
 
@@ -703,16 +703,3 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 
 # %% write final summary data to disk (yeah!)
 df_summary.round(6).to_csv(f"{WBM_DIR}/{today}-wbm-summary.csv.gz")
-
-
-# %% only here to load data for later inspection
-if False:
-    df_summary = pd.read_csv(DataFiles.wbm_summary.path).set_index(Key.mat_id)
-    df_wbm = pd.read_json(DataFiles.wbm_cses_plus_init_structs.path).set_index(
-        Key.mat_id
-    )
-
-    df_wbm[Key.computed_structure_entry] = [
-        ComputedStructureEntry.from_dict(dct)
-        for dct in tqdm(df_wbm[Key.computed_structure_entry])
-    ]
@@ -359,15 +359,20 @@
 
 
 # %%
-df_wbm_structs = pd.read_json(DataFiles.wbm_cses_plus_init_structs.path)
-df_wbm_structs = df_wbm_structs.set_index(Key.mat_id)
+df_wbm_init_structs = pd.read_json(DataFiles.wbm_initial_structures.path, lines=True)
+df_wbm_init_structs = df_wbm_init_structs.set_index(Key.mat_id)
+
+df_wbm_final_structs = pd.read_json(
+    DataFiles.wbm_computed_structure_entries.path, lines=True
+)
+df_wbm_final_structs = df_wbm_final_structs.set_index(Key.mat_id)
 
 
 # %%
 for wbm_id in df_sym_change.index:
-    init_struct = Structure.from_dict(df_wbm_structs.loc[wbm_id][Key.init_struct])
+    init_struct = Structure.from_dict(df_wbm_init_structs.loc[wbm_id][Key.init_struct])
     final_struct = Structure.from_dict(
-        df_wbm_structs.loc[wbm_id][Key.computed_structure_entry]["structure"]
+        df_wbm_final_structs.loc[wbm_id][Key.computed_structure_entry]["structure"]
     )
     init_struct.properties[Key.mat_id] = f"{wbm_id}-init"
     final_struct.properties[Key.mat_id] = f"{wbm_id}-final"
@@ -379,11 +384,11 @@
 wbm_id = df_sym_change.index[0]
 
 struct = Structure.from_dict(
-    df_wbm_structs.loc[wbm_id][Key.computed_structure_entry]["structure"]
+    df_wbm_final_structs.loc[wbm_id][Key.computed_structure_entry]["structure"]
 )
 struct.to(f"{module_dir}/{wbm_id}.cif")
 struct.to(f"{module_dir}/{wbm_id}.json")
 
-struct = Structure.from_dict(df_wbm_structs.loc[wbm_id][Key.init_struct])
+struct = Structure.from_dict(df_wbm_init_structs.loc[wbm_id][Key.init_struct])
 struct.to(f"{module_dir}/{wbm_id}-init.cif")
 struct.to(f"{module_dir}/{wbm_id}-init.json")
@@ -47,8 +47,8 @@ mp_trj_extxyz:
   md5: 7f433171e4e5f2ef9304dccd42d5488f
 
 wbm_computed_structure_entries:
-  url: https://figshare.com/files/40344463
-  path: wbm/2022-10-19-wbm-computed-structure-entries.json.bz2
+  url: https://figshare.com/files/53161832
+  path: wbm/2022-10-19-wbm-computed-structure-entries.jsonl.gz
   description: JSON-Serialized `pymatgen` [`ComputedStructureEntries`] containing all WBM DFT-relaxed structures and corresponding final energies
   md5: 481959b65f28150ae6ee7297ddeba538
 
@@ -59,8 +59,8 @@ wbm_relaxed_atoms:
   md5: 4726643ac0dfbab69a4284454c891e68
 
 wbm_initial_structures:
-  url: https://figshare.com/files/40344466
-  path: wbm/2022-10-19-wbm-init-structs.json.bz2
+  url: https://figshare.com/files/53161835
+  path: wbm/2022-10-19-wbm-init-structs.jsonl.gz
   description: Unrelaxed WBM structures in `pymatgen` `Structure` format
   md5: ff2c40a3a7bf65468852b67f0dbc67df
 
@@ -70,12 +70,6 @@ wbm_initial_atoms:
   description: Unrelaxed WBM structures as `ase` Atoms in extended XYZ format
   md5: 2a292211ca6acb30ed8416178d644098
 
-wbm_cses_plus_init_structs:
-  url: https://figshare.com/files/40344469
-  path: wbm/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2
-  description: Both unrelaxed and DFT-relaxed WBM structures, the latter stored with their final VASP energies as `pymatgen` [`ComputedStructureEntries`]
-  md5: eaabe984d070156cc50a8a075cd5e315
-
 wbm_summary:
   url: https://figshare.com/files/44225498
   path: wbm/2023-12-13-wbm-summary.csv.gz
 
@@ -333,7 +333,7 @@ def update_yaml_at_path(
     """
     # raise on repeated or trailing dots in dotted path
     if not re.match(r"^[a-zA-Z0-9-+=_]+(\.[a-zA-Z0-9-+=_]+)*$", dotted_path):
-        raise ValueError(f"Invalid dotted path: {dotted_path}")
+        raise ValueError(f"Invalid {dotted_path=}")
 
     with open(file_path) as file:
         yaml_data = round_trip_yaml.load(file)
 
@@ -458,7 +458,7 @@ class DataFiles(Files):
 
     mp_computed_structure_entries = (
         auto(),
-        ("mp/2023-02-07-mp-computed-structure-entries.json.gz"),
+        "mp/2023-02-07-mp-computed-structure-entries.json.gz",
     )
     mp_elemental_ref_entries = (
         auto(),
@@ -475,24 +475,20 @@ class DataFiles(Files):
 
     wbm_computed_structure_entries = (
         auto(),
-        ("wbm/2022-10-19-wbm-computed-structure-entries.json.bz2"),
+        "wbm/2022-10-19-wbm-computed-structure-entries.jsonl.gz",
     )
     wbm_relaxed_atoms = auto(), "wbm/2024-08-04-wbm-relaxed-atoms.extxyz.zip"
-    wbm_initial_structures = auto(), "wbm/2022-10-19-wbm-init-structs.json.bz2"
+    wbm_initial_structures = auto(), "wbm/2022-10-19-wbm-init-structs.jsonl.gz"
     wbm_initial_atoms = auto(), "wbm/2024-08-04-wbm-initial-atoms.extxyz.zip"
-    wbm_cses_plus_init_structs = (
-        auto(),
-        ("wbm/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"),
-    )
     wbm_summary = auto(), "wbm/2023-12-13-wbm-summary.csv.gz"
     alignn_checkpoint = auto(), "2023-06-02-pbenner-best-alignn-model.pth.zip"
     phonondb_pbe_103_structures = (
         auto(),
-        ("phonons/2024-11-09-phononDB-PBE-103-structures.extxyz"),
+        "phonons/2024-11-09-phononDB-PBE-103-structures.extxyz",
     )
     phonondb_pbe_103_kappa_no_nac = (
         auto(),
-        ("phonons/2024-11-09-kappas-phononDB-PBE-noNAC.json.gz"),
+        "phonons/2024-11-09-kappas-phononDB-PBE-noNAC.json.gz",
     )
     wbm_dft_geo_opt_symprec_1e_2 = (
         auto(),
 
@@ -87,13 +87,13 @@ def calc_geo_opt_metrics(df_model_analysis: pd.DataFrame) -> dict[str, float]:
     # Get relevant columns
     spg_diff = df_model_analysis[MbdKey.spg_num_diff]
     n_sym_ops_diff = df_model_analysis[MbdKey.n_sym_ops_diff]
-    rmsd = df_model_analysis[MbdKey.structure_rmsd_vs_dft]
+    rmsd_vals = df_model_analysis[MbdKey.structure_rmsd_vs_dft]
 
     # Count total number of structures (excluding NaN values)
     n_structs = len(spg_diff.dropna())
 
-    # Calculate RMSD and MAE metrics
-    mean_rmsd = rmsd.mean()
+    # Fill NaN values with 1.0 (the stol value we set in StructureMatcher)
+    mean_rmsd = rmsd_vals.fillna(1.0).mean()
     sym_ops_mae = n_sym_ops_diff.abs().mean()
 
     # Count cases where spacegroup changed
 
@@ -19,7 +19,7 @@ def download_file(file_path: str, url: str) -> None:
 
         response.raise_for_status()
 
-        with open(file_path, "wb") as file:
+        with open(file_path, mode="wb") as file:
             file.write(response.content)
     except requests.exceptions.RequestException:
         print(f"Error downloading {url=}\nto {file_path=}.\n{traceback.format_exc()}")
 
@@ -22,9 +22,9 @@
 DOWNLOAD_URL_PREFIX: Final = "https://figshare.com/files"
 ARTICLE_IDS: Final[dict[str, int | None]] = {
     "model_preds_discovery": 28187990,
-    "model_preds_geo_opt": 28187999,
+    "model_preds_geo_opt": 28642406,
     "model_preds_phonons": 28347251,
-    "model_preds_diatomics": 28437344,  # created 2024-02-13
+    "model_preds_diatomics": 28437344,
     "data_files": 22715158,
 }
 
 
@@ -110,7 +110,9 @@ def pred_vs_ref_struct_symmetry(
         df_sym_pred[Key.n_sym_ops] - df_sym_ref[Key.n_sym_ops]
     )
 
-    structure_matcher = StructureMatcher()
+    # scale=False and stol=1 are important for getting accurate distance of atomic
+    # positions from DFT-relaxed positions. details in https://github.com/janosh/matbench-discovery/issues/230
+    structure_matcher = StructureMatcher(stol=1.0, scale=False)
     ref_ids, pred_ids = set(ref_structs), set(pred_structs)
     shared_ids = ref_ids & pred_ids
     if len(shared_ids) == 0:
 
@@ -76,7 +76,7 @@
 }[task_type]
 input_col = {Task.IS2RE: Key.init_struct, Task.RS2RE: Key.final_struct}[task_type]
 
-df_in = pd.read_json(data_path).set_index(Key.mat_id)
+df_in = pd.read_json(data_path, lines=True).set_index(Key.mat_id)
 
 df_in[target_col] = df_wbm[target_col]
 if task_type == Task.RS2RE:
 
@@ -51,17 +51,17 @@
 
 
 # %% Load data
-df_cse = pd.read_json(DataFiles.mp_computed_structure_entries.path).set_index(
+df_mp_cse = pd.read_json(DataFiles.mp_computed_structure_entries.path).set_index(
     Key.mat_id
 )
-df_cse[Key.structure] = [
+df_mp_cse[Key.structure] = [
     Structure.from_dict(cse[Key.structure])
-    for cse in tqdm(df_cse.entry, desc="Structures from dict")
+    for cse in tqdm(df_mp_cse.entry, desc="Structures from dict")
 ]
 
 # load energies
 df_in = pd.read_csv(DataFiles.mp_energies.path).set_index(Key.mat_id)
-df_in[Key.structure] = df_cse[Key.structure]
+df_in[Key.structure] = df_mp_cse[Key.structure]
 if target_col not in df_in:
     raise TypeError(f"{target_col!s} not in {df_in.columns=}")