fix bad column name in join_mace_results.py

janosh · janosh · commit d7f300b8b73b · 2023-09-03T11:09:29.000+02:00
update readme
diff --git a/models/m3gnet/join_m3gnet_results.py b/models/m3gnet/join_m3gnet_results.py
@@ -57,8 +57,8 @@
 df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
     "material_id"
 )
-
-df_cse["cse"] = [
+entry_col = "computed_structure_entry"
+df_cse[entry_col] = [
     ComputedStructureEntry.from_dict(dct)
     for dct in tqdm(df_cse.computed_structure_entry)
 ]
@@ -71,22 +71,22 @@
     mat_id, struct_dict, m3gnet_energy, *_ = row
     mlip_struct = Structure.from_dict(struct_dict)
     df_m3gnet.at[mat_id, struct_col] = mlip_struct  # noqa: PD008
-    cse = df_cse.loc[mat_id, "cse"]
+    cse = df_cse.loc[mat_id, entry_col]
     cse._energy = m3gnet_energy  # cse._energy is the uncorrected energy
     cse._structure = mlip_struct
-    df_m3gnet.loc[mat_id, "cse"] = cse
+    df_m3gnet.loc[mat_id, entry_col] = cse
 
 
 # %% apply energy corrections
 out = MaterialsProject2020Compatibility().process_entries(
-    df_m3gnet.cse, verbose=True, clean=True
+    df_m3gnet[entry_col], verbose=True, clean=True
 )
 assert len(out) == len(df_m3gnet)
 
 
 # %% compute corrected formation energies
 df_m3gnet["e_form_per_atom_m3gnet"] = [
-    get_e_form_per_atom(cse) for cse in tqdm(df_m3gnet.cse)
+    get_e_form_per_atom(cse) for cse in tqdm(df_m3gnet[entry_col])
 ]
 
 
diff --git a/models/mace/join_mace_results.py b/models/mace/join_mace_results.py
@@ -30,7 +30,7 @@
 # %%
 module_dir = os.path.dirname(__file__)
 task_type = "IS2RE"
-date = "2023-08-14"
+date = "2023-09-02"
 glob_pattern = f"{date}-mace-wbm-{task_type}*/*.json.gz"
 file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
 print(f"Found {len(file_paths):,} files for {glob_pattern = }")
@@ -77,7 +77,7 @@
 
 # %% apply energy corrections
 out = MaterialsProject2020Compatibility().process_entries(
-    df_mace.cse, verbose=True, clean=True
+    df_mace[entry_col], verbose=True, clean=True
 )
 assert len(out) == len(df_mace)
 
@@ -96,7 +96,8 @@
 
 # %%
 bad_mask = (df_wbm[e_form_col] - df_wbm[e_form_mace_col]).abs() > 10
-ax = density_scatter(df=df_wbm[bad_mask], x=e_form_col, y=e_form_mace_col)
+print(f"{sum(bad_mask)=}")
+ax = density_scatter(df=df_wbm[~bad_mask], x=e_form_col, y=e_form_mace_col)
 
 
 # %%
diff --git a/models/mace/test_mace.py b/models/mace/test_mace.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+import torch
 import wandb
 from ase.constraints import ExpCellFilter
 from ase.optimize import FIRE, LBFGS
@@ -36,6 +37,7 @@
 # model_name = "2023-07-14-mace-ilyes-trained-MPF-2021-2-8-big-128-6"
 # MACE trained on CHGNet training set by Yuan Chiang
 model_name = "2023-08-14-mace-yuan-trained-mptrj-04"
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 slurm_vars = slurm_submit(
     job_name=job_name,
@@ -64,12 +66,13 @@
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
 e_pred_col = "mace_energy"
+id_col = "material_id"
 max_steps = 500
 force_max = 0.05  # Run until the forces are smaller than this in eV/A
 checkpoint = f"{ROOT}/models/mace/{model_name}.model"
 
 df_in: pd.DataFrame = np.array_split(
-    pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
+    pd.read_json(data_path).set_index(id_col), slurm_array_task_count
 )[slurm_array_task_id - 1]
 
 run_params = dict(
@@ -83,14 +86,15 @@
     relax_cell=relax_cell,
     force_max=force_max,
     ase_optimizer=ase_optimizer,
+    device=device,
 )
 
 run_name = f"{job_name}-{slurm_array_task_id}"
 wandb.init(project="matbench-discovery", name=run_name, config=run_params)
 
 
 # %%
-mace_calc = MACECalculator(checkpoint, device="cuda", default_dtype="float32")
+mace_calc = MACECalculator(checkpoint, device=device, default_dtype="float32")
 relax_results: dict[str, dict[str, Any]] = {}
 input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
@@ -130,18 +134,18 @@
         )
 
         relax_results[material_id] = {
-            "mace_structure": mace_struct,
-            "mace_energy": mace_energy,
-            "mace_trajectory": mace_traj,  # Add the trajectory to the results
+            "structure": mace_struct,
+            "energy": mace_energy,
+            "trajectory": mace_traj,
         }
     except Exception as exc:
         print(f"Failed to relax {material_id}: {exc!r}")
         continue
 
 
 # %%
-df_out = pd.DataFrame(relax_results).T
-df_out.index.name = "material_id"
+df_out = pd.DataFrame(relax_results).T.add_prefix("mace_")
+df_out.index.name = id_col
 
 df_out.reset_index().to_json(out_path, default_handler=as_dict_handler)
 
diff --git a/readme.md b/readme.md
@@ -5,15 +5,15 @@
 
 <h4 align="center" class="toc-exclude">
 
+[![arXiv](https://img.shields.io/badge/arXiv-2308.14920-blue)](https://arxiv.org/abs/2308.14920)
 [![Tests](https://github.com/janosh/matbench-discovery/actions/workflows/test.yml/badge.svg)](https://github.com/janosh/matbench-discovery/actions/workflows/test.yml)
 [![GitHub Pages](https://github.com/janosh/matbench-discovery/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/janosh/matbench-discovery/actions/workflows/gh-pages.yml)
-[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/janosh/matbench-discovery/main.svg?badge_token=Qza33izjRxSbegTqeSyDvA)](https://results.pre-commit.ci/latest/github/janosh/matbench-discovery/main?badge_token=Qza33izjRxSbegTqeSyDvA)
 [![Requires Python 3.9+](https://img.shields.io/badge/Python-3.9+-blue.svg?logo=python&logoColor=white)](https://python.org/downloads)
 [![PyPI](https://img.shields.io/pypi/v/matbench-discovery?logo=pypi&logoColor=white)](https://pypi.org/project/matbench-discovery?logo=pypi&logoColor=white)
 
 </h4>
 
-> TL;DR: We benchmark ML models on crystal stability prediction from unrelaxed structures finding universal interatomic potentials (UIP) like [M3GNet](https://github.com/materialsvirtuallab/m3gnet) and [CHGNet](https://github.com/CederGroupHub/chgnet) to be highly accurate, robust across chemistries and ready for production use in high-throughput discovery pipelines.
+> TL;DR: We benchmark ML models on crystal stability prediction from unrelaxed structures, finding universal interatomic potentials (UIP) like [CHGNet](https://github.com/CederGroupHub/chgnet), [M3GNet](https://github.com/materialsvirtuallab/m3gnet) and [MACE](https://github.com/ACEsuit/mace) to be highly accurate, robust across chemistries and ready for production use in high-throughput discovery pipelines.
 
 Matbench Discovery is an [interactive leaderboard](https://janosh.github.io/matbench-discovery/models) and associated [PyPI package](https://pypi.org/project/matbench-discovery) which together make it easy to rank ML energy models on a task designed to closely simulate a high-throughput discovery campaign for new stable inorganic crystals.