add join_mace_results.py

janosh · janosh · commit eb11ab0acdf1 · 2023-08-16T17:50:24.000-07:00
add props n_authors and first_name_mode to References.svelte
diff --git a/models/bowsr/join_bowsr_results.py b/models/bowsr/join_bowsr_results.py
@@ -65,7 +65,7 @@
 
 
 # %%
-out_path = f"{module_dir}/{glob_pattern.split('/*')[0]}"
+out_path = file_paths[0].rsplit("/", 1)[0]
 df_bowsr = df_bowsr.round(4)
 # save energy and formation energy as fast-loading CSV
 df_bowsr.select_dtypes("number").to_csv(f"{out_path}.csv")
diff --git a/models/bowsr/test_bowsr.py b/models/bowsr/test_bowsr.py
@@ -126,7 +126,7 @@
         try:
             struct_bowsr, energy_bowsr = optimizer.get_optimized_structure_and_energy()
         except Exception as exc:
-            print(f"Failed to relax {material_id}: {exc}")
+            print(f"Failed to relax {material_id}: {exc!r}")
 
         results = {
             f"e_form_per_atom_bowsr_{energy_model}": model.predict_energy(struct_bowsr),
diff --git a/models/chgnet/join_chgnet_results.py b/models/chgnet/join_chgnet_results.py
@@ -63,11 +63,11 @@
 
 
 # %%
-out_path = f"{module_dir}/{glob_pattern.split('/*')[0]}"
+out_path = file_paths[0].rsplit("/", 1)[0]
 df_chgnet = df_chgnet.round(4)
 df_chgnet.select_dtypes("number").to_csv(f"{out_path}.csv.gz")
 df_chgnet.reset_index().to_json(f"{out_path}.json.gz", default_handler=as_dict_handler)
 
-# in_path = f"{module_dir}/2023-03-04-chgnet-wbm-IS2RE.json.gz"
-# df_chgnet = pd.read_csv(in_path.replace(".json.gz", ".csv")).set_index("material_id")
-# df_chgnet = pd.read_json(in_path).set_index("material_id")
+# in_path = f"{module_dir}/2023-03-04-chgnet-wbm-IS2RE"
+# df_chgnet = pd.read_csv(f"{in_path}.csv.gz").set_index("material_id")
+# df_chgnet = pd.read_json(f"{in_path}.json.gz").set_index("material_id")
diff --git a/models/chgnet/test_chgnet.py b/models/chgnet/test_chgnet.py
@@ -104,7 +104,7 @@
             e_pred_col: relax_result["trajectory"].energies[-1],
         }
     except Exception as exc:
-        print(f"Failed to relax {material_id}: {exc}")
+        print(f"Failed to relax {material_id}: {exc!r}")
 
 
 # %%
diff --git a/models/m3gnet/join_m3gnet_results.py b/models/m3gnet/join_m3gnet_results.py
@@ -69,11 +69,11 @@
 cse: ComputedStructureEntry
 for row in tqdm(df_m3gnet.itertuples(), total=len(df_m3gnet)):
     mat_id, struct_dict, m3gnet_energy, *_ = row
-    m3gnet_struct = Structure.from_dict(struct_dict)
-    df_m3gnet.at[mat_id, struct_col] = m3gnet_struct  # noqa: PD008
+    mlip_struct = Structure.from_dict(struct_dict)
+    df_m3gnet.at[mat_id, struct_col] = mlip_struct  # noqa: PD008
     cse = df_cse.loc[mat_id, "cse"]
     cse._energy = m3gnet_energy  # cse._energy is the uncorrected energy
-    cse._structure = m3gnet_struct
+    cse._structure = mlip_struct
     df_m3gnet.loc[mat_id, "cse"] = cse
 
 
@@ -91,12 +91,12 @@
 
 
 # %%
-out_path = f"{module_dir}/{glob_pattern.split('/*')[0]}"
+out_path = file_paths[0].rsplit("/", 1)[0]
 df_m3gnet = df_m3gnet.round(4)
 df_m3gnet.select_dtypes("number").to_csv(f"{out_path}.csv.gz")
 df_m3gnet.reset_index().to_json(f"{out_path}.json.gz", default_handler=as_dict_handler)
 
 
-# in_path = f"{module_dir}/2022-10-31-m3gnet-wbm-IS2RE.json.gz"
-# df_m3gnet = pd.read_csv(in_path.replace(".json.gz", ".csv")).set_index("material_id")
-# df_m3gnet = pd.read_json(in_path).set_index("material_id")
+# in_path = f"{module_dir}/2022-10-31-m3gnet-wbm-IS2RE"
+# df_m3gnet = pd.read_csv(f"{in_path}.csv.gz").set_index("material_id")
+# df_m3gnet = pd.read_json(f"{in_path}.json.gz").set_index("material_id")
diff --git a/models/m3gnet/test_m3gnet.py b/models/m3gnet/test_m3gnet.py
@@ -112,7 +112,7 @@
             e_pred_col: relax_result["trajectory"].energies[-1],
         }
     except Exception as exc:
-        print(f"Failed to relax {material_id}: {exc}")
+        print(f"Failed to relax {material_id}: {exc!r}")
 
 
 # %%
diff --git a/models/mace/join_mace_results.py b/models/mace/join_mace_results.py
@@ -0,0 +1,114 @@
+"""Concatenate MACE results from multiple data files generated by slurm job array
+into single file.
+"""
+
+
+# %%
+from __future__ import annotations
+
+import os
+import warnings
+from glob import glob
+
+import pandas as pd
+from pymatgen.core import Structure
+from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+from pymatviz import density_scatter
+from tqdm import tqdm
+
+from matbench_discovery.data import DATA_FILES, as_dict_handler, df_wbm
+from matbench_discovery.energy import get_e_form_per_atom
+from matbench_discovery.preds import e_form_col
+
+__author__ = "Janosh Riebesell"
+__date__ = "2023-03-01"
+
+warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")
+
+
+# %%
+module_dir = os.path.dirname(__file__)
+task_type = "IS2RE"
+date = "2023-08-14"
+glob_pattern = f"{date}-mace-wbm-{task_type}*/*.json.gz"
+file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
+print(f"Found {len(file_paths):,} files for {glob_pattern = }")
+struct_col = "mace_structure"
+
+dfs: dict[str, pd.DataFrame] = {}
+
+
+# %%
+for file_path in tqdm(file_paths):
+    if file_path in dfs:
+        continue
+    df = pd.read_json(file_path).set_index("material_id")
+    # drop trajectory to save memory
+    dfs[file_path] = df.drop(columns="mace_trajectory")
+
+df_mace = pd.concat(dfs.values()).round(4)
+
+
+# %%
+df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
+    "material_id"
+)
+
+entry_col = "computed_structure_entry"
+df_cse[entry_col] = [
+    ComputedStructureEntry.from_dict(dct)
+    for dct in tqdm(df_cse.computed_structure_entry)
+]
+
+
+# %% transfer mace energies and relaxed structures to WBM CSEs since MP2020 energy
+# corrections applied below are structure-dependent (for oxides and sulfides)
+cse: ComputedStructureEntry
+for row in tqdm(df_mace.itertuples(), total=len(df_mace)):
+    mat_id, struct_dict, mace_energy, *_ = row
+    mlip_struct = Structure.from_dict(struct_dict)
+    df_mace.at[mat_id, struct_col] = mlip_struct  # noqa: PD008
+    cse = df_cse.loc[mat_id, entry_col]
+    cse._energy = mace_energy  # cse._energy is the uncorrected energy
+    cse._structure = mlip_struct
+    df_mace.loc[mat_id, entry_col] = cse
+
+
+# %% apply MP2020 energy corrections to the CSEs stored in df_mace[entry_col]
+out = MaterialsProject2020Compatibility().process_entries(
+    df_mace[entry_col], verbose=True, clean=True
+)
+assert len(out) == len(df_mace)
+
+
+# %% compute corrected formation energies
+e_form_mace_col = "e_form_per_atom_mace"
+df_mace["formula"] = df_wbm.formula
+df_mace[e_form_mace_col] = [
+    get_e_form_per_atom(dict(energy=cse.energy, composition=formula))
+    for formula, cse in tqdm(
+        df_mace.set_index("formula")[entry_col].items(), total=len(df_mace)
+    )
+]
+df_wbm[e_form_mace_col] = df_mace[e_form_mace_col]
+
+
+# %%
+bad_mask = (df_wbm[e_form_col] - df_wbm[e_form_mace_col]).abs() > 10
+ax = density_scatter(df=df_wbm[bad_mask], x=e_form_col, y=e_form_mace_col)
+
+
+# %%
+out_path = file_paths[0].rsplit("/", 1)[0]
+df_mace = df_mace.round(4)
+df_mace[~bad_mask].select_dtypes("number").to_csv(f"{out_path}.csv.gz")
+df_mace.reset_index().to_json(f"{out_path}.json.gz", default_handler=as_dict_handler)
+
+df_bad = df_mace[bad_mask].drop(columns=[entry_col, struct_col])
+df_bad[e_form_col] = df_wbm[e_form_col]
+df_bad.to_csv(f"{out_path}-bad.csv")
+
+# in_path = f"{module_dir}/2023-08-14-mace-wbm-IS2RE-FIRE"
+# df_mace = pd.read_csv(f"{in_path}.csv.gz").set_index("material_id")
+# df_mace = pd.read_json(f"{in_path}.json.gz").set_index("material_id")
diff --git a/models/mace/test_mace.py b/models/mace/test_mace.py
@@ -32,6 +32,10 @@
 job_name = f"mace-wbm-{task_type}-{ase_optimizer}"
 out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
 relax_cell = True
+# MACE trained on M3GNet training set by original MACE authors
+# model_name = "2023-07-14-mace-ilyes-trained-MPF-2021-2-8-big-128-6"
+# MACE trained on CHGNet training set by Yuan Chiang
+model_name = "2023-08-14-mace-yuan-trained-mptrj-04"
 
 slurm_vars = slurm_submit(
     job_name=job_name,
@@ -62,6 +66,7 @@
 e_pred_col = "mace_energy"
 max_steps = 500
 force_max = 0.05  # Run until the forces are smaller than this in eV/A
+checkpoint = f"{ROOT}/models/mace/{model_name}.model"
 
 df_in: pd.DataFrame = np.array_split(
     pd.read_json(data_path).set_index("material_id"), slurm_array_task_count
@@ -70,6 +75,7 @@
 run_params = dict(
     data_path=data_path,
     versions={dep: version(dep) for dep in ("mace", "numpy", "torch")},
+    checkpoint=checkpoint,
     task_type=task_type,
     df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
@@ -84,8 +90,6 @@
 
 
 # %%
-checkpoint = f"{ROOT}/models/mace/2023-07-14-mace-universal-2-big-128-6.model"
-# load MACE model pre-trained on M3GNet training set by original MACE authors
 mace_calc = MACECalculator(checkpoint, device="cuda", default_dtype="float32")
 relax_results: dict[str, dict[str, Any]] = {}
 input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
@@ -131,7 +135,7 @@
             "mace_trajectory": mace_traj,  # Add the trajectory to the results
         }
     except Exception as exc:
-        print(f"Failed to relax {material_id}: {exc}")
+        print(f"Failed to relax {material_id}: {exc!r}")
         continue
 
 
diff --git a/models/voronoi/join_voronoi_features.py b/models/voronoi/join_voronoi_features.py
@@ -39,5 +39,5 @@
 
 
 # %%
-out_path = f"{module_dir}/{glob_pattern.split('-*')[0]}.csv.bz2"
-df_features.to_csv(out_path)
+out_path = file_paths[0].rsplit("/", 1)[0]
+df_features.to_csv(f"{out_path}.csv.bz2")
diff --git a/site/src/lib/References.svelte b/site/src/lib/References.svelte
@@ -5,6 +5,8 @@
   export let references: Reference[]
   export let ref_selector: string = `a.ref[href^='#']`
   export let found_on_page: Reference[] = references
+  export let n_authors: number = 1
+  export let first_name_mode: 'initial' | 'full' | 'none' = `none`
 
   function filter_refs() {
     const ref_links = document.querySelectorAll<HTMLAnchorElement>(ref_selector)
@@ -20,10 +22,24 @@
       <li>
         <strong {id}>{title}</strong>
         <span>
-          {@html author.map((a) => `${a.given} ${a.family}`).join(`, &thinsp; `)}
+          {@html author
+            .slice(0, n_authors)
+            .map((auth) => {
+              const { given, family } = auth // given may be missing on an author entry
+              const first_name = {
+                initial: given ? `${given[0]}. ` : ``,
+                full: given ? `${given} ` : ``,
+                none: ``,
+              }[first_name_mode]
+              return `${first_name ?? ``}${family}`
+            })
+            .join(`,&thinsp; `)}
+          {#if author.length > n_authors}
+            <em>et al.</em>
+          {/if}
         </span>
-        &mdash;
         <small>
+          &mdash;
           {#if DOI}
             <a href="https://doi.org/{DOI}">{DOI}</a>
           {:else if href}
diff --git a/site/src/routes/preprint/references.yaml b/site/src/routes/preprint/references.yaml
@@ -1178,6 +1178,48 @@ references:
   URL: https://link.aps.org/doi/10.1103/PhysRevB.99.014104
   volume: '99'
 
+- id: draxl_nomad_2018
+  abstract: >-
+    , Data are a crucial raw material of this century. The amount of data that
+    have been created in materials science thus far and that continues to be
+    created every day is immense. Without a proper infrastructure that allows
+    for collecting and sharing data, the envisioned success of big data-driven
+    materials science will be hampered. For the field of computational materials
+    science, the NOMAD (Novel Materials Discovery) Center of Excellence (CoE)
+    has changed the scientific culture toward comprehensive and findable,
+    accessible, interoperable, and reusable (FAIR) data, opening new avenues for
+    mining materials science big data. Novel data-analytics concepts and tools
+    turn data into knowledge and help in the prediction of new materials and in
+    the identification of new properties of already known materials.
+  accessed:
+    - year: 2020
+      month: 7
+      day: 29
+  author:
+    - family: Draxl
+      given: Claudia
+    - family: Scheffler
+      given: Matthias
+  citation-key: draxl_nomad_2018
+  container-title: MRS Bulletin
+  DOI: 10.1557/mrs.2018.208
+  ISSN: 0883-7694, 1938-1425
+  issue: '9'
+  issued:
+    - year: 2018
+      month: 9
+  language: en
+  note: 'ZSCC: 0000084'
+  page: 676-682
+  publisher: Cambridge University Press
+  source: Cambridge University Press
+  title: 'NOMAD: The FAIR concept for big data-driven materials science'
+  title-short: NOMAD
+  type: article-journal
+  URL: >-
+    https://www.cambridge.org/core/journals/mrs-bulletin/article/nomad-the-fair-concept-for-big-datadriven-materials-science/1EEF321F62D41997CA16AD367B74C4B0
+  volume: '43'
+
 - id: dunn_benchmarking_2020
   abstract: >-
     We present a benchmark test suite and an automated machine learning

Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@`
`104`	`104`	`e_pred_col: relax_result["trajectory"].energies[-1],`
`105`	`105`	`}`
`106`	`106`	`except Exception as exc:`
`107`		`- print(f"Failed to relax {material_id}: {exc}")`
	`107`	`+ print(f"Failed to relax {material_id}: {exc!r}")`
`108`	`108`
`109`	`109`
`110`	`110`	`# %%`
Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,7 @@`
`112`	`112`	`e_pred_col: relax_result["trajectory"].energies[-1],`
`113`	`113`	`}`
`114`	`114`	`except Exception as exc:`
`115`		`- print(f"Failed to relax {material_id}: {exc}")`
	`115`	`+ print(f"Failed to relax {material_id}: {exc!r}")`
`116`	`116`
`117`	`117`
`118`	`118`	`# %%`