fix join_chgnet_results.py by removing code to apply MP2020 corrections

janosh · janosh · commit 01658ad73e4c · 2023-06-19T20:29:24.000-07:00
reason: unlike M3GNet, which predicts raw DFT energies, CHGNet targets include MP2020 corrections. Hence we don't need to correct afterwards.

increase CHGNet max relax steps: 500 -> 2000

small train+test script refactor
diff --git a/matbench_discovery/metrics.py b/matbench_discovery/metrics.py
@@ -64,9 +64,9 @@ def stable_metrics(
         stability_threshold (float): Where to place stability threshold relative to
             convex hull in eV/atom, usually 0 or 0.1 eV. Defaults to 0.
 
-    Note: Could be replaced by sklearn.metrics.classification_report() which takes
-        binary labels. I.e. classification_report(true > 0, pred > 0, output_dict=True)
-        should give equivalent results.
+    Note: Should give equivalent classification metrics to sklearn.metrics.
+        classification_report(each_true > 0, each_pred > 0, output_dict=True) which
+        takes binary labels.
 
     Returns:
         dict[str, float]: dictionary of classification metrics with keys DAF, Precision,
diff --git a/models/bowsr/join_bowsr_results.py b/models/bowsr/join_bowsr_results.py
@@ -31,12 +31,9 @@
 for file_path in tqdm(file_paths):
     if file_path in dfs:
         continue
-    df = pd.read_json(file_path).set_index("material_id")
+    dfs[file_path] = pd.read_json(file_path).set_index("material_id")
 
-    dfs[file_path] = df
 
-
-# %%
 df_bowsr = pd.concat(dfs.values()).round(4)
 
 
diff --git a/models/bowsr/test_bowsr.py b/models/bowsr/test_bowsr.py
@@ -29,7 +29,7 @@
 
 task_type = "IS2RE"  # "RS2RE"
 module_dir = os.path.dirname(__file__)
-# set large job array size for fast testing/debugging
+# set large job array size for smaller data splits and faster testing/debugging
 slurm_array_task_count = 500
 # see https://stackoverflow.com/a/55431306 for how to change array throttling
 # post submission
@@ -95,9 +95,7 @@
     data_path=data_path,
     df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     energy_model=energy_model,
-    maml_version=version("maml"),
-    energy_model_version=version(energy_model),
-    numpy_version=version("numpy"),
+    **{f"{dep}_version": version(dep) for dep in ("maml", "numpy", energy_model)},
     optimize_kwargs=optimize_kwargs,
     task_type=task_type,
     slurm_vars=slurm_vars,
diff --git a/models/cgcnn/test_cgcnn.py b/models/cgcnn/test_cgcnn.py
@@ -84,9 +84,7 @@
 run_params = dict(
     data_path=data_path,
     df=dict(shape=str(df.shape), columns=", ".join(df)),
-    aviary_version=version("aviary"),
-    numpy_version=version("numpy"),
-    torch_version=version("torch"),
+    **{f"{dep}_version": version(dep) for dep in ("aviary", "numpy", "torch")},
     ensemble_size=len(runs),
     task_type=task_type,
     target_col=e_form_col,
diff --git a/models/cgcnn/train_cgcnn.py b/models/cgcnn/train_cgcnn.py
@@ -102,9 +102,7 @@
 run_params = dict(
     data_path=data_path,
     batch_size=batch_size,
-    aviary_version=version("aviary"),
-    numpy_version=version("numpy"),
-    torch_version=version("torch"),
+    **{f"{dep}_version": version(dep) for dep in ("aviary", "numpy", "torch")},
     train_df=dict(shape=str(train_data.df.shape), columns=", ".join(train_df)),
     test_df=dict(shape=str(test_data.df.shape), columns=", ".join(test_df)),
     slurm_vars=slurm_vars,
diff --git a/models/chgnet/2023-03-04-chgnet-wbm-IS2RE.csv b/models/chgnet/2023-03-04-chgnet-wbm-IS2RE.csv
diff --git a/models/chgnet/join_chgnet_results.py b/models/chgnet/join_chgnet_results.py
@@ -1,4 +1,4 @@
-"""Concatenate chgnet results from multiple data files generated by slurm job array
+"""Concatenate CHGNet results from multiple data files generated by slurm job array
 into single file.
 """
 
@@ -13,13 +13,11 @@
 import pandas as pd
 from megnet.utils.models import load_model
 from pymatgen.core import Structure
-from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
-from pymatgen.entries.computed_entries import ComputedStructureEntry
 from pymatviz import density_scatter
 from tqdm import tqdm
 
 from matbench_discovery import today
-from matbench_discovery.data import DATA_FILES, as_dict_handler
+from matbench_discovery.data import as_dict_handler
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.preds import df_wbm, e_form_col
 
@@ -32,7 +30,7 @@
 # %%
 module_dir = os.path.dirname(__file__)
 task_type = "IS2RE"
-date = "2023-03-04"
+date = "2023-03-06"
 glob_pattern = f"{date}-chgnet-wbm-{task_type}*/*.json.gz"
 file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
 print(f"Found {len(file_paths):,} files for {glob_pattern = }")
@@ -48,47 +46,23 @@
     # drop trajectory to save memory
     dfs[file_path] = df.drop(columns="chgnet_trajectory")
 
-
-# %%
 df_chgnet = pd.concat(dfs.values()).round(4)
 
 
-# %%
-df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index(
-    "material_id"
-)
-
-df_cse["cse"] = [
-    ComputedStructureEntry.from_dict(x) for x in tqdm(df_cse.computed_structure_entry)
-]
-
-
-# %% transfer CHGNet energies and relaxed structures WBM CSEs since MP2020 energy
-# corrections applied below are structure-dependent (for oxides and sulfides)
-cse: ComputedStructureEntry
-for row in tqdm(df_chgnet.itertuples(), total=len(df_chgnet)):
-    mat_id, struct_dict, chgnet_energy, *_ = row
-    chgnet_struct = Structure.from_dict(struct_dict)
-    cse = df_cse.loc[mat_id, "cse"]
-    cse._energy = chgnet_energy  # cse._energy is the uncorrected energy
-    cse._structure = chgnet_struct
-    df_chgnet.loc[mat_id, "cse"] = cse
-
-
-# %% apply energy corrections to CSEs with CHGNet
-out = MaterialsProject2020Compatibility().process_entries(
-    df_chgnet.cse, verbose=True, clean=True
-)
-assert len(out) == len(df_chgnet)
-
-
 # %% compute corrected formation energies
 e_form_chgnet_col = "e_form_per_atom_chgnet"
-df_chgnet[e_form_chgnet_col] = [get_e_form_per_atom(cse) for cse in tqdm(df_chgnet.cse)]
+df_chgnet["formula"] = df_wbm.formula
+df_chgnet[e_form_chgnet_col] = [
+    get_e_form_per_atom(dict(energy=ene, composition=formula))
+    for formula, ene in tqdm(
+        df_chgnet.set_index("formula").chgnet_energy.items(), total=len(df_chgnet)
+    )
+]
+df_wbm[e_form_chgnet_col] = df_chgnet[e_form_chgnet_col]
 
 
 # %%
-ax = density_scatter(x=df_wbm[e_form_col], y=df_chgnet[e_form_chgnet_col])
+ax = density_scatter(x=df_wbm[e_form_col], y=df_wbm[e_form_chgnet_col])
 
 
 # %% load 2019 MEGNet formation energy model
@@ -97,11 +71,14 @@
 
 
 # %% predict formation energies on chgnet relaxed structure with MEGNet
-for material_id, cse in tqdm(df_cse.cse.items(), total=len(df_cse)):
+for material_id, struct in tqdm(
+    df_chgnet.chgnet_structure.items(), total=len(df_chgnet)
+):
     if material_id in megnet_e_form_preds:
         continue
     try:
-        struct = cse.structure
+        if isinstance(struct, dict):
+            struct = Structure.from_dict(struct)
         [e_form_per_atom] = megnet_mp_e_form.predict_structure(struct)
         megnet_e_form_preds[material_id] = e_form_per_atom
     except Exception as exc:
@@ -118,7 +95,7 @@
 
 assert (
     n_isna := df_chgnet.e_form_per_atom_chgnet_megnet.isna().sum()
-) < 10, f"{n_isna=}, expected 7 or similar"
+) < 10, f"too many missing MEGNet preds: {n_isna}"
 
 
 # %%
@@ -133,6 +110,6 @@
 
 df_chgnet.select_dtypes("number").to_csv(out_path.replace(".json.gz", ".csv"))
 
-# in_path = f"{module_dir}/2022-10-31-chgnet-wbm-IS2RE.json.gz"
-# df_chgnet_csv = pd.read_csv(in_path.replace(".json.gz", ".csv"))
+# in_path = f"{module_dir}/2023-03-04-chgnet-wbm-IS2RE.json.gz"
+# df_chgnet = pd.read_csv(in_path.replace(".json.gz", ".csv")).set_index("material_id")
 # df_chgnet = pd.read_json(in_path).set_index("material_id")
diff --git a/models/chgnet/metadata.yml b/models/chgnet/metadata.yml
@@ -33,8 +33,12 @@ requirements:
   numpy: 1.24.0
 trained_on_benchmark: false
 
+hyperparams:
+  max_steps: 2000
+
 notes:
   description: |
     The Crystal Hamiltonian Graph Neural Network (CHGNet) is a universal GNN-based interatomic potential trained on energies, forces, stresses and magnetic moments from the MP trajectory dataset containing ∼1.5 million inorganic structures.
     ![CHGNet Pipeline](https://user-images.githubusercontent.com/30958850/222924937-1d09bbce-ee18-4b19-8061-ec689cd15887.svg)
   training: Using pre-trained model with 400,438 params released with preprint. Training set unreleased at time of writing.
+  corrections: Unlike e.g. M3GNet which predicts raw DFT energies, CHGNet targets include MP2020 corrections. Hence no need to correct again.
diff --git a/models/chgnet/test_chgnet.py b/models/chgnet/test_chgnet.py
@@ -10,7 +10,6 @@
 from __future__ import annotations
 
 import os
-import warnings
 from importlib.metadata import version
 from typing import Any
 
@@ -31,7 +30,7 @@
 
 task_type = "IS2RE"  # "RS2RE"
 module_dir = os.path.dirname(__file__)
-# set large job array size for fast testing/debugging
+# set large job array size for smaller data splits and faster testing/debugging
 slurm_array_task_count = 100
 job_name = f"chgnet-wbm-{task_type}{'-debug' if DEBUG else ''}"
 out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
@@ -41,8 +40,8 @@
     out_dir=out_dir,
     partition="ampere",
     account="LEE-SL3-GPU",
-    time="3:0:0",
-    # array=f"1-{slurm_array_task_count}",
+    time="6:0:0",
+    array=f"1-{slurm_array_task_count}",
     slurm_flags="--nodes 1 --gpus-per-node 1",
 )
 
@@ -54,9 +53,6 @@
 if os.path.isfile(out_path):
     raise SystemExit(f"{out_path = } already exists, exciting early")
 
-warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")
-warnings.filterwarnings(action="ignore", category=UserWarning, module="tensorflow")
-
 
 # %%
 data_path = {
@@ -67,19 +63,19 @@
 print(f"{data_path=}")
 df_in = pd.read_json(data_path).set_index("material_id")
 e_pred_col = "chgnet_energy"
+max_steps = 2000
 
 df_in: pd.DataFrame = np.array_split(df_in, slurm_array_task_count)[
     slurm_array_task_id - 1
 ]
 
 run_params = dict(
     data_path=data_path,
-    chgnet_version=version("chgnet"),
-    numpy_version=version("numpy"),
-    torch_version=version("torch"),
+    **{f"{dep}_version": version(dep) for dep in ("chgnet", "numpy", "torch")},
     task_type=task_type,
     df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
+    max_steps=max_steps,
 )
 
 run_name = f"{job_name}-{slurm_array_task_id}"
@@ -100,7 +96,9 @@
     if material_id in relax_results:
         continue
     try:
-        relax_result = chgnet.relax(structures[material_id], verbose=False)
+        relax_result = chgnet.relax(
+            structures[material_id], verbose=False, steps=max_steps
+        )
     except Exception as error:
         print(f"Failed to relax {material_id}: {error}")
         continue
diff --git a/models/m3gnet/join_m3gnet_results.py b/models/m3gnet/join_m3gnet_results.py
@@ -45,14 +45,9 @@
     if file_path in dfs:
         continue
     df = pd.read_json(file_path).set_index("material_id")
-    df[f"m3gnet_energy_{task_type}"] = [
-        x["energies"][-1][0] for x in df.m3gnet_trajectory
-    ]
     # drop trajectory to save memory
     dfs[file_path] = df.drop(columns="m3gnet_trajectory")
 
-
-# %%
 df_m3gnet = pd.concat(dfs.values()).round(4)
 
 
@@ -130,7 +125,7 @@
 
 assert (
     n_isna := df_m3gnet.e_form_per_atom_m3gnet_megnet.isna().sum()
-) < 10, f"{n_isna=}, expected 7 or similar"
+) < 10, f"too many missing MEGNet preds: {n_isna}"
 
 
 # %%
diff --git a/models/m3gnet/test_m3gnet.py b/models/m3gnet/test_m3gnet.py
@@ -29,7 +29,7 @@
 
 task_type = "IS2RE"  # "RS2RE"
 module_dir = os.path.dirname(__file__)
-# set large job array size for fast testing/debugging
+# set large job array size for smaller data splits and faster testing/debugging
 slurm_array_task_count = 100
 job_name = f"m3gnet-wbm-{task_type}{'-debug' if DEBUG else ''}"
 out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
@@ -75,8 +75,7 @@
 
 run_params = dict(
     data_path=data_path,
-    m3gnet_version=version("m3gnet"),
-    numpy_version=version("numpy"),
+    **{f"{dep}_version": version(dep) for dep in ("m3gnet", "numpy")},
     task_type=task_type,
     df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
diff --git a/models/megnet/test_megnet.py b/models/megnet/test_megnet.py
@@ -65,8 +65,7 @@
 # %%
 run_params = dict(
     data_path=data_path,
-    megnet_version=version("megnet"),
-    numpy_version=version("numpy"),
+    **{f"{dep}_version": version(dep) for dep in ("megnet", "numpy")},
     model_name=model_name,
     task_type=task_type,
     target_col=e_form_col,
diff --git a/models/voronoi/join_voronoi_features.py b/models/voronoi/join_voronoi_features.py
@@ -30,11 +30,8 @@
 for file_path in tqdm(file_paths):
     if file_path in dfs:
         continue
-    df = pd.read_csv(file_path).set_index("material_id")
-    dfs[file_path] = df
+    dfs[file_path] = pd.read_csv(file_path).set_index("material_id")
 
-
-# %%
 df_features = pd.concat(dfs.values()).round(4)
 
 ax = df_features.isna().sum().value_counts().T.plot.bar()
diff --git a/models/voronoi/train_test_voronoi_rf.py b/models/voronoi/train_test_voronoi_rf.py
@@ -72,9 +72,7 @@
     train_path=train_path,
     test_path=test_path,
     mp_energies_path=DATA_FILES.mp_energies,
-    scikit_learn_version=version("scikit-learn"),
-    matminer_version=version("matminer"),
-    numpy_version=version("numpy"),
+    **{f"{dep}_version": version(dep) for dep in ("scikit-learn", "matminer", "numpy")},
     model_name=model_name,
     train_target_col=train_e_form_col,
     test_target_col=test_e_form_col,
diff --git a/models/voronoi/voronoi_featurize_dataset.py b/models/voronoi/voronoi_featurize_dataset.py
@@ -81,8 +81,7 @@
     input_col=input_col,
     slurm_vars=slurm_vars,
     out_path=out_path,
-    matminer_version=version("matminer"),
-    numpy_version=version("numpy"),
+    **{f"{dep}_version": version(dep) for dep in ("matminer", "numpy")},
 )
 
 wandb.init(project="matbench-discovery", name=run_name, config=run_params)
diff --git a/models/wrenformer/test_wrenformer.py b/models/wrenformer/test_wrenformer.py
@@ -73,9 +73,7 @@
 run_params = dict(
     data_path=data_path,
     df=dict(shape=str(df.shape), columns=", ".join(df)),
-    aviary_version=version("aviary"),
-    numpy_version=version("numpy"),
-    torch_version=version("torch"),
+    **{f"{dep}_version": version(dep) for dep in ("aviary", "numpy", "torch")},
     ensemble_size=len(runs),
     task_type=task_type,
     target_col=e_form_col,
diff --git a/models/wrenformer/train_wrenformer.py b/models/wrenformer/train_wrenformer.py
@@ -59,9 +59,7 @@
 
 run_params = dict(
     data_path=data_path,
-    aviary_version=version("aviary"),
-    numpy_version=version("numpy"),
-    torch_version=version("torch"),
+    **{f"{dep}_version": version(dep) for dep in ("aviary", "numpy", "torch")},
     batch_size=batch_size,
     train_df=dict(shape=train_df.shape, columns=", ".join(train_df)),
     test_df=dict(shape=test_df.shape, columns=", ".join(test_df)),
diff --git a/scripts/compile_metrics.py b/scripts/compile_metrics.py
diff --git a/site/src/figs/metrics-table.svelte b/site/src/figs/metrics-table.svelte
diff --git a/site/src/routes/paper/+page.md b/site/src/routes/paper/+page.md
diff --git a/site/src/routes/paper/iclr-ml4mat/+page.md b/site/src/routes/paper/iclr-ml4mat/+page.md
diff --git a/site/src/routes/si/+page.md b/site/src/routes/si/+page.md

Original file line number	Diff line number	Diff line change
`@@ -81,8 +81,7 @@`
`81`	`81`	`input_col=input_col,`
`82`	`82`	`slurm_vars=slurm_vars,`
`83`	`83`	`out_path=out_path,`
`84`		`- matminer_version=version("matminer"),`
`85`		`- numpy_version=version("numpy"),`
	`84`	`+ **{f"{dep}_version": version(dep) for dep in ("matminer", "numpy")},`
`86`	`85`	`)`
`87`	`86`
`88`	`87`	`wandb.init(project="matbench-discovery", name=run_name, config=run_params)`