compare_cse_vs_ce_mp_2020_corrections.py code for materialsproject/pymatgen#2730

janosh · janosh · commit 6101995c0eb8 · 2023-06-19T20:29:21.000-07:00
diff --git a/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py b/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
@@ -1,3 +1,6 @@
+# %%
+import gzip
+import json
 import warnings
 from datetime import datetime
 
@@ -12,6 +15,7 @@
 from matbench_discovery import ROOT
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.plot_scripts import df_wbm
+from matbench_discovery.plots import plt
 
 """
 NOTE MaterialsProject2020Compatibility takes structural information into account when
@@ -47,8 +51,10 @@
     get_e_form_per_atom(entry) for entry in tqdm(cses)
 ]
 
-df_wbm["mp2020_cse_correction"] = [cse.correction for cse in tqdm(cses)]
-df_wbm["mp2020_ce_correction"] = [ce.correction for ce in tqdm(ces)]
+df_wbm["mp2020_cse_correction_per_atom"] = [
+    cse.correction_per_atom for cse in tqdm(cses)
+]
+df_wbm["mp2020_ce_correction_per_atom"] = [ce.correction_per_atom for ce in tqdm(ces)]
 
 
 # %%
@@ -81,21 +87,25 @@
 
 
 # %%
+ax = plt.gca()
 for key, df_anion in df_ce_ne_cse.groupby("anion"):
     ax = df_anion.plot.scatter(
-        ax=locals().get("ax"),
-        x="mp2020_cse_correction",
-        y="mp2020_ce_correction",
+        ax=ax,
+        x="mp2020_cse_correction_per_atom",
+        y="mp2020_ce_correction_per_atom",
         label=f"{key} ({len(df_anion):,})",
         color=dict(oxide="orange", sulfide="teal").get(key, "blue"),
-        title=f"Outliers in formation energy from CSE vs CE ({len(df_ce_ne_cse):,}"
-        f" / {len(df_wbm):,} = {len(df_ce_ne_cse) / len(df_wbm):.1%})",
+        title=f"CSE vs CE corrections for ({len(df_ce_ne_cse):,} / {len(df_wbm):,} = "
+        f"{len(df_ce_ne_cse) / len(df_wbm):.1%})\n outliers of largest difference",
     )
 
 ax.axline((0, 0), slope=1, color="gray", linestyle="dashed", zorder=-1)
 
+# ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-corrections-outliers.pdf")
+
 
 # %%
+ax = plt.gca()
 for key, df_anion in df_ce_ne_cse.groupby("anion"):
     ax = df_anion.plot.scatter(
-        ax=locals().get("ax"),
+        ax=ax,  # axes from plt.gca() above; locals() lookup no longer needed
@@ -113,3 +123,42 @@
 # different formation energies are oxides or sulfides for which MP 2020 compat takes
 # into account structural information to make more accurate corrections.
 # ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-outliers.pdf")
+
+
+# %% the code below led to the issue filed at
+# https://github.com/materialsproject/pymatgen/issues/2730
+wbm_step_2_34803 = (
+    df_ce_ne_cse.e_form_per_atom_mp2020_from_cse
+    - df_ce_ne_cse.e_form_per_atom_mp2020_from_ce
+).idxmax()
+idx = df_wbm.index.get_loc(wbm_step_2_34803)
+# the legacy and CE variants are all rebuilt from the JSON round trip below
+cse_mp2020 = cses[idx].copy()
+
+
+with gzip.open(f"{ROOT}/tmp/cse-wbm-step-2-34803.json.zip", "w") as f:
+    f.write(cse_mp2020.to_json().encode("utf-8"))
+
+with gzip.open(f"{ROOT}/tmp/cse-wbm-step-2-34803.json.zip") as f:
+    cse = ComputedStructureEntry.from_dict(json.load(f))
+
+cse_mp2020 = cse.copy()
+cse_legacy = cse.copy()
+ce_mp2020 = ComputedEntry.from_dict(cse.to_dict())
+ce_legacy = ce_mp2020.copy()
+
+
+MaterialsProject2020Compatibility().process_entry(cse_mp2020)
+MaterialsProject2020Compatibility().process_entry(ce_mp2020)
+MaterialsProjectCompatibility().process_entry(cse_legacy)
+MaterialsProjectCompatibility().process_entry(ce_legacy)
+
+print(f"{cse_mp2020.correction=:.4}")
+print(f"{ce_mp2020.correction=:.4}")
+print(f"{cse_legacy.correction=:.4}")
+print(f"{ce_legacy.correction=:.4}")
+
+print(f"{cse_mp2020.energy_adjustments=}\n")
+print(f"{ce_mp2020.energy_adjustments=}\n")
+print(f"{cse_legacy.energy_adjustments=}\n")
+print(f"{ce_legacy.energy_adjustments=}\n")
diff --git a/models/bowsr/slurm_array_bowsr_wbm.py b/models/bowsr/slurm_array_bowsr_wbm.py
@@ -27,19 +27,21 @@
 """
 
 task_type = "IS2RE"  # "RS2RE"
-today = f"{datetime.now():%Y-%m-%d}"
 module_dir = os.path.dirname(__file__)
 # --mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s)
 #     Some of your processes may have been killed by the cgroup out-of-memory handler.
 slurm_mem_per_node = 12000
 # set large job array size for fast testing/debugging
 slurm_array_task_count = 500
-out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}"
+timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
+today = timestamp.split("@")[0]  # date part (YYYY-MM-DD) of timestamp
+job_name = f"bowsr-megnet-wbm-{task_type}"  # slurm job name + wandb artifact type
+out_dir = f"{module_dir}/{today}-{job_name}"
 
 data_path = f"{ROOT}/data/2022-10-19-wbm-init-structs.json.gz"
 
 slurm_submit_python(
-    job_name=f"bowsr-megnet-wbm-{task_type}",
+    job_name=job_name,
     log_dir=out_dir,
     partition="icelake-himem",
     account="LEE-SL3-CPU",
@@ -57,7 +59,6 @@
 slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
 slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
 out_path = f"{out_dir}/{slurm_array_task_id}.json.gz"
-timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
 
 print(f"Job started running {timestamp}")
 print(f"{slurm_job_id = }")
@@ -164,4 +165,4 @@
 
 df_output.reset_index().to_json(out_path, default_handler=as_dict_handler)
 
-wandb.log_artifact(out_path, type=f"bowsr-megnet-wbm-{task_type}")
+wandb.log_artifact(out_path, type=job_name)
diff --git a/models/cgcnn/slurm_train_cgcnn_ensemble.py b/models/cgcnn/slurm_train_cgcnn_ensemble.py
@@ -25,11 +25,12 @@
 # %%
 epochs = 300
 target_col = "formation_energy_per_atom"
-run_name = f"cgcnn-robust-{epochs=}-{target_col}"
+run_name = f"cgcnn-robust-{target_col}-{epochs=}"
 print(f"{run_name=}")
 robust = "robust" in run_name.lower()
 n_folds = 10
-today = f"{datetime.now():%Y-%m-%d}"
+timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
+today = timestamp.split("@")[0]  # date part (YYYY-MM-DD) of timestamp
 log_dir = f"{os.path.dirname(__file__)}/{today}-{run_name}"
 
 slurm_submit_python(
@@ -60,7 +61,7 @@
 df["structure"] = [Structure.from_dict(s) for s in tqdm(df.structure, disable=None)]
 assert target_col in df
 
-train_df, test_df = df_train_test_split(df, test_size=0.5)
+train_df, test_df = df_train_test_split(df, test_size=0.05)
 
 train_data = CrystalGraphData(train_df, task_dict={target_col: task_type})
 train_loader = DataLoader(
@@ -85,14 +86,14 @@
 model = CrystalGraphConvNet(**model_params)
 
 run_params = dict(
+    data_path=data_path,
     batch_size=batch_size,
     train_df=dict(shape=train_data.df.shape, columns=", ".join(train_df)),
     test_df=dict(shape=test_data.df.shape, columns=", ".join(test_df)),
 )
 
 
 # %%
-timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
 print(f"Job started running {timestamp}")
 
 train_model(
diff --git a/models/m3gnet/slurm_array_m3gnet_wbm.py b/models/m3gnet/slurm_array_m3gnet_wbm.py
@@ -26,7 +26,8 @@
 __date__ = "2022-08-15"
 
 task_type = "IS2RE"  # "RS2RE"
-today = f"{datetime.now():%Y-%m-%d}"
+timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
+today = timestamp.split("@")[0]  # date part (YYYY-MM-DD) of timestamp
 module_dir = os.path.dirname(__file__)
 # set large job array size for fast testing/debugging
 slurm_array_task_count = 100
@@ -51,7 +52,6 @@
 # %%
 slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
 slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
-timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
 
 print(f"Job started running {timestamp}")
 print(f"{slurm_job_id = }")
diff --git a/models/wrenformer/slurm_train_wrenformer_ensemble.py b/models/wrenformer/slurm_train_wrenformer_ensemble.py
@@ -18,10 +18,15 @@
 
 # %%
 epochs = 300
-target_col = "e_form"
-run_name = f"wrenformer-robust-mp+wbm-{epochs=}-{target_col}"
+data_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
+target_col = "formation_energy_per_atom"
+# data_path = f"{ROOT}/data/2022-08-25-m3gnet-trainset-mp-2021-struct-energy.json.gz"
+# target_col = "mp_energy_per_atom"
+data_name = "m3gnet-trainset" if "m3gnet" in data_path else "mp"
+run_name = f"wrenformer-robust-{data_name}-{target_col}-{epochs=}"
 n_folds = 10
-today = f"{datetime.now():%Y-%m-%d}"
+timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
+today = timestamp.split("@")[0]  # date part (YYYY-MM-DD) of timestamp
 dataset = "mp"
 log_dir = f"{os.path.dirname(__file__)}/{dataset}/{today}-{run_name}"
 
@@ -38,13 +43,8 @@
 
 # %%
 learning_rate = 3e-4
-data_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
-target_col = "energy_per_atom"
-# data_path = f"{ROOT}/data/2022-08-25-m3gnet-trainset-mp-2021-struct-energy.json.gz"
-# target_col = "mp_energy_per_atom"
 batch_size = 128
 slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
-timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
 input_col = "wyckoff_spglib"
 
 print(f"Job started running {timestamp}")
@@ -54,9 +54,10 @@
 df = pd.read_json(data_path).set_index("material_id", drop=False)
 assert target_col in df, f"{target_col=} not in {list(df)}"
 assert input_col in df, f"{input_col=} not in {list(df)}"
-train_df, test_df = df_train_test_split(df, test_size=0.3)
+train_df, test_df = df_train_test_split(df, test_size=0.05)
 
 run_params = dict(
+    data_path=data_path,
     batch_size=batch_size,
     train_df=dict(shape=train_df.shape, columns=", ".join(train_df)),
     test_df=dict(shape=test_df.shape, columns=", ".join(test_df)),