refactor data loading in model test scripts

janosh · janosh · commit d564ade5359d · 2023-06-19T20:29:24.000-07:00
diff --git a/models/bowsr/test_bowsr.py b/models/bowsr/test_bowsr.py
@@ -12,7 +12,6 @@
 from maml.apps.bowsr.model.megnet import MEGNet
 from maml.apps.bowsr.optimizer import BayesianOptimizer
 from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
 from tqdm import tqdm
 
 from matbench_discovery import DEBUG, timestamp, today
@@ -39,7 +38,11 @@
 job_name = f"bowsr-{energy_model}-wbm-{task_type}{'-debug' if DEBUG else ''}"
 out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
 
-data_path = DATA_FILES.wbm_initial_structures
+data_path = {
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+}[task_type]
+
 
 slurm_vars = slurm_submit(
     job_name=job_name,
@@ -73,7 +76,7 @@
 # %%
 df_wbm = pd.read_json(data_path).set_index("material_id")
 
-df_this_job: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[
+df_in: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[
     slurm_array_task_id - 1
 ]
 
@@ -90,7 +93,7 @@
 run_params = dict(
     bayes_optim_kwargs=bayes_optim_kwargs,
     data_path=data_path,
-    df=dict(shape=str(df_this_job.shape), columns=", ".join(df_this_job)),
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     energy_model=energy_model,
     maml_version=version("maml"),
     energy_model_version=version(energy_model),
@@ -106,16 +109,12 @@
 # %%
 model = MEGNet()
 relax_results: dict[str, dict[str, Any]] = {}
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
-if task_type == "IS2RE":
-    structures = df_this_job.initial_structure.map(Structure.from_dict).to_dict()
-elif task_type == "RS2RE":
-    structures = df_this_job.cse.map(
-        lambda x: ComputedStructureEntry.from_dict(x).structure
-    ).to_dict()
-else:
-    raise ValueError(f"Unknown {task_type = }")
+if task_type == "RS2RE":
+    df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
 
+structures = df_in[input_col].map(Structure.from_dict).to_dict()
 
 for material_id in tqdm(structures, desc="Main loop", disable=None):
     structure = structures[material_id]
diff --git a/models/cgcnn/test_cgcnn.py b/models/cgcnn/test_cgcnn.py
@@ -14,7 +14,7 @@
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-from matbench_discovery import CHECKPOINT_DIR, DEBUG, WANDB_PATH, today
+from matbench_discovery import CHECKPOINT_DIR, DEBUG, ROOT, WANDB_PATH, today
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.plots import wandb_scatter
 from matbench_discovery.slurm import slurm_submit
@@ -45,19 +45,12 @@
 
 
 # %%
-if task_type == "IS2RE":
-    data_path = DATA_FILES.wbm_initial_structures
-    # or for debug
-    # data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json-1k-samples.bz2"
-    # created with:
-    # df = df.sample(1000)
-    # df.reset_index().to_json(data_path.replace(".json", "-1k-samples.json"))
-    input_col = "initial_structure"
-elif task_type == "RS2RE":
-    data_path = DATA_FILES.wbm_computed_structure_entries
-    input_col = "relaxed_structure"
-else:
-    raise ValueError(f"Unexpected {task_type=}")
+data_path = {  # keyed by task_type plus a "-debug" suffix when DEBUG is set
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+    "IS2RE-debug": f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json-1k-samples.bz2",
+}[task_type + ("-debug" if DEBUG else "")]
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
 df = pd.read_json(data_path).set_index("material_id")
 
diff --git a/models/chgnet/join_chgnet_results.py b/models/chgnet/join_chgnet_results.py
@@ -33,7 +33,7 @@
 # %%
 module_dir = os.path.dirname(__file__)
 task_type = "IS2RE"
-date = "2023-03-02"
+date = "2023-03-04"
 glob_pattern = f"{date}-chgnet-wbm-{task_type}*/*.json.gz"
 file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
 print(f"Found {len(file_paths):,} files for {glob_pattern = }")
@@ -128,7 +128,7 @@
 
 # %%
 ax = density_scatter(
-    df=df_chgnet, x="e_form_per_atom_chgnet", y="e_form_per_atom_chgnet_megnet"
+    df=df_chgnet, x="e_form_per_atom_chgnet_megnet", y="e_form_per_atom_chgnet"
 )
 
 
diff --git a/models/chgnet/test_chgnet.py b/models/chgnet/test_chgnet.py
@@ -19,7 +19,6 @@
 import wandb
 from chgnet.model import StructOptimizer
 from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
 from tqdm import tqdm
 
 from matbench_discovery import DEBUG, timestamp, today
@@ -69,7 +68,7 @@
 df_in = pd.read_json(data_path).set_index("material_id")
 e_pred_col = "chgnet_energy"
 
-df_this_job: pd.DataFrame = np.array_split(df_in, slurm_array_task_count)[
+df_in: pd.DataFrame = np.array_split(df_in, slurm_array_task_count)[
     slurm_array_task_id - 1
 ]
 
@@ -79,7 +78,7 @@
     numpy_version=version("numpy"),
     torch_version=version("torch"),
     task_type=task_type,
-    df=dict(shape=str(df_this_job.shape), columns=", ".join(df_this_job)),
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
 )
 
@@ -90,15 +89,12 @@
 # %%
 chgnet = StructOptimizer()  # load default pre-trained CHGNnet model
 relax_results: dict[str, dict[str, Any]] = {}
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
-if task_type == "IS2RE":
-    structures = df_this_job.initial_structure.map(Structure.from_dict).to_dict()
-elif task_type == "RS2RE":
-    df_this_job.cse = df_this_job.cse.map(ComputedStructureEntry.from_dict)
-    structures = df_this_job.cse.map(lambda x: x.structure).to_dict()
-else:
-    raise ValueError(f"Unknown {task_type = }")
+if task_type == "RS2RE":
+    df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
 
+structures = df_in[input_col].map(Structure.from_dict).to_dict()
 
 for material_id in tqdm(structures, disable=None):
     if material_id in relax_results:
diff --git a/models/m3gnet/test_m3gnet.py b/models/m3gnet/test_m3gnet.py
@@ -18,7 +18,6 @@
 import wandb
 from m3gnet.models import Relaxer
 from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
 from tqdm import tqdm
 
 from matbench_discovery import DEBUG, timestamp, today
@@ -61,12 +60,15 @@
 
 
 # %%
-data_path = DATA_FILES.wbm_computed_structure_entries_plus_init_structs
+data_path = {
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+}[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
 df_wbm = pd.read_json(data_path).set_index("material_id")
 
-df_this_job: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[
+df_in: pd.DataFrame = np.array_split(df_wbm, slurm_array_task_count)[
     slurm_array_task_id - 1
 ]
 
@@ -75,7 +77,7 @@
     m3gnet_version=version("m3gnet"),
     numpy_version=version("numpy"),
     task_type=task_type,
-    df=dict(shape=str(df_this_job.shape), columns=", ".join(df_this_job)),
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
 )
 
@@ -86,15 +88,12 @@
 # %%
 megnet = Relaxer()  # load default pre-trained M3GNet model
 relax_results: dict[str, dict[str, Any]] = {}
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
 
-if task_type == "IS2RE":
-    structures = df_this_job.initial_structure.map(Structure.from_dict).to_dict()
-elif task_type == "RS2RE":
-    df_this_job.cse = df_this_job.cse.map(ComputedStructureEntry.from_dict)
-    structures = df_this_job.cse.map(lambda x: x.structure).to_dict()
-else:
-    raise ValueError(f"Unknown {task_type = }")
+if task_type == "RS2RE":
+    df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
 
+structures = df_in[input_col].map(Structure.from_dict).to_dict()
 
 for material_id in tqdm(structures, disable=None):
     if material_id in relax_results:
diff --git a/models/megnet/test_megnet.py b/models/megnet/test_megnet.py
@@ -15,7 +15,6 @@
 import wandb
 from megnet.utils.models import load_model
 from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
 from sklearn.metrics import r2_score
 from tqdm import tqdm
 
@@ -50,13 +49,16 @@
 if os.path.isfile(out_path):
     raise SystemExit(f"{out_path = } already exists, exciting early")
 
-data_path = DATA_FILES.wbm_initial_structures
+data_path = {
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+}[task_type]
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
 e_form_col = "e_form_per_atom_mp2020_corrected"
 assert e_form_col in df_wbm, f"{e_form_col=} not in {list(df_wbm)=}"
 
-df_wbm_structs = pd.read_json(data_path).set_index("material_id")
+df_in = pd.read_json(data_path).set_index("material_id")
 megnet_mp_e_form = load_model(model_name := "Eform_MP_2019")
 
 
@@ -68,21 +70,20 @@
     model_name=model_name,
     task_type=task_type,
     target_col=e_form_col,
-    df=dict(shape=str(df_wbm_structs.shape), columns=", ".join(df_wbm_structs)),
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     slurm_vars=slurm_vars,
 )
 
 wandb.init(project="matbench-discovery", name=job_name, config=run_params)
 
 
 # %%
-if task_type == "IS2RE":
-    structures = df_wbm_structs.initial_structure.map(Structure.from_dict)
-elif task_type == "RS2RE":
-    df_wbm_structs.cse = df_wbm_structs.cse.map(ComputedStructureEntry.from_dict)
-    structures = df_wbm_structs.cse.map(lambda x: x.structure)
-else:
-    raise ValueError(f"Unknown {task_type = }")
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
+
+if task_type == "RS2RE":
+    df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
+
+structures = df_in[input_col].map(Structure.from_dict).to_dict()
 
 megnet_e_form_preds = {}
 for material_id in tqdm(structures, disable=None):
diff --git a/models/voronoi/voronoi_featurize_dataset.py b/models/voronoi/voronoi_featurize_dataset.py
@@ -25,10 +25,10 @@
 
 
 data_name = "mp"  # "mp"
-if data_name == "wbm":
-    data_path = DATA_FILES.wbm_initial_structures
-elif data_name == "mp":
-    data_path = DATA_FILES.mp_computed_structure_entries
+data_path = {
+    "wbm": DATA_FILES.wbm_initial_structures,
+    "mp": DATA_FILES.mp_computed_structure_entries,
+}[data_name]
 
 input_col = "initial_structure"
 # input_col = "relaxed_structure"
@@ -60,26 +60,24 @@
 
 print(f"{data_path=}")
 df = pd.read_json(data_path).set_index("material_id")
-df_this_job: pd.DataFrame = np.array_split(df, slurm_array_task_count)[
+df_in: pd.DataFrame = np.array_split(df, slurm_array_task_count)[
     slurm_array_task_id - 1
 ]
 
 if data_name == "mp":  # extract structure dicts from ComputedStructureEntry
-    struct_dicts = [x["structure"] for x in df_this_job.entry]
+    struct_dicts = [x["structure"] for x in df_in.entry]
 elif data_name == "wbm" and input_col == "relaxed_structure":
-    struct_dicts = [x["structure"] for x in df_this_job.computed_structure_entry]
+    struct_dicts = [x["structure"] for x in df_in.computed_structure_entry]
 elif data_name == "wbm" and input_col == "initial_structure":
-    struct_dicts = df_this_job.initial_structure
+    struct_dicts = df_in.initial_structure
 
-df_this_job[input_col] = [
-    Structure.from_dict(x) for x in tqdm(struct_dicts, disable=None)
-]
+df_in[input_col] = [Structure.from_dict(x) for x in tqdm(struct_dicts, disable=None)]
 
 
 # %%
 run_params = dict(
     data_path=data_path,
-    df=dict(shape=str(df_this_job.shape), columns=", ".join(df_this_job)),
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
     input_col=input_col,
     slurm_vars=slurm_vars,
     out_path=out_path,
@@ -94,9 +92,9 @@
 # > No electronegativity for Ne. Setting to NaN. This has no physical meaning, ...
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")
 
-df_features = featurizer.featurize_dataframe(
-    df_this_job, input_col, ignore_errors=True
-)[featurizer.feature_labels()].round(4)
+df_features = featurizer.featurize_dataframe(df_in, input_col, ignore_errors=True)[
+    featurizer.feature_labels()
+].round(4)
 
 
 # %%