add structure perturbation to train_cgcnn.py

janosh · janosh · commit 6dd43982ac3c · 2023-06-19T20:29:22.000-07:00
including new matbench_discovery/structure.py module with tests in tests/test_structure.py
diff --git a/matbench_discovery/structure.py b/matbench_discovery/structure.py
@@ -0,0 +1,37 @@
+import numpy as np
+from pymatgen.core import Structure
+
+__author__ = "Janosh Riebesell"
+__date__ = "2022-12-02"
+
+np.random.seed(0)  # ensure reproducible structure perturbations
+
+
+def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
+    """Perturb the atomic coordinates of a pymatgen structure.
+
+    Each site is displaced along a random direction by a distance drawn from a
+    Weibull distribution, then wrapped back into the unit cell. The input structure
+    is copied first and left unmodified.
+
+    Args:
+        struct (Structure): pymatgen structure to be perturbed
+        gamma (float): Shape parameter of the Weibull distribution from which
+            perturbation magnitudes are drawn. Defaults to 1.5.
+
+    Returns:
+        Structure: Perturbed structure
+    """
+    perturbed = struct.copy()
+    for site in perturbed:
+        magnitude = np.random.weibull(gamma)
+        vec = np.random.randn(3)
+        norm = np.linalg.norm(vec)
+        while norm == 0:  # resample the (vanishingly rare) 0-vector to avoid NaNs
+            vec = np.random.randn(3)
+            norm = np.linalg.norm(vec)
+        vec /= norm  # unit vector
+        site.coords += vec * magnitude
+        site.to_unit_cell(in_place=True)
+
+    return perturbed
diff --git a/models/cgcnn/plot_structure_perturbation.py b/models/cgcnn/plot_structure_perturbation.py
@@ -0,0 +1,36 @@
+# %%
+import numpy as np
+import pandas as pd
+from pymatgen.core import Lattice, Structure
+from pymatviz import plot_structure_2d
+
+from matbench_discovery.plots import plt
+from matbench_discovery.structure import perturb_structure
+
+__author__ = "Janosh Riebesell"
+__date__ = "2022-12-02"
+
+
+# %% histogram of 100k Weibull(shape=1.5) perturbation magnitudes
+ax = pd.Series(np.random.weibull(1.5, 100000)).hist(bins=100)
+title = "Distribution of perturbation magnitudes"
+ax.set(xlabel="magnitude of perturbation", ylabel="count", title=title)
+
+
+# %% 2-site cubic Fe-O example structure, plotted unperturbed for reference
+struct = Structure(
+    lattice=Lattice.cubic(5),
+    species=("Fe", "O"),
+    coords=((0, 0, 0), (0.5, 0.5, 0.5)),
+)
+
+ax = plot_structure_2d(struct)
+ax.set(title=f"Original structure: {struct.formula}")
+ax.set_aspect("equal")
+
+
+# %% 3x4 grid of independently perturbed copies of the example structure
+fig, axs = plt.subplots(3, 4, figsize=(12, 10))
+for idx, ax in enumerate(axs.flat, 1):
+    plot_structure_2d(perturb_structure(struct), ax=ax)
+    ax.set(title=f"perturbation {idx}")
diff --git a/models/cgcnn/test_cgcnn.py b/models/cgcnn/test_cgcnn.py
@@ -23,12 +23,12 @@
 __date__ = "2022-08-15"
 
 """
-Script that downloads checkpoints for an ensemble of CGCNN models trained on all MP
+Download WandB checkpoints for an ensemble of CGCNN models trained on all MP
 formation energies, then makes predictions on some dataset, prints ensemble metrics and
 saves predictions to CSV.
 """
 
-task_type = "RS2RE"
+task_type = "IS2RE"
 debug = "slurm-submit" in sys.argv
 job_name = f"test-cgcnn-wbm-{task_type}{'-debug' if DEBUG else ''}"
 module_dir = os.path.dirname(__file__)
@@ -58,16 +58,15 @@
 
 target_col = "e_form_per_atom_mp2020_corrected"
 df[target_col] = df_wbm[target_col]
-assert target_col in df, f"{target_col=} not in {list(df)}"
 if task_type == "RS2RE":
     df[input_col] = [x["structure"] for x in df.computed_structure_entry]
 assert input_col in df, f"{input_col=} not in {list(df)}"
 
 df[input_col] = [Structure.from_dict(x) for x in tqdm(df[input_col], disable=None)]
 
 filters = {
-    "created_at": {"$gt": "2022-11-22", "$lt": "2022-11-23"},
-    "display_name": {"$regex": "^cgcnn-robust"},
+    "created_at": {"$gt": "2022-12-03", "$lt": "2022-12-04"},
+    "display_name": {"$regex": "^train-cgcnn-robust-augment=3-"},
 }
 runs = wandb.Api().runs("janosh/matbench-discovery", filters=filters)
 
@@ -92,19 +91,15 @@
     slurm_vars=slurm_vars,
 )
 
-
 wandb.init(project="matbench-discovery", name=job_name, config=run_params)
 
 cg_data = CrystalGraphData(
-    df,
-    task_dict={target_col: "regression"},
-    structure_col=input_col,
-    identifiers=["formula_from_cse"],
+    df, task_dict={target_col: "regression"}, structure_col=input_col
 )
 data_loader = DataLoader(
     cg_data, batch_size=1024, shuffle=False, collate_fn=collate_batch
 )
-df, ensemble_metrics = predict_from_wandb_checkpoints(
+df_preds, ensemble_metrics = predict_from_wandb_checkpoints(
     runs,
     # dropping isolated-atom structs means len(cg_data.df) < len(df)
     cache_dir=CHECKPOINT_DIR,
@@ -114,9 +109,10 @@
     data_loader=data_loader,
 )
 
-df.to_csv(f"{out_dir}/{job_name}-preds.csv", index=False)
+df_preds.to_csv(f"{out_dir}/{job_name}-preds.csv", index=False)
 pred_col = f"{target_col}_pred_ens"
-table = wandb.Table(dataframe=df[[target_col, pred_col]].reset_index())
+assert pred_col in df_preds, f"{pred_col=} not in {list(df_preds)}"
+table = wandb.Table(dataframe=df_preds[[target_col, pred_col]].reset_index())
 
 
 # %%
diff --git a/models/cgcnn/train_cgcnn.py b/models/cgcnn/train_cgcnn.py
@@ -8,10 +8,11 @@
 from aviary.train import df_train_test_split, train_model
 from pymatgen.core import Structure
 from torch.utils.data import DataLoader
-from tqdm import tqdm
+from tqdm import tqdm, trange
 
 from matbench_discovery import DEBUG, ROOT, timestamp, today
 from matbench_discovery.slurm import slurm_submit
+from matbench_discovery.structure import perturb_structure
 
 """
 Train a CGCNN ensemble on target_col of data_path.
@@ -24,7 +25,10 @@
 # %%
 epochs = 300
 target_col = "formation_energy_per_atom"
-job_name = f"train-cgcnn-robust-{target_col}{'-debug' if DEBUG else ''}"
+input_col = "structure"
+id_col = "material_id"
+augment = 3
+job_name = f"train-cgcnn-robust-{augment=}{'-debug' if DEBUG else ''}"
 print(f"{job_name=}")
 robust = "robust" in job_name.lower()
 ensemble_size = 10
@@ -35,7 +39,7 @@
     job_name=job_name,
     partition="ampere",
     account="LEE-SL3-GPU",
-    time="8:0:0",
+    time="12:0:0",
     array=f"1-{ensemble_size}",
     out_dir=out_dir,
     slurm_flags="--nodes 1 --gpus-per-node 1",
@@ -55,10 +59,18 @@
 data_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
 # data_path = f"{ROOT}/data/mp/2022-08-13-mp-energies-1k-samples.json.gz"
 print(f"{data_path=}")
-df = pd.read_json(data_path).set_index("material_id", drop=False)
-df["structure"] = [Structure.from_dict(s) for s in tqdm(df.structure, disable=None)]
+df = pd.read_json(data_path).set_index(id_col)
+df[input_col] = [Structure.from_dict(s) for s in tqdm(df[input_col], disable=None)]
 assert target_col in df
 
+df_aug = df.copy()
+structs = df_aug.pop(input_col)  # unperturbed structures, reused in every round
+for idx in trange(augment, desc="Augmenting"):
+    df_aug[input_col] = [perturb_structure(x) for x in structs]
+    df = pd.concat([df, df_aug.set_index(f"{x}-aug={idx+1}" for x in df_aug.index)])
+
+del df_aug  # augmented rows now live in df
+
 train_df, test_df = df_train_test_split(df, test_size=0.05)
 
 print(f"{train_df.shape=}")
@@ -91,6 +103,8 @@
     train_df=dict(shape=str(train_data.df.shape), columns=", ".join(train_df)),
     test_df=dict(shape=str(test_data.df.shape), columns=", ".join(test_df)),
     slurm_vars=slurm_vars,
+    augment=augment,
+    input_col=input_col,
 )
 
 
@@ -108,9 +122,9 @@
     swa_start=swa_start,
     target_col=target_col,
     task_type=task_type,
+    train_loader=train_loader,
     test_loader=test_loader,
     timestamp=timestamp,
-    train_loader=train_loader,
     wandb_path="janosh/matbench-discovery",
     run_params=run_params,
 )
diff --git a/models/wrenformer/test_wrenformer.py b/models/wrenformer/test_wrenformer.py
@@ -27,7 +27,7 @@
 task_type = "IS2RE"
 data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-summary.csv"
 debug = "slurm-submit" in sys.argv
-job_name = f"test-wrenformer-wbm-IS2RE{'-debug' if DEBUG else ''}"
+job_name = f"test-wrenformer-wbm-{task_type}{'-debug' if DEBUG else ''}"
 module_dir = os.path.dirname(__file__)
 out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
 
diff --git a/tests/test_structure.py b/tests/test_structure.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+from pymatgen.core import Lattice, Structure
+
+from matbench_discovery.structure import perturb_structure
+
+
+@pytest.fixture
+def struct() -> Structure:  # 2-site cubic Fe-O structure used as perturbation input
+    return Structure(
+        lattice=Lattice.cubic(5),
+        species=("Fe", "O"),
+        coords=((0, 0, 0), (0.5, 0.5, 0.5)),
+    )
+
+
+def test_perturb_structure(struct: Structure) -> None:
+    np.random.seed(0)  # fix global RNG state so the same draw can be replayed below
+    perturbed = perturb_structure(struct)
+    assert len(perturbed) == len(struct)  # perturbing must not add or drop sites
+
+    for site, new in zip(struct, perturbed):
+        assert site.specie == new.specie
+        assert tuple(site.coords) != tuple(new.coords)  # every site must have moved
+
+    # re-seeding the global RNG replays the exact same perturbation
+    np.random.seed(0)
+    assert perturbed == perturb_structure(struct)
+    # but without re-seeding, consecutive calls give different results
+    assert perturb_structure(struct) != perturb_structure(struct)