Skip to content

Commit 05c4335

Browse files
committed
sevennet/filter_bad_preds.py fix bad_mask df_wbm column access
1 parent df69736 commit 05c4335

File tree

5 files changed

+39
-38
lines changed

5 files changed

+39
-38
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ default_install_hook_types: [pre-commit, commit-msg]
88

99
repos:
1010
- repo: https://github.com/astral-sh/ruff-pre-commit
11-
rev: v0.5.3
11+
rev: v0.5.5
1212
hooks:
1313
- id: ruff
1414
args: [--fix]
@@ -79,7 +79,7 @@ repos:
7979
- id: check-github-actions
8080

8181
- repo: https://github.com/RobertCraigie/pyright-python
82-
rev: v1.1.372
82+
rev: v1.1.373
8383
hooks:
8484
- id: pyright
8585
args: [--level, error]

models/sevennet/filter_bad_preds.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
import pandas as pd
22

3-
from matbench_discovery.data import Key, df_wbm
3+
from matbench_discovery.data import df_wbm
4+
from matbench_discovery.enums import MbdKey
45

5-
E_FORM_COL = "e_form_per_atom_sevennet"
6+
e_form_7net_col = "e_form_per_atom_sevennet"
67

78
csv_path = "./2024-07-11-sevennet-preds.csv.gz"
89
df_preds = pd.read_csv(csv_path).set_index("material_id")
910

1011
# NOTE this filtering is necessary for both MACE and SevenNet because some outliers
1112
# have extremely low e_form (like -1e40)
12-
bad_mask = df_preds[E_FORM_COL] - df_wbm[Key.e_form] < -5
13+
bad_mask = abs(df_preds[e_form_7net_col] - df_wbm[MbdKey.e_form_wbm]) > 5
14+
n_preds = len(df_preds[e_form_7net_col].dropna())
15+
print(f"{sum(bad_mask)=} is {sum(bad_mask) / len(df_wbm):.2%} of {n_preds:,}")
1316
df_preds[~bad_mask].select_dtypes("number").to_csv(
1417
"./2024-07-11-sevennet-preds-no-bad.csv.gz"
1518
)

models/sevennet/test_sevennet.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424

2525

2626
# %% this config is editable
27-
SMOKE_TEST = True
27+
smoke_test = True
2828
sevennet_root = os.path.dirname(sevenn.__path__[0])
2929
module_dir = os.path.dirname(__file__)
3030
sevennet_chkpt = f"{module_dir}/sevennet_checkpoint.pth.tar"
31-
pot_name = "sevennet"
31+
model_name = "sevennet"
3232
task_type = Task.IS2RE
3333
ase_optimizer = "FIRE"
3434
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -55,22 +55,20 @@
5555
slurm_array_task_id = int(os.getenv("SLURM_ARRAY_TASK_ID", "0"))
5656

5757
os.makedirs(out_dir := "./results", exist_ok=True)
58-
out_path = f"{out_dir}/{pot_name}-{slurm_array_task_id:>03}.json.gz"
58+
out_path = f"{out_dir}/{model_name}-{slurm_array_task_id:>03}.json.gz"
5959

6060
data_path = {Task.IS2RE: DataFiles.wbm_initial_structures.path}[task_type]
61-
print(f"\nJob started running {timestamp}, eval {pot_name}", flush=True)
61+
print(f"\nJob started running {timestamp}, eval {model_name}", flush=True)
6262
print(f"{data_path=}", flush=True)
6363

64-
e_pred_col = "sevennet_energy"
65-
66-
# Init ASE SevenNet Calculator from checkpoint
64+
# Initialize ASE SevenNet Calculator from checkpoint
6765
sevennet_calc = SevenNetCalculator(sevennet_chkpt)
6866

6967

7068
# %%
7169
print(f"Read data from {data_path}")
7270
df_in = pd.read_json(data_path).set_index(Key.mat_id)
73-
if SMOKE_TEST:
71+
if smoke_test:
7472
df_in = df_in.head(10)
7573
else:
7674
df_in = df_in.sample(frac=1, random_state=7) # shuffle data for equal runtime
@@ -111,5 +109,5 @@
111109

112110

113111
# %%
114-
if not SMOKE_TEST:
112+
if not smoke_test:
115113
df_out.reset_index().to_json(out_path, default_handler=as_dict_handler)
Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,47 @@
11
model:
2-
chemical_species: "auto"
2+
chemical_species: auto
33
cutoff: 5.0
44
channel: 128
55
is_parity: False
66
lmax: 2
77
num_convolution_layer: 5
88
irreps_manual:
9-
- "128x0e"
10-
- "128x0e+64x1e+32x2e"
11-
- "128x0e+64x1e+32x2e"
12-
- "128x0e+64x1e+32x2e"
13-
- "128x0e+64x1e+32x2e"
14-
- "128x0e"
9+
- 128x0e
10+
- 128x0e+64x1e+32x2e
11+
- 128x0e+64x1e+32x2e
12+
- 128x0e+64x1e+32x2e
13+
- 128x0e+64x1e+32x2e
14+
- 128x0e
1515

1616
weight_nn_hidden_neurons: [64, 64]
1717
radial_basis:
18-
radial_basis_name: "bessel"
18+
radial_basis_name: bessel
1919
bessel_basis_num: 8
2020
cutoff_function:
21-
cutoff_function_name: "XPLOR"
21+
cutoff_function_name: XPLOR
2222
cutoff_on: 4.5
2323

24-
act_gate: { "e": "silu", "o": "tanh" }
25-
act_scalar: { "e": "silu", "o": "tanh" }
24+
act_gate: { "e": silu, "o": tanh }
25+
act_scalar: { "e": silu, "o": tanh }
2626

27-
conv_denominator: "avg_num_neigh"
27+
conv_denominator: avg_num_neigh
2828
train_shift_scale: False
2929
train_denominator: False
30-
self_connection_type: "linear"
30+
self_connection_type: linear
3131
train:
3232
train_shuffle: False
3333
random_seed: 1
3434
is_train_stress: True
3535
epoch: 600
3636

37-
loss: "Huber"
37+
loss: Huber
3838
loss_param:
3939
delta: 0.01
4040

41-
optimizer: "adam"
41+
optimizer: adam
4242
optim_param:
4343
lr: 0.01
44-
scheduler: "linearlr"
44+
scheduler: linearlr
4545
scheduler_param:
4646
start_factor: 1.0
4747
total_iters: 600
@@ -51,10 +51,10 @@ train:
5151
stress_loss_weight: 0.01
5252

5353
error_record:
54-
- ["Energy", "MAE"]
55-
- ["Force", "MAE"]
56-
- ["Stress", "MAE"]
57-
- ["TotalLoss", "None"]
54+
- [Energy, MAE]
55+
- [Force, MAE]
56+
- [Stress, MAE]
57+
- [TotalLoss, None]
5858

5959
per_epoch: 10
6060
# continue:
@@ -64,9 +64,9 @@ train:
6464
data:
6565
data_shuffle: False
6666
batch_size: 4 # batch size per gpu
67-
scale: "per_atom_energy_std"
68-
shift: "elemwise_reference_energies"
67+
scale: per_atom_energy_std
68+
shift: elemwise_reference_energies
6969

7070
save_by_train_valid: False
71-
load_dataset_path: ["train.sevenn_data"]
72-
load_validset_path: ["valid.sevenn_data"]
71+
load_dataset_path: [train.sevenn_data]
72+
load_validset_path: [valid.sevenn_data]

0 commit comments

Comments
 (0)