mkdir data/mp and move some PPD pickles and MP CSEs in there

janosh · janosh · commit 8a8aac65886d · 2023-06-19T20:29:21.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -12,7 +12,6 @@ __pycache__
 *.csv.bz2
 *.pkl.gz
 data/**/raw
-data/**/202*
 
 # Weights and Biases logs
 wandb/
diff --git a/data/mp/build_phase_diagram.py b/data/mp/build_phase_diagram.py
@@ -26,13 +26,13 @@
 pd.Series(
     {e.entry_id: e for e in all_mp_computed_structure_entries}
 ).drop_duplicates().to_json(  # mp-15590 appears twice so we drop_duplicates()
-    f"{ROOT}/data/{today}-mp-computed-structure-entries.json.gz",
+    f"{module_dir}/{today}-mp-computed-structure-entries.json.gz",
     default_handler=lambda x: x.as_dict(),
 )
 
 
 # %%
-data_path = f"{ROOT}/data/2022-09-16-mp-computed-structure-entries.json.gz"
+data_path = f"{module_dir}/2022-09-16-mp-computed-structure-entries.json.gz"
 df = pd.read_json(data_path).set_index("material_id")
 # drop the structure, just load ComputedEntry
 mp_computed_entries = df.entry.map(ComputedEntry.from_dict).to_dict()
@@ -87,11 +87,11 @@
 elemental_ref_entries = get_elemental_ref_entries(mp_computed_entries)
 
 # save elemental_ref_entries to disk as json
-with open(f"{module_dir}/{today}-elemental-ref-entries.json", "w") as file:
+with open(f"{ROOT}/data/mp/{today}-mp-elemental-reference-entries.json", "w") as file:
     json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())
 
 
-df_mp = pd.read_json(f"{ROOT}/data/2022-08-13-mp-energies.json.gz").set_index(
+df_mp = pd.read_json(f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz").set_index(
     "material_id"
 )
 
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -1,4 +1,5 @@
 # %%
+import os
 from datetime import datetime
 
 import pandas as pd
@@ -7,8 +8,6 @@
 from mp_api.client import MPRester
 from tqdm import tqdm
 
-from matbench_discovery import ROOT
-
 """
 Download all MP formation and above hull energies on 2022-08-13.
 
@@ -20,6 +19,7 @@
 __date__ = "2022-08-13"
 
 today = f"{datetime.now():%Y-%m-%d}"
+module_dir = os.path.dirname(__file__)
 
 
 # %% query all MP formation energies on 2022-08-13
@@ -48,6 +48,6 @@
 
 df["wyckoff"] = [get_aflow_label_from_spglib(x) for x in tqdm(df.structure)]
 
-df.to_json(f"{ROOT}/data/{today}-mp-energies.json.gz", default_handler=as_dict_handler)
+df.to_json(f"{module_dir}/{today}-mp-energies.json.gz", default_handler=as_dict_handler)
 
-# df = pd.read_json(f"{ROOT}/data/2022-08-13-mp-energies.json.gz")
+# df = pd.read_json(f"{module_dir}/2022-08-13-mp-energies.json.gz")
diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
@@ -272,9 +272,13 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 df_wbm["formula_from_cse"] = [
     x.alphabetical_formula for x in df_wbm.pop("composition_from_cse")
 ]
-df_wbm[
-    ["initial_structure", "computed_structure_entry", "formula_from_cse"]
-].reset_index().to_json(f"{module_dir}/{today}-wbm-cses+init-structs.json.bz2")
+
+for key, col_name in (
+    ("cses", "computed_structure_entry"),
+    ("init-structs", "initial_structure"),
+):
+    cols = ["initial_structure", "formula_from_cse", col_name]
+    df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{key}.json.bz2")
 
 
 # %%
diff --git a/models/bowsr/slurm_array_bowsr_wbm.py b/models/bowsr/slurm_array_bowsr_wbm.py
@@ -36,7 +36,7 @@
 slurm_array_task_count = 500
 out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}"
 
-data_path = f"{ROOT}/data/2022-06-26-wbm-cses-and-initial-structures.json.gz"
+data_path = f"{ROOT}/data/2022-10-19-wbm-init-structs.json.gz"
 
 slurm_submit_python(
     job_name=f"bowsr-megnet-wbm-{task_type}",
diff --git a/models/cgcnn/use_cgcnn_ensemble.py b/models/cgcnn/use_cgcnn_ensemble.py
@@ -62,7 +62,7 @@
 )
 df, ensemble_metrics = predict_from_wandb_checkpoints(
     runs,
-    df=df,
+    df=cg_data.df,  # dropping isolated-atom structs means len(cg_data.df) < len(df)
     target_col=target_col,
     model_class=CrystalGraphConvNet,
     data_loader=data_loader,
diff --git a/models/voronoi/featurize_mp_wbm.py b/models/voronoi/featurize_mp_wbm.py
@@ -44,7 +44,7 @@
 
 
 # %%
-data_path = f"{ROOT}/data/2022-09-16-mp-computed-structure-entries.json.gz"
+data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
 # data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-cses+init-structs.json.bz2"
 df = pd.read_json(data_path).set_index("material_id")
 
diff --git a/models/wrenformer/slurm_train_wrenformer_ensemble.py b/models/wrenformer/slurm_train_wrenformer_ensemble.py
@@ -44,7 +44,7 @@
 
 # %%
 learning_rate = 3e-4
-data_path = f"{ROOT}/data/2022-08-13-mp-energies.json.gz"
+data_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
 target_col = "energy_per_atom"
 # data_path = f"{ROOT}/data/2022-08-25-m3gnet-trainset-mp-2021-struct-energy.json.gz"
 # target_col = "mp_energy_per_atom"

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -62,7 +62,7 @@` |
| `62` | `62` | `)` |
| `63` | `63` | `df, ensemble_metrics = predict_from_wandb_checkpoints(` |
| `64` | `64` | `runs,` |
| `65` | | `- df=df,` |
| | `65` | `+ df=cg_data.df, # dropping isolated-atom structs means len(cg_data.df) < len(df)` |
| `66` | `66` | `target_col=target_col,` |
| `67` | `67` | `model_class=CrystalGraphConvNet,` |
| `68` | `68` | `data_loader=data_loader,` |