replace figshare wbm-steps-summary.csv downloads with local data/wbm/2022-10-19-wbm-summary.csv

janosh · janosh · commit 73655f6051f4 · 2023-06-19T20:29:21.000-07:00
diff --git a/data/wbm/readme.md b/data/wbm/readme.md
@@ -8,9 +8,7 @@ Source: [Predicting stable crystalline compounds using chemical similarity](http
 Load with
 
 ```py
-df_wbm_summary = pd.read_csv(  # download wbm-steps-summary.csv (23.31 MB)
-    "https://figshare.com/files/37570234?private_link=ff0ad14505f9624f0c05"
-).set_index("material_id")
+df_wbm_summary = pd.read_csv("data/wbm/2022-10-19-wbm-summary.csv").set_index("material_id")
 ```
 
 ## Comprehensive Link Collection for WBM dataset
diff --git a/models/bowsr/join_bowsr_results.py b/models/bowsr/join_bowsr_results.py
@@ -10,7 +10,7 @@
 from pymatgen.core import Structure
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, as_dict_handler
+from matbench_discovery import ROOT
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-09-22"
@@ -48,9 +48,8 @@
 
 
 # %% compare against WBM formation energy targets to make sure we got sensible results
-df_wbm = pd.read_csv(  # download wbm-steps-summary.csv (23.31 MB)
-    "https://figshare.com/files/37570234?private_link=ff0ad14505f9624f0c05"
-).set_index("material_id")
+data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-summary.csv"
+df_wbm = pd.read_csv(data_path).set_index("material_id")
 
 df_bowsr["e_form_wbm"] = df_wbm.e_form_per_atom
 
@@ -71,7 +70,7 @@
 
 # %%
 out_path = f"{ROOT}/models/bowsr/{today}-bowsr-megnet-wbm-{task_type}.json.gz"
-df_bowsr.reset_index().to_json(out_path, default_handler=as_dict_handler)
+df_bowsr.reset_index().to_json(out_path, default_handler=lambda x: x.as_dict())
 
 # out_path = f"{ROOT}/models/bowsr/2022-08-16-bowsr-megnet-wbm-IS2RE.json.gz"
 # df_bowsr = pd.read_json(out_path).set_index("material_id")
diff --git a/models/cgcnn/use_cgcnn_ensemble.py b/models/cgcnn/use_cgcnn_ensemble.py
@@ -49,12 +49,10 @@
 df = df.dropna()  # two missing initial structures
 assert len(df) == old_len - 2
 
-assert all(
-    df.index == df_wbm.drop(index=no_init_structs).index
-), "df and df_wbm must have same index"
-df["e_form_per_atom_mp2020_corrected"] = df_wbm.e_form_per_atom_mp2020_corrected
+assert all(df.index == df_wbm.drop(index=no_init_structs).index)
 
 target_col = "e_form_per_atom_mp2020_corrected"
+df[target_col] = df_wbm[target_col]
 input_col = "initial_structure"
 assert target_col in df, f"{target_col=} not in {list(df)}"
 assert input_col in df, f"{input_col=} not in {list(df)}"
@@ -84,4 +82,4 @@
     data_loader=data_loader,
 )
 
-df.round(6).to_csv(f"{module_dir}/{today}-{run_name}-preds.csv")
+df.round(6).to_csv(f"{module_dir}/{today}-{run_name}-preds.csv", index=False)
diff --git a/models/wrenformer/mp/use_wrenformer_ensemble.py b/models/wrenformer/mp/use_wrenformer_ensemble.py
@@ -10,6 +10,7 @@
 from aviary.wrenformer.data import df_to_in_mem_dataloader
 from aviary.wrenformer.model import Wrenformer
 
+from matbench_discovery import ROOT
 from matbench_discovery.slurm import slurm_submit_python
 
 __author__ = "Janosh Riebesell"
@@ -37,8 +38,7 @@
 
 
 # %%
-# download wbm-steps-summary.csv (23.31 MB)
-data_path = "https://figshare.com/files/37570234?private_link=ff0ad14505f9624f0c05"
+data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-summary.csv"
 df = pd.read_csv(data_path).set_index("material_id")
 
 target_col = "e_form_per_atom"