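don't hydrate pseudo-relaxed structures (drop Structure.from_dict) in join_{bowsr,m3gnet}_results.py; import shared `today` from matbench_discovery instead of redefining it per script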

janosh · janosh · commit 2073300eefce · 2023-06-19T20:29:22.000-07:00
diff --git a/data/mp/build_phase_diagram.py b/data/mp/build_phase_diagram.py
@@ -3,7 +3,6 @@
 import json
 import os
 import pickle
-from datetime import datetime
 
 import pandas as pd
 import pymatviz
@@ -12,10 +11,9 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from pymatgen.ext.matproj import MPRester
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.energy import get_e_form_per_atom, get_elemental_ref_entries
 
-today = f"{datetime.now():%Y-%m-%d}"
 module_dir = os.path.dirname(__file__)
 
 
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -1,13 +1,14 @@
 # %%
 import os
-from datetime import datetime
 
 import pandas as pd
 from aviary.utils import as_dict_handler
 from aviary.wren.utils import get_aflow_label_from_spglib
 from mp_api.client import MPRester
 from tqdm import tqdm
 
+from matbench_discovery import today
+
 """
 Download all MP formation and above hull energies on 2022-08-13.
 
@@ -18,7 +19,7 @@
 __author__ = "Janosh Riebesell"
 __date__ = "2022-08-13"
 
-today = f"{datetime.now():%Y-%m-%d}"
+
 module_dir = os.path.dirname(__file__)
 
 
diff --git a/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py b/data/wbm/compare_cse_vs_ce_mp_2020_corrections.py
@@ -2,7 +2,6 @@
 import gzip
 import json
 import warnings
-from datetime import datetime
 
 import pandas as pd
 from pymatgen.entries.compatibility import (
@@ -12,7 +11,7 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from tqdm import tqdm
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.energy import get_e_form_per_atom
 from matbench_discovery.plot_scripts import df_wbm
 from matbench_discovery.plots import plt
@@ -23,7 +22,6 @@
 ComputedStructureEntry, not ComputedEntry when applying corrections.
 """
 
-today = f"{datetime.now():%Y-%m-%d}"
 
 cse_path = f"{ROOT}/data/wbm/2022-10-19-wbm-cses.json.bz2"
 df_cse = pd.read_json(cse_path).set_index("material_id")
@@ -101,7 +99,7 @@
 
 ax.axline((0, 0), slope=1, color="gray", linestyle="dashed", zorder=-1)
 
-# ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-corrections-outliers.pdf")
+ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-corrections-outliers.pdf")
 
 
 # %%
@@ -122,7 +120,7 @@
 # insight: all materials for which ComputedEntry and ComputedStructureEntry give
 # different formation energies are oxides or sulfides for which MP 2020 compat takes
 # into account structural information to make more accurate corrections.
-# ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-outliers.pdf")
+ax.figure.savefig(f"{ROOT}/tmp/{today}-ce-vs-cse-e-form-outliers.pdf")
 
 
 # %% below code resulted in
diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
@@ -4,7 +4,6 @@
 import pickle
 import urllib.request
 import warnings
-from datetime import datetime
 from glob import glob
 
 import pandas as pd
@@ -21,7 +20,7 @@
 from pymatviz import density_scatter
 from tqdm import tqdm
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.energy import get_e_form_per_atom
 
 try:
@@ -40,7 +39,6 @@
 
 
 module_dir = os.path.dirname(__file__)
-today = f"{datetime.now():%Y-%m-%d}"
 warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen")
 
 
diff --git a/matbench_discovery/plot_scripts/hist_classified_stable_vs_hull_dist.py b/matbench_discovery/plot_scripts/hist_classified_stable_vs_hull_dist.py
@@ -1,9 +1,7 @@
 # %%
-from datetime import datetime
-
 import pandas as pd
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.plot_scripts import df_wbm
 from matbench_discovery.plots import (
     StabilityCriterion,
@@ -25,8 +23,6 @@
 See fig. S1 in https://science.org/doi/10.1126/sciadv.abn4117.
 """
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 df = pd.read_csv(
@@ -78,5 +74,7 @@
     title=f"Enrichment Factor = {metrics['enrichment']:.3}",
 )
 
-fig_name = f"wren-wbm-hull-dist-hist-{which_energy=}-{stability_crit=}"
-# fig.savefig(f"{ROOT}/figures/{today}-{fig_name}.pdf")
+
+# %%
+fig_name = f"{today}-wren-wbm-hull-dist-hist-{which_energy=}-{stability_crit=}"
+# fig.savefig(f"{ROOT}/figures/{fig_name}.pdf")
diff --git a/matbench_discovery/plot_scripts/hist_classified_stable_vs_hull_dist_batches.py b/matbench_discovery/plot_scripts/hist_classified_stable_vs_hull_dist_batches.py
@@ -1,10 +1,8 @@
 # %%
-from datetime import datetime
-
 import pandas as pd
 import pymatviz
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.plot_scripts import df_wbm
 from matbench_discovery.plots import (
     StabilityCriterion,
@@ -27,8 +25,6 @@
 See fig. S1 in https://science.org/doi/10.1126/sciadv.abn4117.
 """
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 dfs = {}
diff --git a/matbench_discovery/plot_scripts/precision_recall.py b/matbench_discovery/plot_scripts/precision_recall.py
@@ -1,18 +1,13 @@
 # %%
-from datetime import datetime
-
-import matplotlib.pyplot as plt
 import pandas as pd
 from sklearn.metrics import f1_score
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.plot_scripts import df_wbm
-from matbench_discovery.plots import StabilityCriterion, cumulative_clf_metric
+from matbench_discovery.plots import StabilityCriterion, cumulative_clf_metric, plt
 
 __author__ = "Rhys Goodall, Janosh Riebesell"
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 dfs: dict[str, pd.DataFrame] = {}
@@ -118,10 +113,11 @@
 # x-ticks every 10k materials
 # ax.set(xticks=range(0, int(ax.get_xlim()[1]), 10_000))
 
-fig.suptitle(f"{today} ")
+fig.suptitle(f"{today} {stability_crit=}")
 xlabel_cumulative = "Materials predicted stable sorted by hull distance"
 fig.text(0.5, -0.08, xlabel_cumulative, ha="center")
 
 
 # %%
-# fig.savefig(f"{ROOT}/figures/{today}-precision-recall-curves.pdf")
+img_path = f"{ROOT}/figures/{today}-precision-recall-curves.pdf"
+# fig.savefig(img_path)
diff --git a/matbench_discovery/plot_scripts/rolling_mae_vs_hull_dist.py b/matbench_discovery/plot_scripts/rolling_mae_vs_hull_dist.py
@@ -1,17 +1,13 @@
 # %%
-from datetime import datetime
-
 import pandas as pd
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.plot_scripts import df_wbm
 from matbench_discovery.plots import rolling_mae_vs_hull_dist
 
 __author__ = "Rhys Goodall, Janosh Riebesell"
 __date__ = "2022-06-18"
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 markers = ["o", "v", "^", "H", "D", ""]
diff --git a/matbench_discovery/plot_scripts/rolling_mae_vs_hull_dist_wbm_batches.py b/matbench_discovery/plot_scripts/rolling_mae_vs_hull_dist_wbm_batches.py
@@ -1,17 +1,13 @@
 # %%
-from datetime import datetime
-
 import pandas as pd
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 from matbench_discovery.plot_scripts import df_wbm
 from matbench_discovery.plots import plt, rolling_mae_vs_hull_dist
 
 __author__ = "Rhys Goodall, Janosh Riebesell"
 __date__ = "2022-06-18"
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 df_wren = pd.read_csv(
@@ -70,4 +66,4 @@
 
 
 img_name = f"{today}-{model_name}-rolling-mae-vs-hull-dist-wbm-batches"
-fig.savefig(f"{ROOT}/figures/{img_name}.pdf")
+# fig.savefig(f"{ROOT}/figures/{img_name}.pdf")
diff --git a/models/bowsr/join_bowsr_results.py b/models/bowsr/join_bowsr_results.py
@@ -2,26 +2,22 @@
 from __future__ import annotations
 
 import os
-from datetime import datetime
 from glob import glob
 
 import pandas as pd
 import pymatviz
-from pymatgen.core import Structure
 from tqdm import tqdm
 
-from matbench_discovery import ROOT
+from matbench_discovery import ROOT, today
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-09-22"
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 module_dir = os.path.dirname(__file__)
 task_type = "IS2RE"
-date = "2022-09-22"
+date = "2022-11-22"
 glob_pattern = f"{date}-bowsr-megnet-wbm-{task_type}/*.json.gz"
 file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
 print(f"Found {len(file_paths):,} files for {glob_pattern = }")
@@ -35,10 +31,6 @@
         continue
     df = pd.read_json(file_path).set_index("material_id")
 
-    df["bowsr_structure"] = df.structure_bowsr.map(Structure.from_dict)
-    df["formula"] = df.structure_bowsr.map(lambda x: x.alphabetical_formula)
-    df["bowsr_volume"] = df.structure_bowsr.map(lambda x: x.volume)
-    df["n_sites"] = df.structure_bowsr.map(len)
     dfs[file_path] = df
 
 
@@ -57,7 +49,7 @@
 # %%
 pymatviz.density_scatter(
     x=df_bowsr.e_form_per_atom_bowsr_megnet,
-    y=df_bowsr.e_form_wbm,
+    y=df_wbm.loc[df_bowsr.index].e_form_per_atom_mp2020_corrected,
 )
 
 
diff --git a/models/m3gnet/eda_wbm_pre_vs_post_m3gnet_relaxation.py b/models/m3gnet/eda_wbm_pre_vs_post_m3gnet_relaxation.py
@@ -1,6 +1,4 @@
 # %%
-from datetime import datetime
-
 import pandas as pd
 import plotly.express as px
 from pymatgen.core import Structure
@@ -9,13 +7,12 @@
 from pymatviz.utils import add_identity_line
 from sklearn.metrics import r2_score
 
-from matbench_discovery import ROOT, plots
+from matbench_discovery import ROOT, plots, today
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-06-18"
 
 
-today = f"{datetime.now():%Y-%m-%d}"
 del plots  # https://github.com/PyCQA/pyflakes/issues/366
 
 
diff --git a/models/m3gnet/join_m3gnet_results.py b/models/m3gnet/join_m3gnet_results.py
@@ -2,22 +2,18 @@
 from __future__ import annotations
 
 import os
-from datetime import datetime
 from glob import glob
 
 import pandas as pd
 from pymatgen.analysis.phase_diagram import PDEntry
-from pymatgen.core import Structure
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, as_dict_handler
+from matbench_discovery import ROOT, as_dict_handler, today
 from matbench_discovery.energy import get_e_form_per_atom
 
 __author__ = "Janosh Riebesell"
 __date__ = "2022-08-16"
 
-today = f"{datetime.now():%Y-%m-%d}"
-
 
 # %%
 module_dir = os.path.dirname(__file__)
@@ -36,26 +32,16 @@
 for file_path in tqdm(file_paths):
     if file_path in dfs:
         continue
-    try:
-        # keep whole dataframe in memory
-        df = pd.read_json(file_path).set_index("material_id")
-        df.index.name = "material_id"
-        col_map = dict(
-            final_structure="m3gnet_structure", trajectory="m3gnet_trajectory"
-        )
-        df = df.rename(columns=col_map)
-        df.reset_index().to_json(file_path)
-        df[f"m3gnet_energy_{task_type}"] = df.m3gnet_trajectory.map(
-            lambda x: x["energies"][-1][0]
-        )
-        df["m3gnet_structure"] = df.m3gnet_structure.map(Structure.from_dict)
-        df["formula"] = df.m3gnet_structure.map(lambda x: x.alphabetical_formula)
-        df["m3gnet_volume"] = df.m3gnet_structure.map(lambda x: x.volume)
-        df["n_sites"] = df.m3gnet_structure.map(len)
-        # drop trajectory to save memory
-        dfs[file_path] = df.drop(columns=["m3gnet_trajectory"])
-    except FileNotFoundError:
-        continue
+    df = pd.read_json(file_path).set_index("material_id")
+    df.index.name = "material_id"
+    col_map = dict(final_structure="structure_m3gnet", trajectory="m3gnet_trajectory")
+    df = df.rename(columns=col_map)
+    df.reset_index().to_json(file_path)
+    df[f"m3gnet_energy_{task_type}"] = df.m3gnet_trajectory.map(
+        lambda x: x["energies"][-1][0]
+    )
+    # drop trajectory to save memory
+    dfs[file_path] = df.drop(columns=["m3gnet_trajectory"])
 
 
 # %%
@@ -64,7 +50,7 @@
 
 # %%
 df_m3gnet["e_form_per_atom_m3gnet"] = [
-    get_e_form_per_atom(PDEntry(row.m3gnet_structure.composition, row.m3gnet_energy))
+    get_e_form_per_atom(PDEntry(row.structure_m3gnet.composition, row.m3gnet_energy))
     for row in tqdm(df_m3gnet.itertuples(), total=len(df_m3gnet), disable=None)
 ]
 df_m3gnet.isna().sum()
diff --git a/models/voronoi/join_voronoi_features.py b/models/voronoi/join_voronoi_features.py