Skip to content

Commit 8a8aac6

Browse files
committed
mkdir data/mp and move some PPD pickles and MP CSEs in there
1 parent a2e3f46 commit 8a8aac6

8 files changed

+19
-16
lines changed

.gitignore

-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ __pycache__
1212
*.csv.bz2
1313
*.pkl.gz
1414
data/**/raw
15-
data/**/202*
1615

1716
# Weights and Biases logs
1817
wandb/

matbench_discovery/build_phase_diagram.py → data/mp/build_phase_diagram.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,13 @@
2626
pd.Series(
2727
{e.entry_id: e for e in all_mp_computed_structure_entries}
2828
).drop_duplicates().to_json( # mp-15590 appears twice so we drop_duplicates()
29-
f"{ROOT}/data/{today}-mp-computed-structure-entries.json.gz",
29+
f"{module_dir}/{today}-mp-computed-structure-entries.json.gz",
3030
default_handler=lambda x: x.as_dict(),
3131
)
3232

3333

3434
# %%
35-
data_path = f"{ROOT}/data/2022-09-16-mp-computed-structure-entries.json.gz"
35+
data_path = f"{module_dir}/2022-09-16-mp-computed-structure-entries.json.gz"
3636
df = pd.read_json(data_path).set_index("material_id")
3737
# drop the structure, just load ComputedEntry
3838
mp_computed_entries = df.entry.map(ComputedEntry.from_dict).to_dict()
@@ -87,11 +87,11 @@
8787
elemental_ref_entries = get_elemental_ref_entries(mp_computed_entries)
8888

8989
# save elemental_ref_entries to disk as json
90-
with open(f"{module_dir}/{today}-elemental-ref-entries.json", "w") as file:
90+
with open(f"{ROOT}/data/mp/{today}-mp-elemental-reference-entries.json", "w") as file:
9191
json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())
9292

9393

94-
df_mp = pd.read_json(f"{ROOT}/data/2022-08-13-mp-energies.json.gz").set_index(
94+
df_mp = pd.read_json(f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz").set_index(
9595
"material_id"
9696
)
9797

models/wrenformer/mp/get_mp_energies.py → data/mp/get_mp_energies.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# %%
2+
import os
23
from datetime import datetime
34

45
import pandas as pd
@@ -7,8 +8,6 @@
78
from mp_api.client import MPRester
89
from tqdm import tqdm
910

10-
from matbench_discovery import ROOT
11-
1211
"""
1312
Download all MP formation and above hull energies on 2022-08-13.
1413
@@ -20,6 +19,7 @@
2019
__date__ = "2022-08-13"
2120

2221
today = f"{datetime.now():%Y-%m-%d}"
22+
module_dir = os.path.dirname(__file__)
2323

2424

2525
# %% query all MP formation energies on 2022-08-13
@@ -48,6 +48,6 @@
4848

4949
df["wyckoff"] = [get_aflow_label_from_spglib(x) for x in tqdm(df.structure)]
5050

51-
df.to_json(f"{ROOT}/data/{today}-mp-energies.json.gz", default_handler=as_dict_handler)
51+
df.to_json(f"{module_dir}/{today}-mp-energies.json.gz", default_handler=as_dict_handler)
5252

53-
# df = pd.read_json(f"{ROOT}/data/2022-08-13-mp-energies.json.gz")
53+
# df = pd.read_json(f"{module_dir}/2022-08-13-mp-energies.json.gz")

data/wbm/fetch_process_wbm_dataset.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -272,9 +272,13 @@ def increment_wbm_material_id(wbm_id: str) -> str:
272272
df_wbm["formula_from_cse"] = [
273273
x.alphabetical_formula for x in df_wbm.pop("composition_from_cse")
274274
]
275-
df_wbm[
276-
["initial_structure", "computed_structure_entry", "formula_from_cse"]
277-
].reset_index().to_json(f"{module_dir}/{today}-wbm-cses+init-structs.json.bz2")
275+
276+
for key, col_name in (
277+
("cses", "computed_structure_entry"),
278+
("init-structs", "initial_structure"),
279+
):
280+
cols = ["initial_structure", "formula_from_cse", col_name]
281+
df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{key}.json.bz2")
278282

279283

280284
# %%

models/bowsr/slurm_array_bowsr_wbm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
slurm_array_task_count = 500
3737
out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}"
3838

39-
data_path = f"{ROOT}/data/2022-06-26-wbm-cses-and-initial-structures.json.gz"
39+
data_path = f"{ROOT}/data/2022-10-19-wbm-init-structs.json.gz"
4040

4141
slurm_submit_python(
4242
job_name=f"bowsr-megnet-wbm-{task_type}",

models/cgcnn/use_cgcnn_ensemble.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
)
6363
df, ensemble_metrics = predict_from_wandb_checkpoints(
6464
runs,
65-
df=df,
65+
df=cg_data.df, # dropping isolated-atom structs means len(cg_data.df) < len(df)
6666
target_col=target_col,
6767
model_class=CrystalGraphConvNet,
6868
data_loader=data_loader,

models/voronoi/featurize_mp_wbm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444

4545

4646
# %%
47-
data_path = f"{ROOT}/data/2022-09-16-mp-computed-structure-entries.json.gz"
47+
data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
4848
# data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-cses+init-structs.json.bz2"
4949
df = pd.read_json(data_path).set_index("material_id")
5050

models/wrenformer/slurm_train_wrenformer_ensemble.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444

4545
# %%
4646
learning_rate = 3e-4
47-
data_path = f"{ROOT}/data/2022-08-13-mp-energies.json.gz"
47+
data_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
4848
target_col = "energy_per_atom"
4949
# data_path = f"{ROOT}/data/2022-08-25-m3gnet-trainset-mp-2021-struct-energy.json.gz"
5050
# target_col = "mp_energy_per_atom"

0 commit comments

Comments
 (0)