|
1 |
| -"""Concatenate chgnet results from multiple data files generated by slurm job array |
| 1 | +"""Concatenate CHGNet results from multiple data files generated by slurm job array |
2 | 2 | into a single file.
|
3 | 3 | """
|
4 | 4 |
|
|
13 | 13 | import pandas as pd
|
14 | 14 | from megnet.utils.models import load_model
|
15 | 15 | from pymatgen.core import Structure
|
16 |
| -from pymatgen.entries.compatibility import MaterialsProject2020Compatibility |
17 |
| -from pymatgen.entries.computed_entries import ComputedStructureEntry |
18 | 16 | from pymatviz import density_scatter
|
19 | 17 | from tqdm import tqdm
|
20 | 18 |
|
21 | 19 | from matbench_discovery import today
|
22 |
| -from matbench_discovery.data import DATA_FILES, as_dict_handler |
| 20 | +from matbench_discovery.data import as_dict_handler |
23 | 21 | from matbench_discovery.energy import get_e_form_per_atom
|
24 | 22 | from matbench_discovery.preds import df_wbm, e_form_col
|
25 | 23 |
|
|
32 | 30 | # %%
|
33 | 31 | module_dir = os.path.dirname(__file__)
|
34 | 32 | task_type = "IS2RE"
|
35 |
| -date = "2023-03-04" |
| 33 | +date = "2023-03-06" |
36 | 34 | glob_pattern = f"{date}-chgnet-wbm-{task_type}*/*.json.gz"
|
37 | 35 | file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
|
38 | 36 | print(f"Found {len(file_paths):,} files for {glob_pattern = }")
|
|
48 | 46 | # drop trajectory to save memory
|
49 | 47 | dfs[file_path] = df.drop(columns="chgnet_trajectory")
|
50 | 48 |
|
51 |
| - |
52 |
| -# %% |
53 | 49 | df_chgnet = pd.concat(dfs.values()).round(4)
|
54 | 50 |
|
55 | 51 |
|
56 |
| -# %% |
57 |
| -df_cse = pd.read_json(DATA_FILES.wbm_computed_structure_entries).set_index( |
58 |
| - "material_id" |
59 |
| -) |
60 |
| - |
61 |
| -df_cse["cse"] = [ |
62 |
| - ComputedStructureEntry.from_dict(x) for x in tqdm(df_cse.computed_structure_entry) |
63 |
| -] |
64 |
| - |
65 |
| - |
66 |
| -# %% transfer CHGNet energies and relaxed structures to WBM CSEs since MP2020 energy |
67 |
| -# corrections applied below are structure-dependent (for oxides and sulfides) |
68 |
| -cse: ComputedStructureEntry |
69 |
| -for row in tqdm(df_chgnet.itertuples(), total=len(df_chgnet)): |
70 |
| - mat_id, struct_dict, chgnet_energy, *_ = row |
71 |
| - chgnet_struct = Structure.from_dict(struct_dict) |
72 |
| - cse = df_cse.loc[mat_id, "cse"] |
73 |
| - cse._energy = chgnet_energy # cse._energy is the uncorrected energy |
74 |
| - cse._structure = chgnet_struct |
75 |
| - df_chgnet.loc[mat_id, "cse"] = cse |
76 |
| - |
77 |
| - |
78 |
| -# %% apply energy corrections to CSEs with CHGNet |
79 |
| -out = MaterialsProject2020Compatibility().process_entries( |
80 |
| - df_chgnet.cse, verbose=True, clean=True |
81 |
| -) |
82 |
| -assert len(out) == len(df_chgnet) |
83 |
| - |
84 |
| - |
85 | 52 | # %% compute formation energies from CHGNet energies
|
86 | 53 | e_form_chgnet_col = "e_form_per_atom_chgnet"
|
87 |
| -df_chgnet[e_form_chgnet_col] = [get_e_form_per_atom(cse) for cse in tqdm(df_chgnet.cse)] |
| 54 | +df_chgnet["formula"] = df_wbm.formula |
| 55 | +df_chgnet[e_form_chgnet_col] = [ |
| 56 | + get_e_form_per_atom(dict(energy=ene, composition=formula)) |
| 57 | + for formula, ene in tqdm( |
| 58 | + df_chgnet.set_index("formula").chgnet_energy.items(), total=len(df_chgnet) |
| 59 | + ) |
| 60 | +] |
| 61 | +df_wbm[e_form_chgnet_col] = df_chgnet[e_form_chgnet_col] |
88 | 62 |
|
89 | 63 |
|
90 | 64 | # %%
|
91 |
| -ax = density_scatter(x=df_wbm[e_form_col], y=df_chgnet[e_form_chgnet_col]) |
| 65 | +ax = density_scatter(x=df_wbm[e_form_col], y=df_wbm[e_form_chgnet_col]) |
92 | 66 |
|
93 | 67 |
|
94 | 68 | # %% load 2019 MEGNet formation energy model
|
|
97 | 71 |
|
98 | 72 |
|
99 | 73 | # %% predict formation energies on chgnet relaxed structure with MEGNet
|
100 |
| -for material_id, cse in tqdm(df_cse.cse.items(), total=len(df_cse)): |
| 74 | +for material_id, struct in tqdm( |
| 75 | + df_chgnet.chgnet_structure.items(), total=len(df_chgnet) |
| 76 | +): |
101 | 77 | if material_id in megnet_e_form_preds:
|
102 | 78 | continue
|
103 | 79 | try:
|
104 |
| - struct = cse.structure |
| 80 | + if isinstance(struct, dict): |
| 81 | + struct = Structure.from_dict(struct) |
105 | 82 | [e_form_per_atom] = megnet_mp_e_form.predict_structure(struct)
|
106 | 83 | megnet_e_form_preds[material_id] = e_form_per_atom
|
107 | 84 | except Exception as exc:
|
|
118 | 95 |
|
119 | 96 | assert (
|
120 | 97 | n_isna := df_chgnet.e_form_per_atom_chgnet_megnet.isna().sum()
|
121 |
| -) < 10, f"{n_isna=}, expected 7 or similar" |
| 98 | +) < 10, f"too many missing MEGNet preds: {n_isna}" |
122 | 99 |
|
123 | 100 |
|
124 | 101 | # %%
|
|
133 | 110 |
|
134 | 111 | df_chgnet.select_dtypes("number").to_csv(out_path.replace(".json.gz", ".csv"))
|
135 | 112 |
|
136 |
| -# in_path = f"{module_dir}/2022-10-31-chgnet-wbm-IS2RE.json.gz" |
137 |
| -# df_chgnet_csv = pd.read_csv(in_path.replace(".json.gz", ".csv")) |
| 113 | +# in_path = f"{module_dir}/2023-03-04-chgnet-wbm-IS2RE.json.gz" |
| 114 | +# df_chgnet = pd.read_csv(in_path.replace(".json.gz", ".csv")).set_index("material_id") |
138 | 115 | # df_chgnet = pd.read_json(in_path).set_index("material_id")
|
0 commit comments