|
24 | 24 |
|
25 | 25 | # %%
|
26 | 26 | glob_pattern = "2022-08-16-m3gnet-wbm-relax-results/*.json.gz"
|
27 |
| -file_paths = glob(f"{ROOT}/data/{glob_pattern}") |
| 27 | +file_paths = sorted(glob(f"{ROOT}/data/{glob_pattern}")) |
28 | 28 | print(f"Found {len(file_paths):,} files for {glob_pattern = }")
|
29 | 29 |
|
30 |
| - |
31 | 30 | dfs: dict[str, pd.DataFrame] = {}
|
| 31 | + |
| 32 | + |
| 33 | +# %% |
32 | 34 | # 2022-08-16 tried multiprocessing.Pool() to load files in parallel but was somehow
|
33 | 35 | # slower than serial loading
|
34 | 36 | for file_path in tqdm(file_paths):
|
35 | 37 | if file_path in dfs:
|
36 | 38 | continue
|
37 | 39 | try:
|
38 |
| - dfs[file_path] = pd.read_json(file_path) |
| 40 | + # keep whole dataframe in memory |
| 41 | + df = pd.read_json(file_path) |
| 42 | + df.index = df.index.str.replace("_", "-") |
| 43 | + df.index.name = "material_id" |
| 44 | + col_map = dict( |
| 45 | + final_structure="m3gnet_structure", trajectory="m3gnet_trajectory" |
| 46 | + ) |
| 47 | + df = df.rename(columns=col_map) |
| 48 | + df.reset_index().to_json(file_path) |
| 49 | + df["m3gnet_energy"] = df.m3gnet_trajectory.map(lambda x: x["energies"][-1][0]) |
| 50 | + df["m3gnet_structure"] = df.m3gnet_structure.map(Structure.from_dict) |
| 51 | + df["formula"] = df.m3gnet_structure.map(lambda x: x.formula) |
| 52 | + df["volume"] = df.m3gnet_structure.map(lambda x: x.volume) |
| 53 | + df["n_sites"] = df.m3gnet_structure.map(len) |
| 54 | + dfs[file_path] = df.drop(columns=["m3gnet_trajectory"]) |
39 | 55 | except (ValueError, FileNotFoundError):
|
40 | 56 | # pandas v1.5+ correctly raises FileNotFoundError, below raises ValueError
|
41 | 57 | continue
|
42 | 58 |
|
43 | 59 |
|
44 | 60 | # %%
|
45 | 61 | df_m3gnet = pd.concat(dfs.values())
|
46 |
| -df_m3gnet.index.name = "material_id" |
47 | 62 | if any(df_m3gnet.index.str.contains("_")):
|
48 | 63 | df_m3gnet.index = df_m3gnet.index.str.replace("_", "-")
|
49 | 64 |
|
50 |
| -df_m3gnet = df_m3gnet.rename( |
51 |
| - columns=dict(final_structure="m3gnet_structure", trajectory="m3gnet_trajectory") |
52 |
| -) |
53 |
| - |
54 |
| -df_m3gnet["m3gnet_energy"] = df_m3gnet.trajectory.map(lambda x: x["energies"][-1][0]) |
55 |
| - |
56 | 65 |
|
57 | 66 | # %%
|
58 | 67 | # 2022-01-25-ppd-mp+wbm.pkl.gz (235 MB)
|
|
64 | 73 | )
|
65 | 74 |
|
66 | 75 |
|
67 |
| -df_m3gnet["m3gnet_structure"] = df_m3gnet.m3gnet_structure.map(Structure.from_dict) |
68 |
| -df_m3gnet["pd_entry"] = [ |
| 76 | +pd_entries_m3gnet = [ |
69 | 77 | PDEntry(row.m3gnet_structure.composition, row.m3gnet_energy)
|
70 | 78 | for row in df_m3gnet.itertuples()
|
71 | 79 | ]
|
72 |
| -df_m3gnet["e_form_m3gnet"] = df_m3gnet.pd_entry.map(ppd_mp_wbm.get_form_energy_per_atom) |
| 80 | +df_m3gnet["e_form_m3gnet"] = [ |
| 81 | + ppd_mp_wbm.get_form_energy_per_atom(x) for x in pd_entries_m3gnet |
| 82 | +] |
73 | 83 |
|
74 | 84 |
|
75 | 85 | # %%
|
|
80 | 90 | df_m3gnet["e_above_mp_hull"] = df_hull.e_above_mp_hull
|
81 | 91 |
|
82 | 92 |
|
83 |
| -df_summary = pd.read_csv(f"{ROOT}/data/wbm-steps-summary.csv", comment="#").set_index( |
84 |
| - "material_id" |
85 |
| -) |
| 93 | +df_wbm = pd.read_csv( # download wbm-steps-summary.csv (23.31 MB) |
| 94 | + "https://figshare.com/ndownloader/files/36714216?private_link=ff0ad14505f9624f0c05" |
| 95 | +).set_index("material_id") |
| 96 | + |
| 97 | +df_m3gnet["e_form_wbm"] = df_wbm.e_form |
| 98 | +df_m3gnet["wbm_energy"] = df_wbm.energy |
86 | 99 |
|
87 |
| -df_m3gnet["e_form_wbm"] = df_summary.e_form |
| 100 | +pd_entries_wbm = [ |
| 101 | + PDEntry(row.m3gnet_structure.composition, row.wbm_energy) |
| 102 | + for row in df_m3gnet.itertuples() |
| 103 | +] |
| 104 | +df_m3gnet["e_form_ppd_2022_01_25"] = [ |
| 105 | + ppd_mp_wbm.get_form_energy_per_atom(x) for x in pd_entries_wbm |
| 106 | +] |
| 107 | + |
| 108 | + |
| 109 | +df_m3gnet.filter(like="e_form").plot.scatter(x="e_form_m3gnet", y="e_form_wbm") |
| 110 | +df_m3gnet.filter(like="e_form").plot.scatter( |
| 111 | + x="e_form_m3gnet", y="e_form_ppd_2022_01_25" |
| 112 | +) |
| 113 | +df_m3gnet.filter(like="e_form").plot.scatter(x="e_form_wbm", y="e_form_ppd_2022_01_25") |
88 | 114 |
|
89 | 115 |
|
90 | 116 | # %%
|
|
94 | 120 |
|
95 | 121 | # %%
|
96 | 122 | out_path = f"{ROOT}/data/{today}-m3gnet-wbm-relax-results.json.gz"
|
97 |
| -df_m3gnet.drop(columns=["pd_entry"]).reset_index().to_json( |
98 |
| - out_path, default_handler=as_dict_handler |
99 |
| -) |
| 123 | +df_m3gnet.reset_index().to_json(out_path, default_handler=as_dict_handler) |
100 | 124 |
|
101 | 125 |
|
102 | 126 | # %%
|
103 | 127 | ax_hull_dist_hist = hist_classify_stable_as_func_of_hull_dist(
|
104 |
| - formation_energy_targets=df_m3gnet.e_form_wbm, |
| 128 | + formation_energy_targets=df_m3gnet.e_form_ppd, |
105 | 129 | formation_energy_preds=df_m3gnet.e_form_m3gnet,
|
106 | 130 | e_above_hull_vals=df_m3gnet.e_above_mp_hull,
|
107 | 131 | )
|
|
0 commit comments