|
79 | 79 | # %%
|
80 | 80 | json_paths = sorted(glob(f"{module_dir}/raw/wbm-structures-step-*.json.bz2"))
|
81 | 81 | step_lens = (61848, 52800, 79205, 40328, 23308)
|
82 |
| -# step 3 has 79,211 structures but only 79,205 ComputedStructureEntries |
| 82 | +# step 3 has 79,211 initial structures but only 79,205 ComputedStructureEntries |
83 | 83 | # i.e. 6 extra structures which have missing energy, volume, etc. in the summary file
|
84 | 84 | bad_struct_ids = (70802, 70803, 70825, 70826, 70828, 70829)
|
| 85 | +# step 5 has 2 missing initial structures: 23166, 23294 |
85 | 86 |
|
86 | 87 |
|
87 | 88 | assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files"
|
@@ -229,6 +230,14 @@ def increment_wbm_material_id(wbm_id: str) -> str:
|
229 | 230 | ).value_counts().to_dict() == {"GGA": 248481, "GGA+U": 9008}
|
230 | 231 |
|
231 | 232 |
|
| 233 | +# drop two materials with missing initial structures |
| 234 | +assert list(df_wbm.query("initial_structure.isna()").index) == [ |
| 235 | + "wbm-step-5-23166", |
| 236 | + "wbm-step-5-23294", |
| 237 | +] |
| 238 | +df_wbm = df_wbm.dropna(subset=["initial_structure"]) |
| 239 | + |
| 240 | + |
232 | 241 | # %% get composition from CSEs
|
233 | 242 | df_wbm["composition_from_cse"] = [
|
234 | 243 | ComputedStructureEntry.from_dict(cse).composition
|
@@ -273,12 +282,13 @@ def increment_wbm_material_id(wbm_id: str) -> str:
|
273 | 282 | x.alphabetical_formula for x in df_wbm.pop("composition_from_cse")
|
274 | 283 | ]
|
275 | 284 |
|
276 |
| -for key, col_name in ( |
277 |
| - ("cses", "computed_structure_entry"), |
278 |
| - ("init-structs", "initial_structure"), |
| 285 | +for fname, cols in ( |
| 286 | + ("cses", ["computed_structure_entry"]), |
| 287 | + ("init-structs", ["initial_structure"]), |
| 288 | + ("cses+init-structs", ["initial_structure", "computed_structure_entry"]), |
279 | 289 | ):
|
280 |
| - cols = ["initial_structure", "formula_from_cse", col_name] |
281 |
| - df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{key}.json.bz2") |
| 290 | + cols = ["formula_from_cse", *cols] |
| 291 | + df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2") |
282 | 292 |
|
283 | 293 |
|
284 | 294 | # %%
|
@@ -486,26 +496,32 @@ def increment_wbm_material_id(wbm_id: str) -> str:
|
486 | 496 | f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2"
|
487 | 497 | ).set_index("material_id")
|
488 | 498 |
|
489 |
| -df_init_struct = pd.read_json( |
490 |
| - f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2" |
491 |
| -).set_index("material_id") |
492 |
| - |
493 | 499 | df_wbm["cse"] = [
|
494 | 500 | ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
|
495 | 501 | ]
|
496 | 502 |
|
497 | 503 |
|
498 | 504 | # %%
|
499 |
| -df_wbm["init_struct"] = [ |
500 |
| - Structure.from_dict(x) if x else None for x in tqdm(df_wbm.initial_structure) |
501 |
| -] |
| 505 | +df_init_struct = pd.read_json( |
| 506 | + f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2" |
| 507 | +).set_index("material_id") |
502 | 508 |
|
503 | 509 | wyckoff_col = "wyckoff_spglib"
|
504 |
| -for idx, struct in tqdm(df_wbm.init_struct.items(), total=len(df_wbm)): |
505 |
| - if struct is None: |
| 510 | +if wyckoff_col not in df_init_struct: |
| 511 | + df_init_struct[wyckoff_col] = None |
| 512 | + |
| 513 | +for idx, struct in tqdm( |
| 514 | + df_init_struct.initial_structure.items(), total=len(df_init_struct) |
| 515 | +): |
| 516 | + if not pd.isna(df_summary.at[idx, wyckoff_col]): |
506 | 517 | continue
|
507 |
| - if not df_wbm.at[idx, wyckoff_col]: |
508 |
| - df_wbm.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct) |
| 518 | + try: |
| 519 | + struct = Structure.from_dict(struct) |
| 520 | + df_summary.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct) |
| 521 | + except Exception as exc: |
| 522 | + print(f"{idx=} {exc=}") |
| 523 | + |
| 524 | +assert df_summary[wyckoff_col].isna().sum() == 0 |
509 | 525 |
|
510 | 526 |
|
511 | 527 | # %% make sure material IDs within each step are consecutive
|
|
0 commit comments