        # re-index after dropping bad structures to get same indices as summary file
        # where IDs are consecutive, i.e. step_3_70801 is followed by step_3_70802,
        # not step_3_70804, etc.
-        df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]
+        # df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]

    step_len = step_lens[step - 1]
    assert len(df) == step_len, f"bad len for {step=}: {len(df)} != {step_len}"
    dfs_wbm_structs[step] = df
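The renumbering that the now commented-out line would perform is easiest to see on a toy index. A minimal sketch with made-up data (not part of the original script):

    import pandas as pd

    # stand-in for the step-3 DataFrame after dropping one bad structure
    df = pd.DataFrame({"opt": [1, 2, 3]}, index=["step_3_1", "step_3_2", "step_3_4"])
    df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]  # renumber consecutively
    assert list(df.index) == ["step_3_1", "step_3_2", "step_3_3"]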


-# NOTE step 5 is missing 2 initial structures
+# NOTE step 5 is missing 2 initial structures, see nan_init_structs_ids below
assert dict(dfs_wbm_structs[5].isna().sum()) == {"opt": 0, "org": 2}
assert list(dfs_wbm_structs[5].query("org.isna()").index) == [
    "step_5_23165",
@@ -227,13 +227,11 @@ def increment_wbm_material_id(wbm_id: str) -> str:
    cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
).value_counts().to_dict() == {"GGA": 248481, "GGA+U": 9008}

-
-# drop two materials with missing initial structures
-assert list(df_wbm.query("initial_structure.isna()").index) == [
-    "wbm-5-23166",
-    "wbm-5-23294",
-]
-df_wbm = df_wbm.dropna(subset=["initial_structure"])
+# make sure only 2 materials have missing initial structures with expected IDs
+nan_init_structs_ids = ["wbm-5-23166", "wbm-5-23294"]
+assert list(df_wbm.query("initial_structure.isna()").index) == nan_init_structs_ids
+# drop the two materials with missing initial structures
+df_wbm = df_wbm.drop(index=nan_init_structs_ids)
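The assert-before-drop idiom above fails loudly if the set of incomplete rows ever changes between dataset revisions. The same pattern on a toy frame (hypothetical IDs and data):

    import pandas as pd

    df = pd.DataFrame(
        {"initial_structure": ["s1", None, "s2"]},
        index=["wbm-5-1", "wbm-5-23166", "wbm-5-2"],
    )
    expected_nan_ids = ["wbm-5-23166"]
    # pin down the exact missing IDs before dropping so data loss is never silent
    assert list(df[df.initial_structure.isna()].index) == expected_nan_ids
    df = df.drop(index=expected_nan_ids)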


# %% get composition from CSEs
@@ -275,22 +273,12 @@ def increment_wbm_material_id(wbm_id: str) -> str:
    ), f"composition mismatch for {row.Index=}"


-# %%
+# %% extract alphabetical formula from CSEs (will be used as ground-truth formulas
+# since they're more informative than the reduced formulas found in df_summary)
df_wbm["formula_from_cse"] = [
    x.alphabetical_formula for x in df_wbm.pop("composition_from_cse")
]

-for fname, cols in (
-    ("computed-structure-entries", ["computed_structure_entry"]),
-    ("init-structs", ["initial_structure"]),
-    (
-        "computed-structure-entries+init-structs",
-        ["initial_structure", "computed_structure_entry"],
-    ),
-):
-    cols = ["formula_from_cse", *cols]
-    df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
-

# %%
col_map = {
@@ -322,31 +310,89 @@ def increment_wbm_material_id(wbm_id: str) -> str:
)


+assert sum(df_summary.index == "None") == 6
+# the 'None' materials have 0 volume, energy, n_sites, bandgap, etc.
+assert all(df_summary[df_summary.index == "None"].drop(columns=["formula"]) == 0)
+assert len(df_summary.query("volume > 0")) == len(df_wbm) + len(nan_init_structs_ids)
# make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
# listed in bad_struct_ids above
-assert len(df_summary.query("volume > 0")) == len(df_wbm)
assert all(
    df_summary.reset_index().query("volume == 0").index.values - sum(step_lens[:2])
    == bad_struct_ids
)
-df_summary = df_summary.query("volume > 0")
-df_summary.index = df_summary.index.map(increment_wbm_material_id)
+
+df_summary.index = df_summary.index.map(increment_wbm_material_id)  # format IDs
+# drop materials with id='None' and missing initial structures
+df_summary = df_summary.drop(index=nan_init_structs_ids + ["None"])
+
+# the 8403 material IDs in step 3 whose final number exceeds the smallest one in
+# bad_struct_ids are now misaligned between df_summary and df_wbm: the IDs in
+# df_summary are consecutive while the IDs in df_wbm skip over the numbers in
+# bad_struct_ids. we fix this with fix_bad_struct_index_mismatch() below by mapping
+# the IDs in df_wbm to the ones in df_summary so that both indices become consecutive.
+assert sum(df_summary.index != df_wbm.index) == 8403
+
+
+def fix_bad_struct_index_mismatch(material_id: str) -> str:
+    """Decrement material IDs in step 3 by the number of IDs with smaller final number
+    in bad_struct_ids. This should fix the index mismatch between df_summary and df_wbm.
+    """
+    _, step_num, mat_num = material_id.split("-")
+    step_num, mat_num = int(step_num), int(mat_num)
+
+    if step_num == 3:
+        mat_num -= sum(mat_num > idx + 1 for idx in bad_struct_ids)
+
+    return f"wbm-{step_num}-{mat_num}"
+
+
+# don't accidentally apply the fix twice
+if sum(df_summary.index != df_wbm.index) != 0:
+    df_wbm.index = df_wbm.index.map(fix_bad_struct_index_mismatch)
+
+# check that the index mismatch is fixed
assert sum(df_summary.index != df_wbm.index) == 0
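To make the decrement logic concrete, here is a worked example with hypothetical bad_struct_ids (the real list is defined earlier in the script):

    bad_struct_ids = [4, 9]  # toy values, i.e. dropped IDs step_3_5 and step_3_10

    def fix_bad_struct_index_mismatch(material_id: str) -> str:
        _, step_num, mat_num = material_id.split("-")
        step_num, mat_num = int(step_num), int(mat_num)
        if step_num == 3:
            # subtract how many dropped IDs precede this one
            mat_num -= sum(mat_num > idx + 1 for idx in bad_struct_ids)
        return f"wbm-{step_num}-{mat_num}"

    assert fix_bad_struct_index_mismatch("wbm-3-7") == "wbm-3-6"  # 7 > 5 only
    assert fix_bad_struct_index_mismatch("wbm-3-12") == "wbm-3-10"  # 12 > 5 and 12 > 10
    assert fix_bad_struct_index_mismatch("wbm-2-12") == "wbm-2-12"  # other steps untouched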

+# update ComputedStructureEntry entry_ids to match material_ids
+for mat_id, cse in df_wbm.computed_structure_entry.items():
+    entry_id = cse["entry_id"]
+    if mat_id != entry_id:
+        print(f"{mat_id=} != {entry_id=}")
+        cse["entry_id"] = mat_id
+
+
# sort formulas alphabetically
df_summary["alph_formula"] = [
    Composition(x).alphabetical_formula for x in df_summary.formula
]
-assert sum(df_summary.alph_formula != df_summary.formula) == 219_215
-assert df_summary.alph_formula[3] == "Ag2 Au1 Hg1"
-assert df_summary.formula[3] == "Ag2 Hg1 Au1"
+# alphabetical formula and original formula differ due to spaces, the number 1 after
+# element symbols (FeO vs Fe1 O1), and element order (FeO vs OFe)
+assert sum(df_summary.alph_formula != df_summary.formula) == 257_483

df_summary["formula"] = df_summary.pop("alph_formula")


+# %% write initial structures and computed structure entries to compressed json
+for fname, cols in (
+    ("computed-structure-entries", ["computed_structure_entry"]),
+    ("init-structs", ["initial_structure"]),
+    (
+        "computed-structure-entries+init-structs",
+        ["initial_structure", "computed_structure_entry"],
+    ),
+):
+    cols = ["formula_from_cse", *cols]
+    df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
+
+
# %%
-# check summary and CSE formulas agree
-assert all(df_summary["formula"] == df_wbm.formula_from_cse)
+# df_summary and df_wbm formulas differ because summary formulas are reduced while
+# df_wbm formulas are not (e.g. Ac3 U1 in the summary vs Ac6 U2 in df_wbm). the
+# unreduced formula is more informative so we use it.
+assert sum(df_summary.formula != df_wbm.formula_from_cse) == 114_273
+assert sum(df_summary.formula == df_wbm.formula_from_cse) == 143_214
+
+df_summary.formula = df_wbm.formula_from_cse


# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
@@ -418,34 +464,37 @@ def increment_wbm_material_id(wbm_id: str) -> str:


# %%
+for mat_id, cse in df_wbm.computed_structure_entry.items():
+    assert mat_id == cse["entry_id"], f"{mat_id} != {cse['entry_id']}"
+
+df_wbm["cse"] = [
+    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
+]
# raw WBM ComputedStructureEntries have no energy corrections applied:
assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse)
# summary and CSE n_sites match
assert all(df_summary.n_sites == [len(cse.structure) for cse in df_wbm.cse])

+for mp_compat in [MPLegacyCompat(), MP2020Compat()]:
+    compat_out = mp_compat.process_entries(df_wbm.cse, clean=True, verbose=True)
+    assert len(compat_out) == len(df_wbm) == len(df_summary)

-mp_compat = MP2020Compat() if False else MPLegacyCompat()
-compat_out = mp_compat.process_entries(df_wbm.cse, clean=True, verbose=True)
-
-mp_compat.process_entry(cse)
-assert len(compat_out) == len(df_wbm) == len(df_summary)
-
-n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse)
-if isinstance(mp_compat, MPLegacyCompat):
-    assert n_corrected == 39595, f"{n_corrected=}"
-if isinstance(mp_compat, MP2020Compat):
-    assert n_corrected == 100931, f"{n_corrected=}"
+    n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse)
+    if isinstance(mp_compat, MPLegacyCompat):
+        assert n_corrected == 39591, f"{n_corrected=}"
+    if isinstance(mp_compat, MP2020Compat):
+        assert n_corrected == 100930, f"{n_corrected=}"

-corr_label = "mp2020" if isinstance(mp_compat, MP2020Compat) else "legacy"
-df_summary[f"e_correction_per_atom_{corr_label}"] = [
-    cse.correction_per_atom for cse in df_wbm.cse
-]
+    corr_label = "mp2020" if isinstance(mp_compat, MP2020Compat) else "legacy"
+    df_summary[f"e_correction_per_atom_{corr_label}"] = [
+        cse.correction_per_atom for cse in df_wbm.cse
+    ]

-assert df_summary.e_correction_per_atom_mp2020.mean().round(4) == -0.1067
-assert df_summary.e_correction_per_atom_legacy.mean().round(4) == -0.0643
+assert df_summary.e_correction_per_atom_mp2020.mean().round(4) == -0.1069
+assert df_summary.e_correction_per_atom_legacy.mean().round(4) == -0.0645

assert (df_summary.filter(like="correction").abs() > 1e-4).sum().to_dict() == {
-    "e_correction_per_atom_mp2020": 100931,
-    "e_correction_per_atom_legacy": 39595,
+    "e_correction_per_atom_mp2020": 100930,
+    "e_correction_per_atom_legacy": 39591,
}, "unexpected number of materials received non-zero corrections"

ax = density_scatter(
@@ -458,7 +507,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:


# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to
-# https://github.com/spglib/spglib/issues/194 when using spglib v2.0.{0,1}
+# https://github.com/spglib/spglib/issues/194 when using spglib versions 2.0.0 or 2.0.1
+# left here as a reminder and for future users in case they encounter the same issue
cse = df_wbm.computed_structure_entry["wbm-1-24459"]
cse = ComputedStructureEntry.from_dict(cse)
mp_compat.process_entry(cse)
@@ -470,13 +520,14 @@ def increment_wbm_material_id(wbm_id: str) -> str:


# %% calculate e_above_hull for each material
-# this loop needs the warnings filter above to not crash Jupyter kernel with logs
+# this loop needs the warnings.filterwarnings() call above to avoid crashing the
+# Jupyter kernel with a flood of logs
# takes ~20 min at 200 it/s for 250k entries in WBM
e_above_hull_key = "e_above_hull_uncorrected_ppd_mp"
assert e_above_hull_key not in df_summary

-for entry in tqdm(df_wbm.cse):
-    assert entry.entry_id.startswith("wbm-")
+for mat_id, entry in tqdm(df_wbm.cse.items(), total=len(df_wbm)):
+    assert mat_id == entry.entry_id, f"{mat_id=} != {entry.entry_id=}"
+    assert entry.entry_id in df_summary.index, f"{entry.entry_id=} not in df_summary"

    e_per_atom = entry.uncorrected_energy_per_atom
    e_hull_per_atom = ppd_mp.get_hull_energy_per_atom(entry.composition)
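The hunk cuts off mid-loop here; presumably the body continues by storing the per-atom distance to the MP convex hull, along these lines (a sketch, not the original code):

        # e_above_hull = how far above the MP convex hull this entry sits (eV/atom)
        df_summary.at[entry.entry_id, e_above_hull_key] = e_per_atom - e_hull_per_atom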
@@ -497,8 +548,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
# first make sure source and target dfs have matching indices
assert sum(df_wbm.index != df_summary.index) == 0

-e_form_key = "e_form_per_atom_uncorrected_mp_refs"
-assert e_form_key not in df_summary
+e_form_col = "e_form_per_atom_uncorrected"
+assert e_form_col not in df_summary

for row in tqdm(df_wbm.itertuples(), total=len(df_wbm)):
    mat_id, cse, formula = row.Index, row.cse, row.formula_from_cse
@@ -509,40 +560,21 @@ def increment_wbm_material_id(wbm_id: str) -> str:
    e_form = get_e_form_per_atom(entry_like)
    e_form_ppd = ppd_mp.get_form_energy_per_atom(cse)

-    # make sure the PPD and functional method of calculating formation energy agree
-    assert abs(e_form - e_form_ppd) < 1e-7, f"{e_form=} != {e_form_ppd=}"
-    df_summary.at[cse.entry_id, e_form_key] = e_form
-
-assert len(df_summary) == sum(
-    step_lens
-), f"rows were added: {len(df_summary)=} {sum(step_lens)=}"
-
+    correction = cse.correction_per_atom
+    # make sure ppd_mp.get_form_energy_per_atom() and the standalone
+    # get_e_form_per_atom() ways of calculating formation energy agree
+    assert (
+        abs(e_form - (e_form_ppd - correction)) < 1e-7
+    ), f"{mat_id=}: {e_form=:.3} != {e_form_ppd - correction=:.3}"
+    df_summary.at[cse.entry_id, e_form_col] = e_form

# add old + new MP energy corrections to formation energies
for corrections in ("mp2020", "legacy"):
-    df_summary[e_form_key.replace("un", f"{corrections}_")] = (
-        df_summary[e_form_key] + df_summary[f"e_correction_per_atom_{corrections}"]
+    df_summary[e_form_col.replace("un", f"{corrections}_")] = (
+        df_summary[e_form_col] + df_summary[f"e_correction_per_atom_{corrections}"]
    )
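The str.replace call maps the uncorrected column name onto its corrected counterparts, which is worth spelling out once:

    e_form_col = "e_form_per_atom_uncorrected"
    assert e_form_col.replace("un", "mp2020_") == "e_form_per_atom_mp2020_corrected"
    assert e_form_col.replace("un", "legacy_") == "e_form_per_atom_legacy_corrected"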


-# %%
-df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
-
-df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
-    "material_id"
-)
-
-
-# %% read WBM dataset from disk
-df_wbm = pd.read_json(
-    f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
-).set_index("material_id")
-
-df_wbm["cse"] = [
-    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
-]
-
-
# %%
df_init_struct = pd.read_json(
    f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
@@ -566,13 +598,21 @@ def increment_wbm_material_id(wbm_id: str) -> str:
assert df_summary[wyckoff_col].isna().sum() == 0


-# %% make sure material IDs within each step are consecutive
-for step in range(1, 6):
-    df = df_summary[df_summary.index.str.startswith(f"wbm-{step}-")]
-    step_len = step_lens[step - 1]
-    assert len(df) == step_len, f"{step=} has {len(df)=}, expected {step_len=}"
+# %% write final summary data to disk (yeah!)
+df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
+
+
+# %% read summary data from disk
+df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
+    "material_id"
+)
+
+
+# %% read WBM initial structures and computed structure entries from disk
+df_wbm = pd.read_json(
+    f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
+).set_index("material_id")

-    step_counts = list(df.index.str.split("-").str[-1].astype(int))
-    assert step_counts == list(
-        range(1, step_len + 1)
-    ), f"{step=} counts not consecutive"
+df_wbm["cse"] = [
+    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
+]
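ComputedStructureEntry is MSONable, so the dicts stored in the compressed JSON rehydrate losslessly. A quick round-trip check on the objects loaded above (a sketch, not from the original script):

    first_cse = df_wbm.cse.iloc[0]
    round_tripped = ComputedStructureEntry.from_dict(first_cse.as_dict())
    assert round_tripped.energy == first_cse.energy
    assert round_tripped.composition == first_cse.composition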