8
8
from glob import glob
9
9
10
10
import pandas as pd
11
+ from aviary .wren .utils import get_aflow_label_from_spglib
11
12
from pymatgen .analysis .phase_diagram import PatchedPhaseDiagram
12
- from pymatgen .core import Structure
13
+ from pymatgen .core import Composition , Structure
13
14
from pymatgen .entries .compatibility import (
14
15
MaterialsProject2020Compatibility as MP2020Compat ,
15
16
)
@@ -144,8 +145,10 @@ def increment_wbm_material_id(wbm_id: str) -> str:
144
145
print (f"bad { wbm_id = } " )
145
146
return wbm_id
146
147
147
- assert prefix == "step"
148
- assert step_num .isdigit () and material_num .isdigit ()
148
+ msg = f"bad { wbm_id = } , { prefix = } { step_num = } { material_num = } "
149
+ assert prefix == "step" , msg
150
+ assert step_num .isdigit (), msg
151
+ assert material_num .isdigit (), msg
149
152
150
153
return f"wbm-step-{ step_num } -{ int (material_num ) + 1 } "
151
154
@@ -266,7 +269,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
266
269
267
270
268
271
# %%
269
- df_wbm ["formula_from_cse" ] = [x .formula for x in df_wbm .pop ("composition_from_cse" )]
272
+ df_wbm ["formula_from_cse" ] = [
273
+ x .alphabetical_formula for x in df_wbm .pop ("composition_from_cse" )
274
+ ]
270
275
df_wbm [["initial_structure" , "computed_structure_entry" , "formula_from_cse" ]].to_json (
271
276
f"{ module_dir } /{ today } -wbm-cses+init-structs.json.bz2"
272
277
)
@@ -278,8 +283,8 @@ def increment_wbm_material_id(wbm_id: str) -> str:
278
283
"nsites" : "n_sites" ,
279
284
"vol" : "volume" ,
280
285
"e" : "uncorrected_energy" ,
281
- "e_form" : "e_form_per_atom " ,
282
- "e_hull" : "e_hull " ,
286
+ "e_form" : "e_form_per_atom_wbm " ,
287
+ "e_hull" : "e_hull_wbm " ,
283
288
"gap" : "bandgap_pbe" ,
284
289
"id" : "material_id" ,
285
290
}
@@ -313,11 +318,32 @@ def increment_wbm_material_id(wbm_id: str) -> str:
313
318
df_summary .index = df_summary .index .map (increment_wbm_material_id )
314
319
assert sum (df_summary .index != df_wbm .index ) == 0
315
320
321
+ # sort formulas alphabetically
322
+ df_summary ["alph_formula" ] = [
323
+ Composition (x ).alphabetical_formula for x in df_summary .formula
324
+ ]
325
+ assert sum (df_summary .alph_formula != df_summary .formula ) == 219_215
326
+ assert df_summary .alph_formula [3 ] == "Ag2 Au1 Hg1"
327
+ assert df_summary .formula [3 ] == "Ag2 Hg1 Au1"
328
+
329
+ df_summary ["formula" ] = df_summary .pop ("alph_formula" )
330
+
331
+
332
+ # %%
333
+ # check summary and CSE formulas agree
334
+ assert all (df_summary ["formula" ] == df_wbm .formula_from_cse )
335
+
336
+
316
337
# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
317
338
df_summary .at ["wbm-step-2-18689" , "uncorrected_energy" ] = df_wbm .loc [
318
339
"wbm-step-2-18689"
319
340
].computed_structure_entry ["energy" ]
320
341
342
+ # NOTE careful with ComputedEntries as object vs as dicts, the meaning of keys changes:
343
+ # cse.energy == cse.uncorrected_energy + cse.correction
344
+ # whereas
345
+ # cse.as_dict()["energy"] == cse.uncorrected_energy
346
+
321
347
322
348
# %% scatter plot summary energies vs CSE energies
323
349
df_summary ["uncorrected_energy_from_cse" ] = [
@@ -355,17 +381,24 @@ def increment_wbm_material_id(wbm_id: str) -> str:
355
381
assert n_corrected == 100931 , f"{ n_corrected = } "
356
382
357
383
corr_label = "mp2020" if isinstance (mp_compat , MP2020Compat ) else "legacy"
358
- df_summary [f"e_correction_ { corr_label } " ] = [
359
- cse .energy - cse . uncorrected_energy for cse in df_wbm .cse
384
+ df_summary [f"e_correction_per_atom_ { corr_label } " ] = [
385
+ cse .correction_per_atom for cse in df_wbm .cse
360
386
]
361
387
362
- assert df_summary .e_correction_mp2020 .mean ().round (4 ) == - 0.9979
363
- assert df_summary .e_correction_legacy .mean ().round (4 ) == - 0.0643
364
- assert (df_summary .filter (like = "corrections " ).abs () > 1e-4 ).sum ().to_dict () == {
365
- "e_correction_mp2020 " : 100931 ,
366
- "e_correction_legacy " : 39595 ,
367
- }
388
+ assert df_summary .e_correction_per_atom_mp2020 .mean ().round (4 ) == - 0.1067
389
+ assert df_summary .e_correction_per_atom_legacy .mean ().round (4 ) == - 0.0643
390
+ assert (df_summary .filter (like = "correction " ).abs () > 1e-4 ).sum ().to_dict () == {
391
+ "e_correction_per_atom_mp2020 " : 100931 ,
392
+ "e_correction_per_atom_legacy " : 39595 ,
393
+ }, "unexpected number of materials received non-zero corrections"
368
394
395
+ ax = density_scatter (
396
+ df_summary .e_correction_per_atom_legacy ,
397
+ df_summary .e_correction_per_atom_mp2020 ,
398
+ xlabel = "legacy corrections (eV / atom)" ,
399
+ ylabel = "MP2020 corrections (eV / atom)" ,
400
+ )
401
+ # ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
369
402
370
403
# mp_compat.process_entry(cse) for CSE with id wbm-step-1-24459 causes Jupyter kernel to
371
404
# crash reason unknown, still occurs even after updating deps like pymatgen, numpy,
@@ -382,65 +415,64 @@ def increment_wbm_material_id(wbm_id: str) -> str:
382
415
383
416
384
417
# %%
385
- with gzip .open (f"{ module_dir } /2022-10-13-rhys/ppd-mp.pkl.gz" , "rb" ) as zip_file :
386
- ppd_rhys : PatchedPhaseDiagram = pickle .load (zip_file )
387
-
388
-
389
418
with gzip .open (f"{ ROOT } /data/2022-09-18-ppd-mp.pkl.gz" , "rb" ) as zip_file :
390
- ppd_mp = pickle .load (zip_file )
419
+ ppd_mp : PatchedPhaseDiagram = pickle .load (zip_file )
391
420
392
421
393
- # %%
422
+ # %% calculate e_above_hull for each material
394
423
# this loop needs the warnings filter above to not crash Jupyter kernel with logs
395
424
# takes ~20 min at 200 it/s for 250k entries in WBM
425
+ e_above_hull_key = "e_above_hull_uncorrected_ppd_mp"
426
+ assert e_above_hull_key not in df_summary
427
+
396
428
for entry in tqdm (df_wbm .cse ):
397
429
assert entry .entry_id .startswith ("wbm-step-" )
398
- corr_label = "mp2020_" if isinstance (mp_compat , MP2020Compat ) else "legacy_"
399
- # corr_label = "un"
400
- at_idx = entry .entry_id , f"e_above_hull_{ corr_label } corrected_ppd_mp"
401
430
402
- if at_idx not in df_summary or pd .isna (df_summary .at [at_idx ]):
403
- # use entry.(uncorrected_)energy_per_atom
404
- e_above_hull = (
405
- entry .corrected_energy_per_atom
406
- - ppd_mp .get_hull_energy_per_atom (entry .composition )
407
- )
408
- df_summary .at [at_idx ] = e_above_hull
431
+ e_per_atom = entry .uncorrected_energy_per_atom
432
+ e_hull_per_atom = ppd_mp .get_hull_energy_per_atom (entry .composition )
433
+ e_above_hull = e_per_atom - e_hull_per_atom
434
+
435
+ df_summary .at [entry .entry_id , e_above_hull_key ] = e_above_hull
409
436
410
437
411
- # %% compute formation energies
438
+ # add old + new MP energy corrections to above hull energies
439
+ for corrections in ("mp2020" , "legacy" ):
440
+ df_summary [e_above_hull_key .replace ("un" , f"{ corrections } _" )] = (
441
+ df_summary [e_above_hull_key ]
442
+ + df_summary [f"e_correction_per_atom_{ corrections } " ]
443
+ )
444
+
445
+
446
+ # %% calculate formation energies from CSEs wrt MP elemental reference energies
412
447
# first make sure source and target dfs have matching indices
413
448
assert sum (df_wbm .index != df_summary .index ) == 0
414
449
415
- e_form_key = "e_form_per_atom_uncorrected_ppd_mp_rhys"
416
- for mat_id , cse in tqdm (df_wbm .cse .items (), total = len (df_wbm )):
417
- assert mat_id == cse .entry_id , f"{ mat_id = } { cse .entry_id = } "
418
- assert mat_id in df_summary .index , f"{ mat_id = } not in df_summary"
419
- df_summary .at [cse .entry_id , e_form_key ] = ppd_rhys .get_form_energy_per_atom (cse )
450
+ e_form_key = "e_form_per_atom_uncorrected_mp_refs"
451
+ assert e_form_key not in df_summary
420
452
421
- assert len (df_summary ) == sum (step_lens )
453
+ for row in tqdm (df_wbm .itertuples (), total = len (df_wbm )):
454
+ mat_id , cse , formula = row .Index , row .cse , row .formula_from_cse
455
+ assert mat_id == cse .entry_id , f"{ mat_id = } != { cse .entry_id = } "
456
+ assert mat_id in df_summary .index , f"{ mat_id = } not in df_summary"
422
457
423
- df_summary [ "e_form_per_atom_legacy_corrected_ppd_mp_rhys" ] = (
424
- df_summary [ e_form_key ] + df_summary . e_correction_legacy
425
- )
458
+ entry_like = dict ( composition = formula , energy = cse . uncorrected_energy )
459
+ e_form = get_e_form_per_atom ( entry_like )
460
+ e_form_ppd = ppd_mp . get_form_energy_per_atom ( cse )
426
461
462
+ # make sure the PPD and functional method of calculating formation energy agree
463
+ assert abs (e_form - e_form_ppd ) < 1e-7 , f"{ e_form = } != { e_form_ppd = } "
464
+ df_summary .at [cse .entry_id , e_form_key ] = e_form
427
465
428
- # %% calculate formation energies from CSEs wrt MP elemental reference energies
429
- df_summary ["e_form_per_atom_uncorrected" ] = [
430
- get_e_form_per_atom (dict (composition = row .formula , energy = row .uncorrected_energy ))
431
- for row in tqdm (df_summary .itertuples (), total = len (df_summary ))
432
- ]
466
+ assert len (df_summary ) == sum (
467
+ step_lens
468
+ ), f"rows were added: { len (df_summary )= } { sum (step_lens )= } "
433
469
434
470
435
- # %% MP2020 corrections are much larger than legacy corrections
436
- ax = density_scatter (
437
- df_summary .e_correction_legacy / df_summary .n_sites ,
438
- df_summary .e_correction_mp2020 / df_summary .n_sites ,
439
- xlabel = "legacy corrections (eV / atom)" ,
440
- ylabel = "MP2020 corrections (eV / atom)" ,
441
- )
442
- ax .axis ("equal" )
443
- # ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
471
+ # add old + new MP energy corrections to formation energies
472
+ for corrections in ("mp2020" , "legacy" ):
473
+ df_summary [e_form_key .replace ("un" , f"{ corrections } _" )] = (
474
+ df_summary [e_form_key ] + df_summary [f"e_correction_per_atom_{ corrections } " ]
475
+ )
444
476
445
477
446
478
# %%
@@ -457,3 +489,22 @@ def increment_wbm_material_id(wbm_id: str) -> str:
457
489
df_wbm ["cse" ] = [
458
490
ComputedStructureEntry .from_dict (x ) for x in tqdm (df_wbm .computed_structure_entry )
459
491
]
492
+
493
+ df_wbm ["init_struct" ] = df_wbm ["wyckoff" ] = float ("nan" )
494
+ for idx , dct in tqdm (df_wbm .initial_structure .items (), total = len (df_wbm )):
495
+ if not df_wbm [idx , "init_struct" ]:
496
+ df_wbm .at [idx , "init_struct" ] = struct = Structure .from_dict (dct )
497
+ if not df_wbm [idx , "wyckoff" ]:
498
+ df_wbm .at [idx , "wyckoff" ] = get_aflow_label_from_spglib (struct )
499
+
500
+
501
+ # %% make sure material IDs within each step are consecutive
502
+ for step in range (1 , 6 ):
503
+ df = df_summary [df_summary .index .str .startswith (f"wbm-step-{ step } -" )]
504
+ step_len = step_lens [step - 1 ]
505
+ assert len (df ) == step_len , f"{ step = } has { len (df )= } , expected { step_len = } "
506
+
507
+ step_counts = list (df .index .str .split ("-" ).str [- 1 ].astype (int ))
508
+ assert step_counts == list (
509
+ range (1 , step_len + 1 )
510
+ ), f"{ step = } counts not consecutive"
0 commit comments