File tree 6 files changed +76
-8
lines changed
6 files changed +76
-8
lines changed Original file line number Diff line number Diff line change @@ -24,6 +24,6 @@ models/**/*.csv
24
24
# temporary ignore rule
25
25
paper
26
26
meeting-notes
27
- models /voronoi /*
28
- ! models /voronoi /* .py
27
+ models /voronoi /* .ipynb
28
+ models /voronoi /* .zip
29
29
pretrained
Original file line number Diff line number Diff line change 67
67
68
68
# %%
69
69
data_path = f"{ ROOT } /data/wbm/2022-10-19-wbm-cses+init-structs.json.bz2"
70
- print (f"Loading from { data_path = } " )
70
+ print (f"{ data_path = } " )
71
71
df_wbm = pd .read_json (data_path ).set_index ("material_id" )
72
72
73
73
df_this_job : pd .DataFrame = np .array_split (df_wbm , slurm_array_task_count )[
Original file line number Diff line number Diff line change 53
53
54
54
# %%
55
55
data_path = f"{ ROOT } /data/wbm/2022-10-19-wbm-init-structs.json.bz2"
56
- print (f"Loading from { data_path = } " )
56
+ print (f"{ data_path = } " )
57
57
df_wbm_structs = pd .read_json (data_path ).set_index ("material_id" )
58
58
59
59
megnet_mp_e_form = load_model (model_name := "Eform_MP_2019" )
Original file line number Diff line number Diff line change 19
19
module_dir = os .path .dirname (__file__ )
20
20
21
21
22
- # data_path = f"{ROOT}/data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
23
- data_path = f"{ ROOT } /data/wbm/2022-10-19-wbm-init-structs.json.bz2"
22
+ data_path = f"{ ROOT } /data/mp/2022-09-16-mp-computed-structure-entries.json.gz"
23
+ # data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
24
24
input_col = "initial_structure"
25
25
data_name = "wbm" if "wbm" in data_path else "mp"
26
26
slurm_array_task_count = 10
31
31
job_name = job_name ,
32
32
partition = "icelake-himem" ,
33
33
account = "LEE-SL3-CPU" ,
34
- time = (slurm_max_job_time := "5 :0:0" ),
34
+ time = (slurm_max_job_time := "8 :0:0" ),
35
35
array = f"1-{ slurm_array_task_count } " ,
36
36
log_dir = log_dir ,
37
37
)
45
45
if os .path .isfile (out_path ):
46
46
raise SystemExit (f"{ out_path = } already exists, exciting early" )
47
47
48
+ print (f"{ data_path = } " )
48
49
df = pd .read_json (data_path ).set_index ("material_id" )
49
50
df_this_job : pd .DataFrame = np .array_split (df , slurm_array_task_count )[
50
51
slurm_array_task_id - 1
51
52
]
52
53
53
- if data_name == "mp" :
54
+ if data_name == "mp" : # extract structure dicts from ComputedStructureEntry
54
55
struct_dicts = [x ["structure" ] for x in df_this_job .entry ]
55
56
if data_name == "wbm" :
56
57
struct_dicts = df_this_job .initial_structure
Original file line number Diff line number Diff line change
1
+ # %%
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from datetime import datetime
6
+ from glob import glob
7
+
8
+ import pandas as pd
9
+ from tqdm import tqdm
10
+
11
+ __author__ = "Janosh Riebesell"
12
+ __date__ = "2022-08-16"
13
+
14
+ today = f"{ datetime .now ():%Y-%m-%d} "
15
+
16
+
17
+ # %%
18
+ module_dir = os .path .dirname (__file__ )
19
+ date = "2022-11-18"
20
+ glob_pattern = f"{ date } -voronoi-features-wbm/voronoi-features-wbm-*.csv.bz2"
21
+ file_paths = sorted (glob (f"{ module_dir } /{ glob_pattern } " ))
22
+ print (f"Found { len (file_paths ):,} files for { glob_pattern = } " )
23
+
24
+ dfs : dict [str , pd .DataFrame ] = {}
25
+
26
+
27
+ # %%
28
+ # 2022-08-16 tried multiprocessing.Pool() to load files in parallel but was somehow
29
+ # slower than serial loading
30
+ for file_path in tqdm (file_paths ):
31
+ if file_path in dfs :
32
+ continue
33
+ try :
34
+ # keep whole dataframe in memory
35
+ df = pd .read_csv (file_path ).set_index ("material_id" )
36
+ dfs [file_path ] = df
37
+ except FileNotFoundError :
38
+ print (f"{ file_path = } not found" )
39
+ continue
40
+
41
+
42
+ # %%
43
+ df_features = pd .concat (dfs .values ())
44
+
45
+ assert df_features .isna ().sum ().max () <= 18
46
+
47
+
48
+ # %%
49
+ out_path = f"{ module_dir } /{ date } -voronoi-features-wbm.csv.bz2"
50
+ df_features .to_csv (out_path )
Original file line number Diff line number Diff line change
1
+ # Voronoi Tessellation with matminer featurization piped into ` scikit-learn ` Random Forest
2
+
3
+ ## OOM errors during featurization
4
+
5
+ ` multiprocessing ` seems to be the cause of out-of-memory errors on large structures. Initially couldn't get the ` matminer ` ` MultipleFeaturizer ` to run without crashing even when running on small subsets of the data (1%) and setting ` sbatch ` flag ` --mem 100G ` :
6
+
7
+ ``` log
8
+ MultipleFeaturizer: 28%|██▊ | 724/2575 [01:08<04:15, 7.25it/s]/var/spool/slurm/slurmd/job7401930/slurm_script: line 4: 2625851 Killed python
9
+ slurmstepd: error: Detected 52 oom-kill event(s) in StepId=7401930.batch cgroup. Some of your processes may have been killed by the cgroup out-of-memory handler.
10
+ 4:00
11
+ ```
12
+
13
+ The tip that fixed this came from [Alex Dunn via Slack](https://berkeleytheory.slack.com/archives/D03ULSTNRMX/p1668746161675349): call `featurizer.set_n_jobs(1)` to disable multiprocessing during featurization.
14
+
15
+ ## Archive
16
+
17
+ Files in `2022-10-04-rhys-voronoi.zip` were received from Rhys via [Slack](https://ml-physics.slack.com/archives/DD8GBBRLN/p1664929946687049). They are all unmodified originals, archived before any changes were made for this project.
You can’t perform that action at this time.
0 commit comments