|
15 | 15 | from tqdm import tqdm
|
16 | 16 |
|
17 | 17 | from mb_discovery import ROOT, as_dict_handler
|
| 18 | +from mb_discovery.slurm import slurm_submit_python |
18 | 19 |
|
19 |
| -""" |
20 |
| -To slurm submit this file, use |
21 |
| -
|
22 |
| -```sh |
23 |
| -log_dir=models/bowsr/$(date +"%Y-%m-%d")-bowsr-megnet-wbm |
24 |
| -job_name=bowsr-megnet-wbm-IS2RE |
25 |
| -mkdir -p $log_dir # slurm fails if log_dir is missing |
26 |
| -
|
27 |
| -sbatch --partition icelake-himem --account LEE-SL3-CPU --array 1-500 \ |
28 |
| - --time 12:0:0 --job-name $job_name --mem 12000 \ |
29 |
| - --output $log_dir/slurm-%A-%a.out \ |
30 |
| - --wrap "TF_CPP_MIN_LOG_LEVEL=2 python models/bowsr/slurm_array_bowsr_wbm.py" |
31 |
| -``` |
| 20 | +__author__ = "Janosh Riebesell" |
| 21 | +__date__ = "2022-08-15" |
32 | 22 |
|
33 |
| ---time 2h is probably enough but missing indices are annoying so best be safe. |
34 |
| ---mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s) |
35 |
| - Some of your processes may have been killed by the cgroup out-of-memory handler. |
| 23 | +""" |
| 24 | +To slurm submit this file, run: |
36 | 25 |
|
37 |
| -TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed |
38 |
| -https://stackoverflow.com/a/40982782 |
| 26 | +python path/to/file.py slurm-submit |
39 | 27 |
|
40 | 28 | Requires MEGNet and MAML installation: pip install megnet maml
|
41 | 29 | """
|
42 | 30 |
|
43 |
| -__author__ = "Janosh Riebesell" |
44 |
| -__date__ = "2022-08-15" |
45 |
| - |
| 31 | +task_type = "IS2RE" # "RS2RE" |
| 32 | +today = f"{datetime.now():%Y-%m-%d}" |
| 33 | +module_dir = os.path.dirname(__file__) |
| 34 | +# --mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s) |
| 35 | +# Some of your processes may have been killed by the cgroup out-of-memory handler. |
| 36 | +slurm_mem_per_node = 12000 |
| 37 | +# set large job array size for fast testing/debugging |
| 38 | +slurm_array_task_count = 500 |
| 39 | +out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}" |
46 | 40 |
|
47 |
| -task_type = "IS2RE" |
48 |
| -# task_type = "RS2RE" |
49 | 41 | data_path = f"{ROOT}/data/2022-06-26-wbm-cses-and-initial-structures.json.gz"
|
50 | 42 |
|
51 |
| -module_dir = os.path.dirname(__file__) |
| 43 | +slurm_submit_python( |
| 44 | + job_name=f"bowsr-megnet-wbm-{task_type}", |
| 45 | + log_dir=out_dir, |
| 46 | + time=(slurm_max_job_time := "3:0:0"), |
| 47 | + # --time 2h is probably enough but best be safe. |
| 48 | + array=f"1-{slurm_array_task_count}", |
| 49 | + slurm_flags=("--mem", str(slurm_mem_per_node)), |
| 50 | + partition="icelake-himem", |
| 51 | + # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed |
| 52 | + # https://stackoverflow.com/a/40982782 |
| 53 | + env_vars="TF_CPP_MIN_LOG_LEVEL=2", |
| 54 | +) |
| 55 | + |
| 56 | + |
| 57 | +# %% |
52 | 58 | slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
|
53 | 59 | slurm_array_task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
|
54 |
| -# set large fallback job array size for fast testing/debugging |
55 |
| -slurm_array_task_count = int(os.environ.get("SLURM_ARRAY_TASK_COUNT", 10_000)) |
| 60 | +out_path = f"{out_dir}/{slurm_array_task_id}.json.gz" |
56 | 61 |
|
57 | 62 | print(f"Job started running {datetime.now():%Y-%m-%d@%H-%M}")
|
58 | 63 | print(f"{slurm_job_id = }")
|
59 | 64 | print(f"{slurm_array_task_id = }")
|
| 65 | +print(f"{data_path = }") |
| 66 | +print(f"{out_path = }") |
60 | 67 | print(f"{version('maml') = }")
|
61 | 68 | print(f"{version('megnet') = }")
|
62 | 69 |
|
63 |
| -today = f"{datetime.now():%Y-%m-%d}" |
64 |
| -out_dir = f"{module_dir}/{today}-bowsr-megnet-wbm-{task_type}" |
65 |
| -json_out_path = f"{out_dir}/{slurm_array_task_id}.json.gz" |
66 | 70 |
|
67 |
| -if os.path.isfile(json_out_path): |
68 |
| - raise SystemExit(f"{json_out_path = } already exists, exciting early") |
| 71 | +if os.path.isfile(out_path): |
| 72 | + raise SystemExit(f"{out_path = } already exists, exiting early") |
69 | 73 |
|
70 | 74 |
|
71 | 75 | # %%
|
|
78 | 82 | optimize_kwargs = dict(n_init=100, n_iter=100, alpha=0.026**2)
|
79 | 83 |
|
80 | 84 | run_params = dict(
|
81 |
| - megnet_version=version("megnet"), |
82 |
| - maml_version=version("maml"), |
83 |
| - slurm_job_id=slurm_job_id, |
84 |
| - slurm_array_task_id=slurm_array_task_id, |
85 |
| - slurm_array_task_count=slurm_array_task_count, |
86 |
| - data_path=data_path, |
87 | 85 | bayes_optim_kwargs=bayes_optim_kwargs,
|
| 86 | + data_path=data_path, |
| 87 | + maml_version=version("maml"), |
| 88 | + megnet_version=version("megnet"), |
88 | 89 | optimize_kwargs=optimize_kwargs,
|
| 90 | + slurm_array_task_count=slurm_array_task_count, |
| 91 | + slurm_array_task_id=slurm_array_task_id, |
| 92 | + slurm_job_id=slurm_job_id, |
| 93 | + slurm_max_job_time=slurm_max_job_time, |
| 94 | + slurm_mem_per_node=slurm_mem_per_node, |
89 | 95 | task_type=task_type,
|
90 | 96 | )
|
91 | 97 | if wandb.run is None:
|
|
127 | 133 |
|
128 | 134 |
|
129 | 135 | for material_id, structure in tqdm(
|
130 |
| - structures.items(), desc="Main loop", total=len(structures) |
| 136 | + structures.items(), desc="Main loop", total=len(structures), disable=None |
131 | 137 | ):
|
132 | 138 | if material_id in relax_results:
|
133 | 139 | continue
|
|
154 | 160 | df_output = pd.DataFrame(relax_results).T
|
155 | 161 | df_output.index.name = "material_id"
|
156 | 162 |
|
157 |
| -df_output.reset_index().to_json(json_out_path, default_handler=as_dict_handler) |
| 163 | +df_output.reset_index().to_json(out_path, default_handler=as_dict_handler) |
158 | 164 |
|
159 |
| -wandb.log_artifact(json_out_path, type=f"bowsr-megnet-wbm-{task_type}") |
| 165 | +wandb.log_artifact(out_path, type=f"bowsr-megnet-wbm-{task_type}") |
0 commit comments