|
32 | 32 | # Some of your processes may have been killed by the cgroup out-of-memory handler.
|
33 | 33 | slurm_mem_per_node = 12000
|
34 | 34 | # set large job array size for fast testing/debugging
|
35 |
| -slurm_array_task_count = 500 |
| 35 | +slurm_array_task_count = 1000 |
| 36 | +slurm_max_parallel = 100 |
36 | 37 | timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
|
37 | 38 | today = timestamp.split("@")[0]
|
38 | 39 | energy_model = "megnet"
|
|
46 | 47 | log_dir=out_dir,
|
47 | 48 | partition="icelake-himem",
|
48 | 49 | account="LEE-SL3-CPU",
|
49 |
| - time=(slurm_max_job_time := "3:0:0"), |
| 50 | + time=(slurm_max_job_time := "12:0:0"), |
50 | 51 | # --time 2h is probably enough, but it is best to be safe.
|
51 |
| - array=f"1-{slurm_array_task_count}", |
| 52 | + array=f"1-{slurm_array_task_count}%{slurm_max_parallel}", |
52 | 53 | slurm_flags=("--mem", str(slurm_mem_per_node)),
|
53 | 54 | # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
|
54 | 55 | # https://stackoverflow.com/a/40982782
|
|
99 | 100 | optimize_kwargs=optimize_kwargs,
|
100 | 101 | task_type=task_type,
|
101 | 102 | slurm_max_job_time=slurm_max_job_time,
|
102 |
| - slurm_vars=slurm_vars, |
| 103 | + slurm_vars=slurm_vars | dict(slurm_max_parallel=slurm_max_parallel), |
103 | 104 | )
|
104 | 105 | if wandb.run is None:
|
105 | 106 | wandb.login()
|
|
135 | 136 | ):
|
136 | 137 | if material_id in relax_results:
|
137 | 138 | continue
|
138 |
| - bayes_optimizer = BayesianOptimizer( |
139 |
| - model=model, structure=structure, **bayes_optim_kwargs |
140 |
| - ) |
141 |
| - bayes_optimizer.set_bounds() |
142 |
| - # reason for devnull here: https://github.com/materialsvirtuallab/maml/issues/469 |
143 |
| - with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull): |
144 |
| - bayes_optimizer.optimize(**optimize_kwargs) |
145 |
| - |
146 |
| - structure_bowsr, energy_bowsr = bayes_optimizer.get_optimized_structure_and_energy() |
147 |
| - |
148 |
| - results = { |
149 |
| - f"e_form_per_atom_bowsr_{energy_model}": model.predict_energy(structure), |
150 |
| - "structure_bowsr": structure_bowsr, |
151 |
| - "energy_bowsr": energy_bowsr, |
152 |
| - } |
153 |
| - |
154 |
| - relax_results[material_id] = results |
| 139 | + try: |
| 140 | + optimizer = BayesianOptimizer( |
| 141 | + model=model, structure=structure, **bayes_optim_kwargs |
| 142 | + ) |
| 143 | + optimizer.set_bounds() |
| 144 | + # reason for devnull: https://github.com/materialsvirtuallab/maml/issues/469 |
| 145 | + with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull): |
| 146 | + optimizer.optimize(**optimize_kwargs) |
| 147 | + |
| 148 | + structure_bowsr, energy_bowsr = optimizer.get_optimized_structure_and_energy() |
| 149 | + |
| 150 | + results = { |
| 151 | + f"e_form_per_atom_bowsr_{energy_model}": model.predict_energy(structure), |
| 152 | + "structure_bowsr": structure_bowsr, |
| 153 | + "energy_bowsr": energy_bowsr, |
| 154 | + } |
| 155 | + |
| 156 | + relax_results[material_id] = results |
| 157 | + except Exception as exc: |
| 158 | + print(f"{material_id=} raised {exc=}") |
155 | 159 |
|
156 | 160 |
|
157 | 161 | # %%
|
|
0 commit comments