@@ -268,14 +268,30 @@ def sync_database(self, session: sa.orm.Session) -> None:
             # if it doesn't find the request: re-queue it
             else:
                 # FIXME: check if request status has changed
-                logger.info(
-                    "request not found: re-queueing", job_id={request.request_uid}
-                )
-                db.requeue_request(request_uid=request.request_uid, session=session)
-                self.queue.add(request.request_uid, request)
-                self.qos.notify_end_of_request(
-                    request, session, scheduler=self.internal_scheduler
-                )
+                if os.getenv(
+                    "BROKER_REQUEUE_ON_LOST_REQUESTS", False
+                ) and request.request_metadata.get("resubmit", 0) < int(
+                    os.getenv("BROKER_REQUEUE_LIMIT", 3)
+                ):
+                    logger.info(
+                        "request not found: re-queueing", job_id=request.request_uid
+                    )
+                    db.requeue_request(request_uid=request.request_uid, session=session)
+                    self.queue.add(request.request_uid, request)
+                    self.qos.notify_end_of_request(
+                        request, session, scheduler=self.internal_scheduler
+                    )
+                else:
+                    db.set_request_status(
+                        request_uid=request.request_uid,
+                        status="failed",
+                        error_message="Request not found in dask scheduler",
+                        error_reason="not_found",
+                        session=session,
+                    )
+                    self.qos.notify_end_of_request(
+                        request, session, scheduler=self.internal_scheduler
+                    )

     @perf_logger
     def sync_qos_rules(self, session_write) -> None:
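Note that the re-queue guard above reads raw environment strings: `os.getenv` returns the default unchanged when a variable is unset, but always a `str` when it is set, so a value like `"False"` is truthy and a configured limit arrives as text. A minimal sketch of how these lookups could be made explicit; `_env_flag`, `_env_int`, and `may_requeue` are hypothetical helpers, not part of this diff:

```python
import os


def _env_flag(name: str, default: bool = False) -> bool:
    # os.getenv returns str when the variable is set, so "False" or "0"
    # would be truthy; normalize to a real bool instead.
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")


def _env_int(name: str, default: int) -> int:
    # Cast before comparing against the int "resubmit" counter.
    value = os.getenv(name)
    return default if value is None else int(value)


def may_requeue(request_metadata: dict) -> bool:
    # Mirrors the condition in sync_database: re-queue lost requests only
    # while the per-request resubmit counter is below the configured limit.
    return _env_flag("BROKER_REQUEUE_ON_LOST_REQUESTS") and request_metadata.get(
        "resubmit", 0
    ) < _env_int("BROKER_REQUEUE_LIMIT", 3)
```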
@@ -298,10 +314,6 @@ def sync_qos_rules(self, session_write) -> None:
     def on_future_done(self, future: distributed.Future) -> None:
         job_status = DASK_STATUS_TO_STATUS.get(future.status, "accepted")
         logger_kwargs: dict[str, Any] = {}
-        log = list(self.client.get_events(f"{future.key}/log"))
-        user_visible_log = list(
-            self.client.get_events(f"{future.key}/user_visible_log")
-        )
         with self.session_maker_write() as session:
             if future.status == "finished":
                 result = future.result()
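`DASK_STATUS_TO_STATUS` maps `distributed.Future.status` values onto broker request states, with `"accepted"` as the fallback for anything unrecognized. Only the lookup is visible in this diff; a plausible sketch of the mapping, with assumed values:

```python
# Assumed mapping; dask futures report statuses such as
# "pending", "finished", and "error".
DASK_STATUS_TO_STATUS = {
    "pending": "running",
    "finished": "successful",
    "error": "failed",
}
```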
@@ -310,27 +322,53 @@ def on_future_done(self, future: distributed.Future) -> None:
                     job_status,
                     cache_id=result,
                     session=session,
-                    log=log,
-                    user_visible_log=user_visible_log,
                 )
             elif future.status == "error":
                 exception = future.exception()
                 error_message = "".join(traceback.format_exception(exception))
                 error_reason = exception.__class__.__name__
-                if error_reason == "distributed.scheduler.KilledWorker" and os.getenv(
-                    "BROKER_REQUEUE_ON_KILLED_WORKER", False
-                ):
-                    logger.info("worker killed: re-queueing", job_id=future.key)
-                    db.requeue_request(request_uid=future.key, session=session)
-                    self.queue.add(request.request_uid, request)
+                request = db.get_request(future.key, session=session)
+                requeue = os.getenv("BROKER_REQUEUE_ON_KILLED_WORKER_REQUESTS", False)
+                if error_reason == "KilledWorker":
+                    worker_restart_events = self.client.get_events(
+                        "worker-restart-memory"
+                    )
+                    # get info on the worker and pid of the killed request
+                    _, worker_pid_event = self.client.get_events(future.key)[0]
+                    if worker_restart_events:
+                        for event in worker_restart_events:
+                            _, job = event
+                            if (
+                                job["worker"] == worker_pid_event["worker"]
+                                and job["pid"] == worker_pid_event["pid"]
+                            ):
+                                db.add_event(
+                                    event_type="killed_worker",
+                                    request_uid=future.key,
+                                    message="Worker has been killed by the Nanny due to memory usage. "
+                                    f"{job['worker']=}, {job['pid']=}, {job['rss']=}",
+                                    session=session,
+                                )
+                                request = db.set_request_status(
+                                    future.key,
+                                    "failed",
+                                    error_message=error_message,
+                                    error_reason=error_reason,
+                                    session=session,
+                                )
+                                requeue = False
+                if requeue and request.request_metadata.get(
+                    "resubmit", 0
+                ) < int(os.getenv("BROKER_REQUEUE_LIMIT", 3)):
+                    logger.info("worker killed: re-queueing", job_id=future.key)
+                    db.requeue_request(request_uid=future.key, session=session)
+                    self.queue.add(future.key, request)
                 else:
                     request = db.set_request_status(
                         future.key,
                         job_status,
                         error_message=error_message,
                         error_reason=error_reason,
-                        log=log,
-                        user_visible_log=user_visible_log,
                         session=session,
                     )
         else:
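`worker-restart-memory` is a custom event topic, not something `distributed` publishes on its own: `Client.get_events(topic)` returns `(timestamp, payload)` pairs for whatever was previously pushed with `log_event`, which is why each event is unpacked as `_, job = event`. A sketch of the publishing side, assuming the `{"worker": ..., "pid": ..., "rss": ...}` payload matched above (the actual emitter is not part of this diff):

```python
import distributed


def report_memory_kill(
    client: distributed.Client, worker_address: str, pid: int, rss: int
) -> None:
    # Publish on the topic polled in on_future_done(). Matching on both
    # worker address and pid lets the broker tell the killed incarnation
    # apart from a restarted worker reusing the same address.
    client.log_event(
        "worker-restart-memory",
        {"worker": worker_address, "pid": pid, "rss": rss},
    )
```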
@@ -340,9 +378,8 @@ def on_future_done(self, future: distributed.Future) -> None:
                 job_status,
                 session=session,
                 resubmit=True,
-                log=log,
-                user_visible_log=user_visible_log,
             )
+            self.queue.add(future.key, request)
             logger.warning(
                 "unknown dask status, re-queueing",
                 job_status=future.status,
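The `resubmit=True` flag presumably increments the same `request_metadata["resubmit"]` counter that both re-queue guards above compare against `BROKER_REQUEUE_LIMIT`. A sketch of that bookkeeping, assuming metadata is a plain dict (the real logic lives in `db.set_request_status` / `db.requeue_request`):

```python
def bump_resubmit(request_metadata: dict) -> dict:
    # Each re-queue bumps the counter; once it reaches the limit
    # (default 3), the broker fails the request instead of re-queueing.
    metadata = dict(request_metadata)
    metadata["resubmit"] = metadata.get("resubmit", 0) + 1
    return metadata
```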
@@ -367,20 +404,17 @@ def submit_requests(
     ) -> None:
         queue = sorted(
             candidates,
-            key=lambda candidate: self.qos.priority(
-                candidate, session_write
-            ),
+            key=lambda candidate: self.qos.priority(candidate, session_write),
             reverse=True,
         )
         requests_counter = 0
         for request in queue:
             if self.qos.can_run(
                 request, session=session_write, scheduler=self.internal_scheduler
             ):
-                self.submit_request(request, session=session_write)
+                if requests_counter <= int(number_of_requests * WORKERS_MULTIPLIER):
+                    self.submit_request(request, session=session_write)
                 requests_counter += 1
-                if requests_counter == int(number_of_requests * WORKERS_MULTIPLIER):
-                    break

     def submit_request(
         self, request: db.SystemRequest, session: sa.orm.Session
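Moving the cap inside the loop keeps `requests_counter` counting every runnable request while only the first batch is actually submitted; unlike the old `break`, the scan no longer stops early. Note the boundary: with a zero-based counter, `<=` admits one more submission than the old break at `==` did. A worked example with assumed values:

```python
# Assumed values: number_of_requests could be the free worker slots,
# WORKERS_MULTIPLIER an over-submission factor defined elsewhere.
number_of_requests = 4
WORKERS_MULTIPLIER = 1.5

cap = int(number_of_requests * WORKERS_MULTIPLIER)  # int(6.0) == 6
submitted = sum(1 for counter in range(100) if counter <= cap)
print(cap, submitted)  # 6 7: "<=" submits cap + 1 requests per cycle
```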
@@ -421,9 +455,7 @@ def run(self) -> None:
         with self.session_maker_read() as session_read:
             if (rules_hash := get_rules_hash(self.qos.path)) != self.qos.rules_hash:
                 logger.info("reloading qos rules")
-                self.qos.reload_rules(
-                    session=session_read
-                )
+                self.qos.reload_rules(session=session_read)
                 self.qos.rules_hash = rules_hash
             self.qos.environment.set_session(session_read)
             # expire_on_commit=False is used to detach the accepted requests without an error
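`get_rules_hash` lets `run()` hot-reload the QoS rules only when the file actually changed; the walrus assignment keeps the freshly computed hash around so it can be stored once the reload succeeds. A minimal sketch of such a helper, assuming a content hash (the real implementation is defined elsewhere):

```python
import hashlib
import pathlib


def get_rules_hash(path: str) -> str:
    # Hash file contents rather than mtime so edits are detected reliably,
    # e.g. on shared filesystems with coarse timestamps.
    return hashlib.md5(pathlib.Path(path).read_bytes()).hexdigest()
```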