Skip to content

Commit 39abb14

Browse files
Cancel stuck requests on dask scheduler (#136)
* fix: update qos when a request is running on a killed worker * cancel the job on the dask scheduler when the broker does not have the future * qa
1 parent 2906793 commit 39abb14

File tree

4 files changed

+111
-10
lines changed

4 files changed

+111
-10
lines changed

cads_broker/config.py

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class BrokerConfig(pydantic_settings.BaseSettings):
3939
broker_requeue_limit: int = 3
4040
broker_max_internal_scheduler_tasks: int = 500
4141
broker_max_accepted_requests: int = 2000
42+
broker_cancel_stuck_requests_cache_ttl: int = 60
43+
broker_stuck_requests_limit_hours: int = 1
4244

4345

4446
class SqlalchemySettings(pydantic_settings.BaseSettings):

cads_broker/database.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424

2525
logger: structlog.stdlib.BoundLogger = structlog.get_logger(__name__)
2626

27-
2827
status_enum = sa.Enum(
2928
"accepted", "running", "failed", "successful", "dismissed", "deleted", name="status"
3029
)
@@ -476,6 +475,21 @@ def get_users_queue_from_processing_time(
476475
return queueing_user_costs | running_user_costs
477476

478477

478+
def get_stuck_requests(session: sa.orm.Session, hours: int = 1) -> list[str]:
    """Return the UIDs of running requests that are not assigned to any worker.

    A request is considered "stuck" when it has been in the ``running`` status
    for more than *hours* hours and has no associated event row at all
    (the anti-join below keeps only requests with zero events; presumably
    workers emit events when they own a request — confirm against the worker
    code).

    Parameters
    ----------
    session: SQLAlchemy session used for the read-only query.
    hours: minimum age, in hours, of a running request before it is
        considered stuck.

    Returns
    -------
    List of stuck request UIDs.
    """
    import datetime  # local import: keeps this fix self-contained

    # Bind a timedelta instead of interpolating ``hours`` into raw SQL
    # (the previous ``sa.text(f"interval '{hours} hour'")`` built SQL from a
    # Python value and was PostgreSQL-specific).
    cutoff = sa.func.now() - datetime.timedelta(hours=hours)
    query = (
        sa.select(SystemRequest.request_uid)
        .outerjoin(Events, SystemRequest.request_uid == Events.request_uid)
        .where(
            SystemRequest.status == "running",
            SystemRequest.started_at < cutoff,
        )
        # anti-join: keep only requests that matched no Events row
        .where(Events.event_id.is_(None))
    )
    # list() so the declared return type holds: ``.all()`` returns a Sequence.
    return list(session.execute(query).scalars())
491+
492+
479493
def delete_request_qos_status(
480494
request_uid: str,
481495
rules: list,

cads_broker/dispatcher.py

+49-9
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,37 @@ def get_tasks_on_scheduler(dask_scheduler: distributed.Scheduler) -> dict[str, A
9292
return client.run_on_scheduler(get_tasks_on_scheduler)
9393

9494

95+
def cancel_jobs_on_scheduler(client: distributed.Client, job_ids: list[str]) -> None:
    """Cancel the given jobs directly on the dask scheduler.

    The nested callback runs on the scheduler pod and only cancels the tasks
    there (it does not touch broker-side state).
    See https://stackoverflow.com/questions/49203128/how-do-i-stop-a-running-task-in-dask.
    """

    def cancel_jobs(dask_scheduler: distributed.Scheduler, job_ids: list[str]) -> None:
        known_tasks = dask_scheduler.tasks
        for job_id in job_ids:
            # Skip ids the scheduler has never heard of.
            if job_id not in known_tasks:
                continue
            dask_scheduler.transitions({job_id: "cancelled"}, stimulus_id="manual-cancel")

    return client.run_on_scheduler(cancel_jobs, job_ids=job_ids)
110+
111+
112+
@cachetools.cached(  # type: ignore
    cache=cachetools.TTLCache(
        maxsize=1024, ttl=CONFIG.broker_cancel_stuck_requests_cache_ttl
    ),
    info=True,
)
def cancel_stuck_requests(client: distributed.Client, session: sa.orm.Session) -> None:
    """Read stuck requests from the database and cancel them on the dask scheduler.

    The TTL cache on this function rate-limits the check (config
    ``broker_cancel_stuck_requests_cache_ttl``) so the database query does not
    run on every invocation.
    """
    limit_hours = CONFIG.broker_stuck_requests_limit_hours
    job_ids = db.get_stuck_requests(session=session, hours=limit_hours)
    cancel_jobs_on_scheduler(client, job_ids=job_ids)
124+
125+
95126
class Scheduler:
96127
"""A simple scheduler to store the tasks to update the qos_rules in the database.
97128
@@ -316,8 +347,12 @@ def set_request_error_status(
316347
< CONFIG.broker_requeue_limit
317348
):
318349
logger.info("worker killed: re-queueing", job_id=request_uid)
319-
db.requeue_request(request=request, session=session)
320-
self.queue.add(request_uid, request)
350+
queued_request = db.requeue_request(request=request, session=session)
351+
if queued_request:
352+
self.queue.add(request_uid, request)
353+
self.qos.notify_end_of_request(
354+
request, session, scheduler=self.internal_scheduler
355+
)
321356
else:
322357
request = db.set_request_status(
323358
request_uid,
@@ -375,9 +410,13 @@ def sync_database(self, session: sa.orm.Session) -> None:
375410
dismissed_requests = db.get_dismissed_requests(
376411
session, limit=CONFIG.broker_max_accepted_requests
377412
)
378-
for i, request in enumerate(dismissed_requests):
413+
for request in dismissed_requests:
379414
if future := self.futures.pop(request.request_uid, None):
380415
future.cancel()
416+
else:
417+
# if the request is not in the futures, it means that the request has been lost by the broker
418+
# try to cancel the job directly on the scheduler
419+
cancel_jobs_on_scheduler(self.client, job_ids=[request.request_uid])
381420
session = self.manage_dismissed_request(request, session)
382421
session.commit()
383422

@@ -606,12 +645,12 @@ def processing_time_priority_algorithm(
606645
interval_stop = datetime.datetime.now()
607646
# temporary solution to prioritize high priority user
608647
users_queue = {
609-
"27888ffa-0973-4794-9b3c-9efb6767f66f": 0, # wekeo
610-
"d67a13db-86cc-439d-823d-6517003de29f": 0, # CDS Apps user
611-
"365ac1da-090e-4b85-9088-30c676bc5251": 0, # Gionata
612-
"74c6f9a1-8efe-4a6c-b06b-9f8ddcab188d": 0, # User Support
613-
"4d92cc89-d586-4731-8553-07df5dae1886": 0, # Luke Jones
614-
"8d8ee054-6a09-4da8-a5be-d5dff52bbc5f": 0, # Petrut
648+
"27888ffa-0973-4794-9b3c-9efb6767f66f": 0, # wekeo
649+
"d67a13db-86cc-439d-823d-6517003de29f": 0, # CDS Apps user
650+
"365ac1da-090e-4b85-9088-30c676bc5251": 0, # Gionata
651+
"74c6f9a1-8efe-4a6c-b06b-9f8ddcab188d": 0, # User Support
652+
"4d92cc89-d586-4731-8553-07df5dae1886": 0, # Luke Jones
653+
"8d8ee054-6a09-4da8-a5be-d5dff52bbc5f": 0, # Petrut
615654
} | db.get_users_queue_from_processing_time(
616655
interval_stop=interval_stop,
617656
session=session_write,
@@ -736,6 +775,7 @@ def run(self) -> None:
736775
self.queue.values(), session_write
737776
)
738777

778+
cancel_stuck_requests(client=self.client, session=session_read)
739779
running_requests = len(db.get_running_requests(session=session_read))
740780
queue_length = self.queue.len()
741781
available_workers = self.number_of_workers - running_requests

tests/test_02_database.py

+45
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,51 @@ def test_count_requests(session_obj: sa.orm.sessionmaker) -> None:
189189
assert 3 == db.count_requests(session=session, status=["accepted", "running"])
190190

191191

192+
def test_get_stuck_running_requests(session_obj: sa.orm.sessionmaker) -> None:
    """get_stuck_requests returns only running requests older than the limit with no events."""
    adaptor_properties = mock_config()
    # Not running: must never be reported as stuck.
    request1 = mock_system_request(
        status="accepted",
        adaptor_properties_hash=adaptor_properties.hash,
    )
    # Running for 2 hours, no event row: stuck.
    request2 = mock_system_request(
        status="running",
        started_at=datetime.datetime.now() - datetime.timedelta(hours=2),
        adaptor_properties_hash=adaptor_properties.hash,
    )
    request_uid2 = request2.request_uid
    # Running just past the default 1-hour threshold, no event row: stuck.
    request3 = mock_system_request(
        status="running",
        started_at=datetime.datetime.now() - datetime.timedelta(hours=1, minutes=10),
        adaptor_properties_hash=adaptor_properties.hash,
    )
    request_uid3 = request3.request_uid
    # Running a long time but with a worker event attached: not stuck.
    request4 = mock_system_request(
        status="running",
        started_at=datetime.datetime.now() - datetime.timedelta(hours=5, minutes=10),
        adaptor_properties_hash=adaptor_properties.hash,
    )
    event4 = mock_event(
        request_uid=request4.request_uid,
        event_type="worker-name",
        message="worker-0",
        timestamp=datetime.datetime.now() - datetime.timedelta(minutes=5),
    )

    with session_obj() as session:
        session.add_all(
            [adaptor_properties, request1, request2, request3, request4, event4]
        )
        session.commit()
        requests = db.get_stuck_requests(session=session)

    assert len(requests) == 2
    assert request_uid2 in requests
    assert request_uid3 in requests
235+
236+
192237
def test_add_qos_rule(session_obj: sa.orm.sessionmaker) -> None:
193238
rule = MockRule("rule_name", "conclusion", "info", "condition")
194239
with session_obj() as session:

0 commit comments

Comments
 (0)