Skip to content

Commit 02bdcdd

Browse files
[feat] Mark stalled runs as finished (#3314)
1 parent 943942c commit 02bdcdd

12 files changed

+289
-99
lines changed

aim/cli/up/commands.py

+3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
)
1414
from aim.sdk.index_manager import RepoIndexManager
1515
from aim.sdk.repo import Repo
16+
from aim.sdk.run_status_manager import RunStatusManager
1617
from aim.sdk.utils import clean_repo_path
1718
from aim.web.configs import (
1819
AIM_ENV_MODE_KEY,
@@ -124,6 +125,8 @@ def up(
124125
os.environ[AIM_PROFILER_KEY] = '1'
125126

126127
RepoIndexManager.get_index_manager(repo_inst)
128+
run_status_mng = RunStatusManager(repo_inst)
129+
run_status_mng.start()
127130
try:
128131
server_cmd = build_uvicorn_command(
129132
'aim.web.run:app',

aim/sdk/repo.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,9 @@ def get_version(cls, path: str):
269269
def is_remote_path(cls, path: str):
270270
return path.startswith('aim://')
271271

272-
def _get_container(self, name: str, read_only: bool, from_union: bool = False, skip_read_optimization: bool = False) -> Container:
272+
def _get_container(
273+
self, name: str, read_only: bool, from_union: bool = False, skip_read_optimization: bool = False
274+
) -> Container:
273275
# TODO [AT]: refactor get container/tree logic to make it more simple
274276
if self.read_only and not read_only:
275277
raise ValueError('Repo is read-only')
@@ -317,11 +319,17 @@ def request_tree(
317319
read_only: bool,
318320
from_union: bool = False, # TODO maybe = True by default
319321
no_cache: bool = False,
320-
skip_read_optimization: bool = False
322+
skip_read_optimization: bool = False,
321323
):
322324
if not self.is_remote_repo:
323-
return self.request(name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache,
324-
skip_read_optimization=skip_read_optimization).tree()
325+
return self.request(
326+
name,
327+
sub,
328+
read_only=read_only,
329+
from_union=from_union,
330+
no_cache=no_cache,
331+
skip_read_optimization=skip_read_optimization,
332+
).tree()
325333
else:
326334
return ProxyTree(self._client, name, sub, read_only=read_only, from_union=from_union, no_cache=no_cache)
327335

@@ -333,7 +341,7 @@ def request(
333341
read_only: bool,
334342
from_union: bool = False, # TODO maybe = True by default
335343
no_cache: bool = False,
336-
skip_read_optimization: bool = False
344+
skip_read_optimization: bool = False,
337345
):
338346
container_config = ContainerConfig(name, sub, read_only)
339347
container_view = self.container_view_pool.get(container_config)
@@ -344,8 +352,9 @@ def request(
344352
else:
345353
assert sub is not None
346354
path = os.path.join(name, 'chunks', sub)
347-
container = self._get_container(path, read_only=True, from_union=from_union,
348-
skip_read_optimization=skip_read_optimization)
355+
container = self._get_container(
356+
path, read_only=True, from_union=from_union, skip_read_optimization=skip_read_optimization
357+
)
349358
else:
350359
assert sub is not None
351360
path = os.path.join(name, 'chunks', sub)

aim/sdk/reporter/file_manager.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010

1111
class FileManager(object):
1212
@abstractmethod
13-
def poll(self, pattern: str) -> Optional[str]: ...
13+
def poll(self, pattern: str) -> Optional[str]:
14+
...
1415

1516
@abstractmethod
16-
def touch(self, filename: str, cleanup_file_pattern: Optional[str] = None): ...
17+
def touch(self, filename: str, cleanup_file_pattern: Optional[str] = None):
18+
...
1719

1820

1921
class LocalFileManager(FileManager):

aim/sdk/run_status_manager.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import time
2+
import os
3+
import datetime
4+
import pytz
5+
import threading
6+
from pathlib import Path
7+
8+
from typing import Iterable
9+
10+
import aimrocks.errors
11+
12+
from aim import Repo
13+
from aim.sdk.run_status_watcher import Event
14+
15+
16+
class RunStatusManager:
    """Background monitor that marks stalled runs as finished.

    A run is considered "in progress" while a file named after its hash
    exists in ``<repo>/meta/progress``. A daemon thread periodically scans
    that directory, compares each run's latest heartbeat file in
    ``<repo>/check_ins`` against the heartbeat seen on the previous scan,
    and finalizes the runs that stopped reporting: their ``end_time`` is set
    (if missing) and their progress file is removed.
    """

    # Extra seconds tolerated on top of a heartbeat's announced interval
    # before the run is declared stalled.
    INDEXING_GRACE_PERIOD = 10

    def __init__(self, repo: 'Repo', scan_interval: int = 60):
        """Prepare the monitor; no thread is started until :meth:`start`.

        Args:
            repo: Repository whose runs are monitored. Only ``repo.path`` and
                ``repo.request_tree`` are used by this class.
            scan_interval: Seconds to wait between two consecutive scans.
        """
        self.repo = repo
        self.scan_interval = scan_interval

        self.progress_dir = Path(self.repo.path) / 'meta' / 'progress'
        self.progress_dir.mkdir(parents=True, exist_ok=True)

        self.heartbeat_dir = Path(self.repo.path) / 'check_ins'
        # run hash -> last heartbeat Event observed for that run.
        self.run_heartbeat_cache = {}

        self._stop_event = threading.Event()
        self._monitor_thread = None
        # Runs whose storage could not be opened; skipped on later scans.
        self._corrupted_runs = set()

    def start(self):
        """Start the monitoring thread unless one is already alive."""
        if not self._monitor_thread or not self._monitor_thread.is_alive():
            self._stop_event.clear()
            self._monitor_thread = threading.Thread(target=self._run_forever, daemon=True)
            self._monitor_thread.start()

    def stop(self):
        """Signal the monitoring thread to exit and wait for it to finish."""
        self._stop_event.set()
        if self._monitor_thread:
            self._monitor_thread.join()

    def _run_forever(self):
        """Scan for stalled runs every ``scan_interval`` seconds until stopped."""
        while not self._stop_event.is_set():
            self.check_and_terminate_stalled_runs()
            # Wait on the event instead of sleeping so that stop() wakes the
            # thread immediately rather than blocking for a whole interval.
            self._stop_event.wait(self.scan_interval)

    def _runs_with_progress(self) -> Iterable[str]:
        """Return hashes of in-progress runs, least recently modified first.

        Runs previously flagged as corrupted are excluded.
        """
        mtimes = {}
        for run_hash in os.listdir(self.progress_dir):
            if run_hash in self._corrupted_runs:
                continue
            try:
                mtimes[run_hash] = os.path.getmtime(os.path.join(self.progress_dir, run_hash))
            except FileNotFoundError:
                # The progress file was removed between listing and stat
                # (e.g. the run finished on its own); nothing left to check.
                continue
        return sorted(mtimes, key=mtimes.get)

    def check_and_terminate_stalled_runs(self):
        """Run a single scan and finalize every run detected as stalled."""
        for run_hash in self._runs_with_progress():
            if self._is_run_stalled(run_hash):
                self._mark_run_as_terminated(run_hash)

    def _is_run_stalled(self, run_hash: str) -> bool:
        """Decide whether ``run_hash`` has stopped sending heartbeats.

        Returns:
            ``True`` when the run has no heartbeat file at all, or when its
            latest heartbeat has not advanced since the previous scan and
            more than ``next_event_in + INDEXING_GRACE_PERIOD`` seconds have
            passed since it was detected. ``False`` otherwise.
        """
        # File names compare lexicographically, same as the sort they replace.
        latest_file = max(self.heartbeat_dir.glob(f'{run_hash}-*-progress-*-*'), default=None)
        if latest_file is None:
            return True

        # NOTE(review): Event presumably parses idx / next_event_in from the
        # check-in file name -- confirm in aim.sdk.run_status_watcher.
        last_heartbeat = Event(latest_file.name)
        last_recorded_heartbeat = self.run_heartbeat_cache.get(run_hash)

        if last_recorded_heartbeat is None or last_heartbeat.idx > last_recorded_heartbeat.idx:
            # First heartbeat seen for this run, or a newer one arrived:
            # remember it and treat the run as alive.
            self.run_heartbeat_cache[run_hash] = last_heartbeat
            return False

        # No new heartbeat since the last scan; stalled only if the expected
        # interval plus the grace period has elapsed.
        time_passed = time.time() - last_recorded_heartbeat.detected_epoch_time
        return (last_recorded_heartbeat.next_event_in + RunStatusManager.INDEXING_GRACE_PERIOD) < time_passed

    def _mark_run_as_terminated(self, run_hash: str):
        """Set the run's ``end_time`` if it is missing and drop its progress file.

        Runs whose storage raises a RocksDB I/O or corruption error are
        recorded as corrupted and ignored by subsequent scans.
        """
        # TODO [AT]: Add run state handling once decided on terms (finished, terminated, aborted, etc.)
        try:
            meta_run_tree = self.repo.request_tree('meta', run_hash, read_only=False).subtree(
                ('meta', 'chunks', run_hash)
            )
            if meta_run_tree.get('end_time') is None:
                meta_run_tree['end_time'] = datetime.datetime.now(pytz.utc).timestamp()
            progress_path = self.progress_dir / run_hash
            progress_path.unlink(missing_ok=True)
        except (aimrocks.errors.RocksIOError, aimrocks.errors.Corruption):
            self._corrupted_runs.add(run_hash)
        # The run will not be scanned again in either case; forget its cached
        # heartbeat so the cache does not grow and a later resume starts fresh.
        self.run_heartbeat_cache.pop(run_hash, None)

aim/sdk/run_status_watcher.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,16 @@ def __init__(self, *, obj_idx: Optional[str] = None, rank: Optional[int] = None,
8383
self.message = message
8484

8585
@abstractmethod
86-
def is_sent(self): ...
86+
def is_sent(self):
87+
...
8788

8889
@abstractmethod
89-
def update_last_sent(self): ...
90+
def update_last_sent(self):
91+
...
9092

9193
@abstractmethod
92-
def get_msg_details(self): ...
94+
def get_msg_details(self):
95+
...
9396

9497

9598
class StatusNotification(Notification):

aim/storage/arrayview.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ class ArrayView:
99
when index values are not important.
1010
"""
1111

12-
def __iter__(self) -> Iterator[Any]: ...
12+
def __iter__(self) -> Iterator[Any]:
13+
...
1314

1415
def keys(self) -> Iterator[int]:
1516
"""Return sparse indices iterator.
@@ -43,13 +44,16 @@ def items(self) -> Iterator[Tuple[int, Any]]:
4344
"""
4445
...
4546

46-
def __len__(self) -> int: ...
47+
def __len__(self) -> int:
48+
...
4749

48-
def __getitem__(self, idx: Union[int, slice]): ...
50+
def __getitem__(self, idx: Union[int, slice]):
51+
...
4952

5053
# TODO implement append
5154

52-
def __setitem__(self, idx: int, val: Any): ...
55+
def __setitem__(self, idx: int, val: Any):
56+
...
5357

5458
def sparse_list(self) -> Tuple[List[int], List[Any]]:
5559
"""Get sparse indices and values as :obj:`list`s."""

aim/storage/artifacts/artifact_storage.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,13 @@ def __init__(self, url: str):
77
self.url = url
88

99
@abstractmethod
10-
def upload_artifact(self, file_path: str, artifact_path: str, block: bool = False): ...
10+
def upload_artifact(self, file_path: str, artifact_path: str, block: bool = False):
11+
...
1112

1213
@abstractmethod
13-
def download_artifact(self, artifact_path: str, dest_dir: Optional[str] = None) -> str: ...
14+
def download_artifact(self, artifact_path: str, dest_dir: Optional[str] = None) -> str:
15+
...
1416

1517
@abstractmethod
16-
def delete_artifact(self, artifact_path: str): ...
18+
def delete_artifact(self, artifact_path: str):
19+
...

aim/storage/inmemorytreeview.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ def iterlevel(
117117
def array(self, path: Union[AimObjectKey, AimObjectPath] = (), dtype: Any = None) -> TreeArrayView:
118118
return TreeArrayView(self.subtree(path), dtype=dtype)
119119

120-
def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ...
120+
def first_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey:
121+
...
121122

122-
def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey: ...
123+
def last_key(self, path: Union[AimObjectKey, AimObjectPath] = ()) -> AimObjectKey:
124+
...

aim/storage/query.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ def __init__(self, expr: str):
8080
self.expr = expr
8181

8282
@abstractmethod
83-
def check(self, **params) -> bool: ...
83+
def check(self, **params) -> bool:
84+
...
8485

8586
def __call__(self, **params):
8687
return self.check(**params)

aim/storage/rockscontainer.pyx

+5-7
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class RocksAutoClean(AutoClean):
3535
super().__init__(instance)
3636
self._lock = None
3737
self._db = None
38+
self._progress_path = None
3839

3940
def _close(self):
4041
"""
@@ -48,6 +49,9 @@ class RocksAutoClean(AutoClean):
4849
self._db = None
4950
self._lock.release()
5051
self._lock = None
52+
if self._progress_path is not None:
53+
self._progress_path.unlink(missing_ok=True)
54+
self._progress_path = None
5155
if self._db is not None:
5256
self._db = None
5357

@@ -104,6 +108,7 @@ class RocksContainer(Container):
104108
if not self.read_only:
105109
progress_dir.mkdir(parents=True, exist_ok=True)
106110
self._progress_path.touch(exist_ok=True)
111+
self._resources._progress_path = self._progress_path
107112

108113
self.db
109114
# TODO check if Containers are reopenable
@@ -159,16 +164,9 @@ class RocksContainer(Container):
159164
Store the collection of `(key, value)` records in the :obj:`Container`
160165
`index` for fast reads.
161166
"""
162-
if not self._progress_path:
163-
return
164-
165167
for k, v in self.items():
166168
index[k] = v
167169

168-
if self._progress_path.exists():
169-
self._progress_path.unlink()
170-
self._progress_path = None
171-
172170
def close(self):
173171
"""Close all the resources."""
174172
if self._resources is None:

0 commit comments

Comments
 (0)