20
20
import posixpath
21
21
from typing import Any , Callable , Literal , Optional
22
22
23
+ import fsspec
23
24
import pydantic
24
25
import sqlalchemy as sa
25
26
import sqlalchemy .orm
35
36
)
36
37
37
38
38
- def _get_files_from_cache_entry (cache_entry : database .CacheEntry ) -> dict [str , str ]:
39
+ def _get_files_from_cache_entry (
40
+ cache_entry : database .CacheEntry , key : str | None
41
+ ) -> dict [str , Any ]:
39
42
result = cache_entry .result
40
43
if not isinstance (result , (list , tuple , set )):
41
44
result = [result ]
@@ -48,27 +51,57 @@ def _get_files_from_cache_entry(cache_entry: database.CacheEntry) -> dict[str, s
48
51
and obj ["callable" ] in FILE_RESULT_CALLABLES
49
52
):
50
53
fs , urlpath = extra_encoders ._get_fs_and_urlpath (* obj ["args" ][:2 ])
51
- files [fs .unstrip_protocol (urlpath )] = obj ["args" ][0 ]["type" ]
54
+ value = obj ["args" ][0 ]
55
+ if key is not None :
56
+ value = value [key ]
57
+ files [fs .unstrip_protocol (urlpath )] = value
52
58
return files
53
59
54
60
55
def _remove_files(
    fs: fsspec.AbstractFileSystem,
    files: list[str],
    max_tries: int = 10,
    **kwargs: Any,
) -> None:
    """Remove ``files`` using ``fs``, retrying if files vanish concurrently.

    Parameters
    ----------
    fs:
        fsspec filesystem used to delete the files.
    files:
        Urlpaths of the files to remove.
    max_tries:
        Maximum number of ``fs.rm`` attempts before re-raising.
    **kwargs:
        Forwarded to ``fs.rm`` (e.g. ``recursive``) and included in the log call.

    Raises
    ------
    ValueError:
        If ``max_tries`` is smaller than 1.
    FileNotFoundError:
        If files keep disappearing for ``max_tries`` consecutive attempts.
    """
    # Use an explicit check rather than `assert`: asserts are stripped
    # under `python -O`, silently disabling the validation.
    if max_tries < 1:
        raise ValueError(f"max_tries must be >= 1, got {max_tries!r}")
    if not files:
        return

    config.get().logger.info("deleting files", n_files_to_delete=len(files), **kwargs)

    for n_tries in range(1, max_tries + 1):
        try:
            fs.rm(files, **kwargs)
            return
        except FileNotFoundError:
            # Another concurrent process might have deleted some of the files.
            if n_tries >= max_tries:
                raise
            # Retry with only the files that still exist.
            files = [file for file in files if fs.exists(file)]
            if not files:
                return
61
84
62
- # First, delete database entry
63
- logger .info ("deleting cache entry" , cache_entry = cache_entry )
64
- session .delete (cache_entry )
85
+
86
def _delete_cache_entries(
    session: sa.orm.Session, *cache_entries: database.CacheEntry
) -> None:
    """Drop cache entries from the database, then remove their files on disk.

    Zarr stores (``application/vnd+zarr``) are directories and are removed
    recursively; everything else is removed as a plain file.
    """
    fs, _ = utils.get_cache_files_fs_dirname()

    plain_files: list[str] = []
    zarr_dirs: list[str] = []
    for entry in cache_entries:
        session.delete(entry)

        file_types = _get_files_from_cache_entry(entry, key="type")
        for urlpath, file_type in file_types.items():
            bucket = zarr_dirs if file_type == "application/vnd+zarr" else plain_files
            bucket.append(urlpath)

    # Commit the database deletions first: a failure here leaves at worst
    # orphaned files on disk, never dangling database entries.
    database._commit_or_rollback(session)

    _remove_files(fs, plain_files, recursive=False)
    _remove_files(fs, zarr_dirs, recursive=True)
72
105
73
106
74
107
def delete (func_to_del : str | Callable [..., Any ], * args : Any , ** kwargs : Any ) -> None :
@@ -88,19 +121,20 @@ def delete(func_to_del: str | Callable[..., Any], *args: Any, **kwargs: Any) ->
88
121
for cache_entry in session .scalars (
89
122
sa .select (database .CacheEntry ).filter (database .CacheEntry .key == hexdigest )
90
123
):
91
- _delete_cache_entry (session , cache_entry )
124
+ _delete_cache_entries (session , cache_entry )
92
125
93
126
94
127
class _Cleaner :
95
- def __init__ (self , depth : int ) -> None :
128
+ def __init__ (self , depth : int , use_database : bool ) -> None :
96
129
self .logger = config .get ().logger
97
130
self .fs , self .dirname = utils .get_cache_files_fs_dirname ()
98
131
99
132
self .urldir = self .fs .unstrip_protocol (self .dirname )
100
133
101
134
self .logger .info ("getting disk usage" )
102
135
self .file_sizes : dict [str , int ] = collections .defaultdict (int )
103
- for path , size in self .fs .du (self .dirname , total = False ).items ():
136
+ du = self .known_files if use_database else self .fs .du (self .dirname , total = False )
137
+ for path , size in du .items ():
104
138
# Group dirs
105
139
urlpath = self .fs .unstrip_protocol (path )
106
140
parts = urlpath .replace (self .urldir , "" , 1 ).strip ("/" ).split ("/" )
@@ -120,6 +154,16 @@ def log_disk_usage(self) -> None:
120
154
def stop_cleaning (self , maxsize : int ) -> bool :
121
155
return self .disk_usage <= maxsize
122
156
157
+ @property
158
+ def known_files (self ) -> dict [str , int ]:
159
+ known_files : dict [str , int ] = {}
160
+ with config .get ().instantiated_sessionmaker () as session :
161
+ for cache_entry in session .scalars (sa .select (database .CacheEntry )):
162
+ known_files .update (
163
+ _get_files_from_cache_entry (cache_entry , key = "file:size" )
164
+ )
165
+ return known_files
166
+
123
167
def get_unknown_files (self , lock_validity_period : float | None ) -> set [str ]:
124
168
self .logger .info ("getting unknown files" )
125
169
@@ -137,25 +181,15 @@ def get_unknown_files(self, lock_validity_period: float | None) -> set[str]:
137
181
locked_files .add (urlpath )
138
182
locked_files .add (urlpath .rsplit (".lock" , 1 )[0 ])
139
183
140
- if unknown_files := (set (self .file_sizes ) - locked_files ):
141
- with config .get ().instantiated_sessionmaker () as session :
142
- for cache_entry in session .scalars (sa .select (database .CacheEntry )):
143
- for known_file in _get_files_from_cache_entry (cache_entry ):
144
- unknown_files .discard (known_file )
145
- if not unknown_files :
146
- break
147
- return unknown_files
184
+ return set (self .file_sizes ) - locked_files - set (self .known_files )
148
185
149
186
def delete_unknown_files (
150
187
self , lock_validity_period : float | None , recursive : bool
151
188
) -> None :
152
189
unknown_files = self .get_unknown_files (lock_validity_period )
153
190
for urlpath in unknown_files :
154
191
self .pop_file_size (urlpath )
155
- self .remove_files (
156
- list (unknown_files ),
157
- recursive = recursive ,
158
- )
192
+ _remove_files (self .fs , list (unknown_files ), recursive = recursive )
159
193
self .log_disk_usage ()
160
194
161
195
@staticmethod
@@ -207,30 +241,6 @@ def _get_method_sorters(
207
241
sorters .append (database .CacheEntry .expiration )
208
242
return sorters
209
243
210
- def remove_files (
211
- self ,
212
- files : list [str ],
213
- max_tries : int = 10 ,
214
- ** kwargs : Any ,
215
- ) -> None :
216
- assert max_tries >= 1
217
- if not files :
218
- return
219
-
220
- self .logger .info ("deleting files" , n_files_to_delete = len (files ), ** kwargs )
221
-
222
- n_tries = 0
223
- while files :
224
- n_tries += 1
225
- try :
226
- self .fs .rm (files , ** kwargs )
227
- return
228
- except FileNotFoundError :
229
- # Another concurrent process might have deleted files
230
- if n_tries >= max_tries :
231
- raise
232
- files = [file for file in files if self .fs .exists (file )]
233
-
234
244
def delete_cache_files (
235
245
self ,
236
246
maxsize : int ,
@@ -244,37 +254,27 @@ def delete_cache_files(
244
254
if self .stop_cleaning (maxsize ):
245
255
return
246
256
247
- files_to_delete = []
248
- dirs_to_delete = []
257
+ entries_to_delete = []
249
258
self .logger .info ("getting cache entries to delete" )
250
- n_entries_to_delete = 0
251
259
with config .get ().instantiated_sessionmaker () as session :
252
260
for cache_entry in session .scalars (
253
261
sa .select (database .CacheEntry ).filter (* filters ).order_by (* sorters )
254
262
):
255
- files = _get_files_from_cache_entry (cache_entry )
263
+ files = _get_files_from_cache_entry (cache_entry , key = "file:size" )
256
264
if any (file .startswith (self .urldir ) for file in files ):
257
- n_entries_to_delete += 1
258
- session .delete (cache_entry )
259
-
260
- for file , file_type in files .items ():
265
+ entries_to_delete .append (cache_entry )
266
+ for file in files :
261
267
self .pop_file_size (file )
262
- if file_type == "application/vnd+zarr" :
263
- dirs_to_delete .append (file )
264
- else :
265
- files_to_delete .append (file )
266
268
267
269
if self .stop_cleaning (maxsize ):
268
270
break
269
271
270
- if n_entries_to_delete :
272
+ if entries_to_delete :
271
273
self .logger .info (
272
- "deleting cache entries" , n_entries_to_delete = n_entries_to_delete
274
+ "deleting cache entries" , n_entries_to_delete = len ( entries_to_delete )
273
275
)
274
- database . _commit_or_rollback (session )
276
+ _delete_cache_entries (session , * entries_to_delete )
275
277
276
- self .remove_files (files_to_delete , recursive = False )
277
- self .remove_files (dirs_to_delete , recursive = True )
278
278
self .log_disk_usage ()
279
279
280
280
if not self .stop_cleaning (maxsize ):
@@ -296,6 +296,7 @@ def clean_cache_files(
296
296
tags_to_clean : list [str | None ] | None = None ,
297
297
tags_to_keep : list [str | None ] | None = None ,
298
298
depth : int = 1 ,
299
+ use_database : bool = False ,
299
300
) -> None :
300
301
"""Clean cache files.
301
302
@@ -318,8 +319,15 @@ def clean_cache_files(
318
319
tags_to_clean and tags_to_keep are mutually exclusive.
319
320
depth: int, default: 1
320
321
depth for grouping cache files
322
+ use_database: bool, default: False
323
+ Whether to infer disk usage from the cacholote database
321
324
"""
322
- cleaner = _Cleaner (depth = depth )
325
+ if use_database and delete_unknown_files :
326
+ raise ValueError (
327
+ "'use_database' and 'delete_unknown_files' are mutually exclusive"
328
+ )
329
+
330
+ cleaner = _Cleaner (depth = depth , use_database = use_database )
323
331
324
332
if delete_unknown_files :
325
333
cleaner .delete_unknown_files (lock_validity_period , recursive )
@@ -352,15 +360,15 @@ def clean_invalid_cache_entries(
352
360
for cache_entry in session .scalars (
353
361
sa .select (database .CacheEntry ).filter (* filters )
354
362
):
355
- _delete_cache_entry (session , cache_entry )
363
+ _delete_cache_entries (session , cache_entry )
356
364
357
365
if try_decode :
358
366
with config .get ().instantiated_sessionmaker () as session :
359
367
for cache_entry in session .scalars (sa .select (database .CacheEntry )):
360
368
try :
361
369
decode .loads (cache_entry ._result_as_string )
362
370
except decode .DecodeError :
363
- _delete_cache_entry (session , cache_entry )
371
+ _delete_cache_entries (session , cache_entry )
364
372
365
373
366
374
def expire_cache_entries (
@@ -379,15 +387,14 @@ def expire_cache_entries(
379
387
if after is not None :
380
388
filters .append (database .CacheEntry .created_at > after )
381
389
382
- count = 0
383
390
with config .get ().instantiated_sessionmaker () as session :
384
- for cache_entry in session . scalars (
385
- sa .select (database .CacheEntry ).filter (* filters )
386
- ):
387
- count += 1
388
- if delete :
389
- session . delete ( cache_entry )
390
- else :
391
+ cache_entries = list (
392
+ session . scalars ( sa .select (database .CacheEntry ).filter (* filters ) )
393
+ )
394
+ if delete :
395
+ _delete_cache_entries ( session , * cache_entries )
396
+ else :
397
+ for cache_entry in cache_entries :
391
398
cache_entry .expiration = now
392
- database ._commit_or_rollback (session )
393
- return count
399
+ database ._commit_or_rollback (session )
400
+ return len ( cache_entries )
0 commit comments