20
20
import posixpath
21
21
from typing import Any , Callable , Literal , Optional
22
22
23
+ import fsspec
23
24
import pydantic
24
25
import sqlalchemy as sa
25
26
import sqlalchemy .orm
35
36
)
36
37
37
38
38
- def _get_files_from_cache_entry (cache_entry : database .CacheEntry ) -> dict [str , str ]:
39
+ def _get_files_from_cache_entry (
40
+ cache_entry : database .CacheEntry , key : str = "file:size"
41
+ ) -> dict [str , Any ]:
39
42
result = cache_entry .result
40
43
if not isinstance (result , (list , tuple , set )):
41
44
result = [result ]
@@ -48,27 +51,54 @@ def _get_files_from_cache_entry(cache_entry: database.CacheEntry) -> dict[str, s
48
51
and obj ["callable" ] in FILE_RESULT_CALLABLES
49
52
):
50
53
fs , urlpath = extra_encoders ._get_fs_and_urlpath (* obj ["args" ][:2 ])
51
- files [fs .unstrip_protocol (urlpath )] = obj ["args" ][0 ]["type" ]
54
+ files [fs .unstrip_protocol (urlpath )] = obj ["args" ][0 ][key ]
52
55
return files
53
56
54
57
55
def _remove_files(
    fs: fsspec.AbstractFileSystem,
    files: list[str],
    max_tries: int = 10,
    **kwargs: Any,
) -> None:
    """Remove ``files`` from ``fs``, retrying when files vanish concurrently.

    Parameters
    ----------
    fs: fsspec.AbstractFileSystem
        Filesystem holding the files to delete.
    files: list[str]
        Paths to delete. No-op if empty.
    max_tries: int, default: 10
        Maximum number of ``fs.rm`` attempts before the error is propagated.
    **kwargs: Any
        Extra keyword arguments forwarded to ``fs.rm`` (e.g. ``recursive``);
        they are also included in the log record for traceability.

    Raises
    ------
    ValueError
        If ``max_tries`` is smaller than 1.
    FileNotFoundError
        If files keep disappearing for ``max_tries`` consecutive attempts.
    """
    # Validate with an explicit raise: ``assert`` is stripped under ``python -O``.
    if max_tries < 1:
        raise ValueError(f"max_tries must be >= 1, got {max_tries!r}")
    if not files:
        return

    config.get().logger.info("deleting files", n_files_to_delete=len(files), **kwargs)

    n_tries = 0
    while files:
        n_tries += 1
        try:
            fs.rm(files, **kwargs)
            return
        except FileNotFoundError:
            # Another concurrent process might have deleted files: drop the
            # ones that are already gone and retry with the remainder.
            if n_tries >= max_tries:
                raise
            files = [file for file in files if fs.exists(file)]
def _delete_cache_entries(
    session: sa.orm.Session, *cache_entries: database.CacheEntry
) -> None:
    """Delete cache entries from the database, then remove their files.

    Files tagged ``application/vnd+zarr`` are directory stores and are
    removed recursively; all other cached files are removed as plain files.
    """
    fs, _ = utils.get_cache_files_fs_dirname()

    # Partition each entry's cached paths by type while deleting the entries.
    plain_files: list[str] = []
    zarr_dirs: list[str] = []
    for entry in cache_entries:
        session.delete(entry)
        for urlpath, file_type in _get_files_from_cache_entry(entry, key="type").items():
            bucket = zarr_dirs if file_type == "application/vnd+zarr" else plain_files
            bucket.append(urlpath)

    # Commit the database changes before touching the filesystem.
    database._commit_or_rollback(session)

    _remove_files(fs, plain_files, recursive=False)
    _remove_files(fs, zarr_dirs, recursive=True)
72
102
73
103
74
104
def delete (func_to_del : str | Callable [..., Any ], * args : Any , ** kwargs : Any ) -> None :
@@ -88,19 +118,24 @@ def delete(func_to_del: str | Callable[..., Any], *args: Any, **kwargs: Any) ->
88
118
for cache_entry in session .scalars (
89
119
sa .select (database .CacheEntry ).filter (database .CacheEntry .key == hexdigest )
90
120
):
91
- _delete_cache_entry (session , cache_entry )
121
+ _delete_cache_entries (session , cache_entry )
92
122
93
123
94
124
class _Cleaner :
95
- def __init__ (self , depth : int ) -> None :
125
+ def __init__ (self , depth : int , use_database : bool ) -> None :
96
126
self .logger = config .get ().logger
97
127
self .fs , self .dirname = utils .get_cache_files_fs_dirname ()
98
128
99
129
self .urldir = self .fs .unstrip_protocol (self .dirname )
100
130
101
131
self .logger .info ("getting disk usage" )
102
132
self .file_sizes : dict [str , int ] = collections .defaultdict (int )
103
- for path , size in self .fs .du (self .dirname , total = False ).items ():
133
+ du = (
134
+ self .get_known_files ()
135
+ if use_database
136
+ else self .fs .du (self .dirname , total = False )
137
+ )
138
+ for path , size in du .items ():
104
139
# Group dirs
105
140
urlpath = self .fs .unstrip_protocol (path )
106
141
parts = urlpath .replace (self .urldir , "" , 1 ).strip ("/" ).split ("/" )
@@ -120,6 +155,15 @@ def log_disk_usage(self) -> None:
120
155
def stop_cleaning (self , maxsize : int ) -> bool :
121
156
return self .disk_usage <= maxsize
122
157
158
+ def get_known_files (self ) -> dict [str , int ]:
159
+ known_files : dict [str , int ] = {}
160
+ with config .get ().instantiated_sessionmaker () as session :
161
+ for cache_entry in session .scalars (sa .select (database .CacheEntry )):
162
+ known_files .update (
163
+ _get_files_from_cache_entry (cache_entry , key = "file:size" )
164
+ )
165
+ return known_files
166
+
123
167
def get_unknown_files (self , lock_validity_period : float | None ) -> set [str ]:
124
168
self .logger .info ("getting unknown files" )
125
169
@@ -152,10 +196,7 @@ def delete_unknown_files(
152
196
unknown_files = self .get_unknown_files (lock_validity_period )
153
197
for urlpath in unknown_files :
154
198
self .pop_file_size (urlpath )
155
- self .remove_files (
156
- list (unknown_files ),
157
- recursive = recursive ,
158
- )
199
+ _remove_files (self .fs , list (unknown_files ), recursive = recursive )
159
200
self .log_disk_usage ()
160
201
161
202
@staticmethod
@@ -207,30 +248,6 @@ def _get_method_sorters(
207
248
sorters .append (database .CacheEntry .expiration )
208
249
return sorters
209
250
210
- def remove_files (
211
- self ,
212
- files : list [str ],
213
- max_tries : int = 10 ,
214
- ** kwargs : Any ,
215
- ) -> None :
216
- assert max_tries >= 1
217
- if not files :
218
- return
219
-
220
- self .logger .info ("deleting files" , n_files_to_delete = len (files ), ** kwargs )
221
-
222
- n_tries = 0
223
- while files :
224
- n_tries += 1
225
- try :
226
- self .fs .rm (files , ** kwargs )
227
- return
228
- except FileNotFoundError :
229
- # Another concurrent process might have deleted files
230
- if n_tries >= max_tries :
231
- raise
232
- files = [file for file in files if self .fs .exists (file )]
233
-
234
251
def delete_cache_files (
235
252
self ,
236
253
maxsize : int ,
@@ -244,37 +261,27 @@ def delete_cache_files(
244
261
if self .stop_cleaning (maxsize ):
245
262
return
246
263
247
- files_to_delete = []
248
- dirs_to_delete = []
264
+ entries_to_delete = []
249
265
self .logger .info ("getting cache entries to delete" )
250
- n_entries_to_delete = 0
251
266
with config .get ().instantiated_sessionmaker () as session :
252
267
for cache_entry in session .scalars (
253
268
sa .select (database .CacheEntry ).filter (* filters ).order_by (* sorters )
254
269
):
255
270
files = _get_files_from_cache_entry (cache_entry )
256
271
if any (file .startswith (self .urldir ) for file in files ):
257
- n_entries_to_delete += 1
258
- session .delete (cache_entry )
259
-
260
- for file , file_type in files .items ():
272
+ entries_to_delete .append (cache_entry )
273
+ for file in files :
261
274
self .pop_file_size (file )
262
- if file_type == "application/vnd+zarr" :
263
- dirs_to_delete .append (file )
264
- else :
265
- files_to_delete .append (file )
266
275
267
276
if self .stop_cleaning (maxsize ):
268
277
break
269
278
270
- if n_entries_to_delete :
279
+ if entries_to_delete :
271
280
self .logger .info (
272
- "deleting cache entries" , n_entries_to_delete = n_entries_to_delete
281
+ "deleting cache entries" , n_entries_to_delete = len ( entries_to_delete )
273
282
)
274
- database . _commit_or_rollback (session )
283
+ _delete_cache_entries (session , * entries_to_delete )
275
284
276
- self .remove_files (files_to_delete , recursive = False )
277
- self .remove_files (dirs_to_delete , recursive = True )
278
285
self .log_disk_usage ()
279
286
280
287
if not self .stop_cleaning (maxsize ):
@@ -296,6 +303,7 @@ def clean_cache_files(
296
303
tags_to_clean : list [str | None ] | None = None ,
297
304
tags_to_keep : list [str | None ] | None = None ,
298
305
depth : int = 1 ,
306
+ use_database : bool = False ,
299
307
) -> None :
300
308
"""Clean cache files.
301
309
@@ -318,8 +326,15 @@ def clean_cache_files(
318
326
tags_to_clean and tags_to_keep are mutually exclusive.
319
327
depth: int, default: 1
320
328
depth for grouping cache files
329
+ use_database: bool, default: False
330
+ Whether to infer disk usage from the cacholote database
321
331
"""
322
- cleaner = _Cleaner (depth = depth )
332
+ if use_database and delete_unknown_files :
333
+ raise ValueError (
334
+ "'use_database' and 'delete_unknown_files' are mutually exclusive"
335
+ )
336
+
337
+ cleaner = _Cleaner (depth = depth , use_database = use_database )
323
338
324
339
if delete_unknown_files :
325
340
cleaner .delete_unknown_files (lock_validity_period , recursive )
@@ -352,15 +367,15 @@ def clean_invalid_cache_entries(
352
367
for cache_entry in session .scalars (
353
368
sa .select (database .CacheEntry ).filter (* filters )
354
369
):
355
- _delete_cache_entry (session , cache_entry )
370
+ _delete_cache_entries (session , cache_entry )
356
371
357
372
if try_decode :
358
373
with config .get ().instantiated_sessionmaker () as session :
359
374
for cache_entry in session .scalars (sa .select (database .CacheEntry )):
360
375
try :
361
376
decode .loads (cache_entry ._result_as_string )
362
377
except decode .DecodeError :
363
- _delete_cache_entry (session , cache_entry )
378
+ _delete_cache_entries (session , cache_entry )
364
379
365
380
366
381
def expire_cache_entries (
@@ -379,15 +394,14 @@ def expire_cache_entries(
379
394
if after is not None :
380
395
filters .append (database .CacheEntry .created_at > after )
381
396
382
- count = 0
383
397
with config .get ().instantiated_sessionmaker () as session :
384
- for cache_entry in session . scalars (
385
- sa .select (database .CacheEntry ).filter (* filters )
386
- ):
387
- count += 1
388
- if delete :
389
- session . delete ( cache_entry )
390
- else :
398
+ cache_entries = list (
399
+ session . scalars ( sa .select (database .CacheEntry ).filter (* filters ) )
400
+ )
401
+ if delete :
402
+ _delete_cache_entries ( session , * cache_entries )
403
+ else :
404
+ for cache_entry in cache_entries :
391
405
cache_entry .expiration = now
392
- database ._commit_or_rollback (session )
393
- return count
406
+ database ._commit_or_rollback (session )
407
+ return len ( cache_entries )
0 commit comments