@@ -2,15 +2,11 @@
 Utilities for working with the local dataset cache.
 """
 import string
-import weakref
-from contextlib import contextmanager
 import glob
-import io
 import os
 import logging
 import tempfile
 import json
-from abc import ABC
 from collections import defaultdict
 from dataclasses import dataclass, asdict
 from datetime import timedelta
@@ -26,15 +22,11 @@
     Callable,
     Set,
     List,
-    Iterator,
-    Iterable,
     Dict,
     NamedTuple,
-    MutableMapping,
 )
 from hashlib import sha256
 from functools import wraps
-from weakref import WeakValueDictionary
 from zipfile import ZipFile, is_zipfile
 import tarfile
 import shutil
@@ -667,11 +659,10 @@ def _hf_hub_download(

     if filename is not None:
         hub_url = hf_hub.hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
-        # TODO: change library name?
         cache_path = str(
             hf_hub.cached_download(
                 url=hub_url,
-                library_name="allennlp",
+                library_name="cached_path",
                 library_version=VERSION,
                 cache_dir=cache_dir,
             )
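`huggingface_hub` folds `library_name` and `library_version` into the user agent of the download request, which is how Hub traffic gets attributed to the calling library; this hunk simply re-attributes downloads from `allennlp` to `cached_path`. A minimal sketch of the resolve-then-download flow configured here, using the legacy `cached_download` API of `huggingface_hub` 0.x (the repo id, filename, and version string are placeholders):

```python
import huggingface_hub as hf_hub

# Resolve a file in a Hub repo to a concrete URL (placeholder repo/filename).
url = hf_hub.hf_hub_url(repo_id="bert-base-uncased", filename="config.json")

# Download through the shared HF cache; library_name/library_version are
# reported in the user agent so downloads are attributed to "cached_path".
local_path = hf_hub.cached_download(
    url=url,
    library_name="cached_path",
    library_version="1.0.0",  # placeholder for this package's VERSION
)
print(local_path)
```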
@@ -831,24 +822,6 @@ def get_file_extension(path: str, dot=True, lower: bool = True):
     return ext.lower() if lower else ext


-def open_compressed(
-    filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs
-):
-    if not isinstance(filename, str):
-        filename = str(filename)
-    open_fn: Callable = open
-
-    if filename.endswith(".gz"):
-        import gzip
-
-        open_fn = gzip.open
-    elif filename.endswith(".bz2"):
-        import bz2
-
-        open_fn = bz2.open
-    return open_fn(get_cached_path(filename), mode=mode, encoding=encoding, **kwargs)
-
-
 def _get_resource_size(path: str) -> int:
     """
     Get the size of a file or directory.
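The `open_compressed` helper removed in the hunk above picked an opener by file extension and resolved its argument through the cache first. Downstream code that relied on it can reproduce the behavior with the standard library alone; a minimal sketch, minus the cache-resolution step (`corpus.txt.gz` is a placeholder path):

```python
import bz2
import gzip
from typing import Callable, Optional

def open_compressed(filename: str, mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs):
    """Open a local file, transparently decompressing .gz and .bz2."""
    open_fn: Callable = open
    if filename.endswith(".gz"):
        open_fn = gzip.open
    elif filename.endswith(".bz2"):
        open_fn = bz2.open
    return open_fn(filename, mode=mode, encoding=encoding, **kwargs)

with open_compressed("corpus.txt.gz") as f:
    print(f.readline())
```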
@@ -867,117 +840,3 @@ def _get_resource_size(path: str) -> int:
                 inodes.add(inode)
                 total_size += os.path.getsize(fp)
     return total_size
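The inode set visible in this context exists so a file reachable through several hard links is only counted once. A self-contained sketch of the same technique, assuming the elided body walks the tree with `os.walk` (`directory_size` is a hypothetical name, not this module's API):

```python
import os
from typing import Set

def directory_size(path: str) -> int:
    """Total size in bytes, counting each hard-linked file only once."""
    inodes: Set[int] = set()
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            inode = os.stat(fp).st_ino
            # Skip symlinks and inodes already seen (i.e. hard links).
            if not os.path.islink(fp) and inode not in inodes:
                inodes.add(inode)
                total_size += os.path.getsize(fp)
    return total_size
```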
-
-
-class _CacheEntry(NamedTuple):
-    regular_files: List[_Meta]
-    extraction_dirs: List[_Meta]
-
-
-def _find_entries(
-    patterns: List[str] = None,
-    cache_dir: Union[str, Path] = None,
-) -> Tuple[int, Dict[str, _CacheEntry]]:
-    """
-    Find all cache entries, filtering out ones that don't match any of the glob patterns given.
-
-    Returns the total size of the matching entries and a mapping of resource name to meta data.
-
-    The values in the returned mapping are tuples because we separate meta entries that
-    correspond to extraction directories vs regular cache entries.
-    """
-    cache_dir = os.path.expanduser(cache_dir or CACHE_DIRECTORY)
-
-    total_size: int = 0
-    cache_entries: Dict[str, _CacheEntry] = defaultdict(lambda: _CacheEntry([], []))
-    for meta_path in glob.glob(str(cache_dir) + "/*.json"):
-        meta = _Meta.from_path(meta_path)
-        if patterns and not any(fnmatch(meta.resource, p) for p in patterns):
-            continue
-        if meta.extraction_dir:
-            cache_entries[meta.resource].extraction_dirs.append(meta)
-        else:
-            cache_entries[meta.resource].regular_files.append(meta)
-        total_size += meta.size
-
-    # Sort entries for each resource by creation time, newest first.
-    for entry in cache_entries.values():
-        entry.regular_files.sort(key=lambda meta: meta.creation_time, reverse=True)
-        entry.extraction_dirs.sort(key=lambda meta: meta.creation_time, reverse=True)
-
-    return total_size, cache_entries
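Since `_find_entries` drives both `remove_cache_entries` and `inspect_cache` below, a hypothetical call shows the shape of what it returns (the pattern is an fnmatch-style glob matched against each resource URL):

```python
# Hypothetical: summarize cached entries from one host.
total, entries = _find_entries(patterns=["https://huggingface.co/*"])
for resource, entry in entries.items():
    print(
        resource,
        f"{len(entry.regular_files)} cached file(s),",
        f"{len(entry.extraction_dirs)} extraction dir(s)",
    )
print(f"{total} bytes matched")
```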
-
-
-def remove_cache_entries(patterns: List[str], cache_dir: Union[str, Path] = None) -> int:
-    """
-    Remove cache entries matching the given patterns.
-
-    Returns the total reclaimed space in bytes.
-    """
-    total_size, cache_entries = _find_entries(patterns=patterns, cache_dir=cache_dir)
-    for resource, entry in cache_entries.items():
-        for meta in entry.regular_files:
-            logger.info("Removing cached version of %s at %s", resource, meta.cached_path)
-            os.remove(meta.cached_path)
-            if os.path.exists(meta.cached_path + ".lock"):
-                os.remove(meta.cached_path + ".lock")
-            os.remove(meta.cached_path + ".json")
-        for meta in entry.extraction_dirs:
-            logger.info("Removing extracted version of %s at %s", resource, meta.cached_path)
-            shutil.rmtree(meta.cached_path)
-            if os.path.exists(meta.cached_path + ".lock"):
-                os.remove(meta.cached_path + ".lock")
-            os.remove(meta.cached_path + ".json")
-    return total_size
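For each matching entry this removes the payload (file or extraction directory), its `.lock` file if present, and its `.json` meta file. A hypothetical cleanup call:

```python
# Hypothetical: purge everything cached from one host and report savings.
reclaimed = remove_cache_entries(["https://huggingface.co/*"])
print(f"Reclaimed {reclaimed} bytes")
```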
-
-
-def inspect_cache(patterns: List[str] = None, cache_dir: Union[str, Path] = None):
-    """
-    Print out useful information about the cache directory.
-    """
-    from allennlp.common.util import format_timedelta, format_size
-
-    cache_dir = os.path.expanduser(cache_dir or CACHE_DIRECTORY)
-
-    # Gather cache entries by resource.
-    total_size, cache_entries = _find_entries(patterns=patterns, cache_dir=cache_dir)
-
-    if patterns:
-        print(f"Cached resources matching {patterns}:")
-    else:
-        print("Cached resources:")
-
-    for resource, entry in sorted(
-        cache_entries.items(),
-        # Sort by creation time, latest first.
-        key=lambda x: max(
-            0 if not x[1][0] else x[1][0][0].creation_time,
-            0 if not x[1][1] else x[1][1][0].creation_time,
-        ),
-        reverse=True,
-    ):
-        print("\n-", resource)
-        if entry.regular_files:
-            td = timedelta(seconds=time.time() - entry.regular_files[0].creation_time)
-            n_versions = len(entry.regular_files)
-            size = entry.regular_files[0].size
-            print(
-                f"  {n_versions} {'versions' if n_versions > 1 else 'version'} cached, "
-                f"latest {format_size(size)} from {format_timedelta(td)} ago"
-            )
-        if entry.extraction_dirs:
-            td = timedelta(seconds=time.time() - entry.extraction_dirs[0].creation_time)
-            n_versions = len(entry.extraction_dirs)
-            size = entry.extraction_dirs[0].size
-            print(
-                f"  {n_versions} {'versions' if n_versions > 1 else 'version'} extracted, "
-                f"latest {format_size(size)} from {format_timedelta(td)} ago"
-            )
-    print(f"\nTotal size: {format_size(total_size)}")
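`inspect_cache` is a reporting wrapper over `_find_entries`; hypothetical calls look like:

```python
# Print a summary of everything in the default cache directory...
inspect_cache()

# ...or only of resources matching fnmatch-style globs (hypothetical patterns).
inspect_cache(patterns=["https://huggingface.co/*", "*.tar.gz"])
```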
-
-
-SAFE_FILENAME_CHARS = frozenset("-_.%s%s" % (string.ascii_letters, string.digits))
-
-
-def filename_is_safe(filename: str) -> bool:
-    return all(c in SAFE_FILENAME_CHARS for c in filename)
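The allow-list admits only ASCII letters, digits, dash, underscore, and dot, so anything containing path separators or whitespace fails the check. Restated as a standalone demo:

```python
import string

SAFE_FILENAME_CHARS = frozenset("-_.%s%s" % (string.ascii_letters, string.digits))

def filename_is_safe(filename: str) -> bool:
    return all(c in SAFE_FILENAME_CHARS for c in filename)

assert filename_is_safe("model-v1.0_weights.bin")
assert not filename_is_safe("../etc/passwd")    # path separators rejected
assert not filename_is_safe("weights final")    # whitespace rejected
```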