@@ -2,15 +2,11 @@
 Utilities for working with the local dataset cache.
 """
 import string
-import weakref
-from contextlib import contextmanager
 import glob
-import io
 import os
 import logging
 import tempfile
 import json
-from abc import ABC
 from collections import defaultdict
 from dataclasses import dataclass, asdict
 from datetime import timedelta
@@ -26,15 +22,11 @@
     Callable,
     Set,
     List,
-    Iterator,
-    Iterable,
     Dict,
     NamedTuple,
-    MutableMapping,
 )
 from hashlib import sha256
 from functools import wraps
-from weakref import WeakValueDictionary
 from zipfile import ZipFile, is_zipfile
 import tarfile
 import shutil
@@ -667,11 +659,10 @@ def _hf_hub_download(

     if filename is not None:
         hub_url = hf_hub.hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
-        # TODO: change library name?
         cache_path = str(
             hf_hub.cached_download(
                 url=hub_url,
-                library_name="allennlp",
+                library_name="cached_path",
                 library_version=VERSION,
                 cache_dir=cache_dir,
             )
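`huggingface_hub` folds `library_name` and `library_version` into the user agent of the download request, which is how Hub traffic gets attributed to the calling library; this hunk simply re-attributes downloads from `allennlp` to `cached_path`. A minimal sketch of the resolve-then-download flow configured here, using the legacy `cached_download` API of `huggingface_hub` 0.x (the repo id, filename, and version string are placeholders):

```python
import huggingface_hub as hf_hub

# Resolve a file in a Hub repo to a concrete URL (placeholder repo/filename).
url = hf_hub.hf_hub_url(repo_id="bert-base-uncased", filename="config.json")

# Download through the shared HF cache; library_name/library_version are
# reported in the user agent so downloads are attributed to "cached_path".
local_path = hf_hub.cached_download(
    url=url,
    library_name="cached_path",
    library_version="1.0.0",  # placeholder for this package's VERSION
)
print(local_path)
```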
@@ -831,24 +822,6 @@ def get_file_extension(path: str, dot=True, lower: bool = True):
     return ext.lower() if lower else ext


-def open_compressed(
-    filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs
-):
-    if not isinstance(filename, str):
-        filename = str(filename)
-    open_fn: Callable = open
-
-    if filename.endswith(".gz"):
-        import gzip
-
-        open_fn = gzip.open
-    elif filename.endswith(".bz2"):
-        import bz2
-
-        open_fn = bz2.open
-    return open_fn(get_cached_path(filename), mode=mode, encoding=encoding, **kwargs)
-
-
 def _get_resource_size(path: str) -> int:
     """
     Get the size of a file or directory.
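The `open_compressed` helper removed in the hunk above picked an opener by file extension and resolved its argument through the cache first. Downstream code that relied on it can reproduce the behavior with the standard library alone; a minimal sketch, minus the cache-resolution step (`corpus.txt.gz` is a placeholder path):

```python
import bz2
import gzip
from typing import Callable, Optional

def open_compressed(filename: str, mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs):
    """Open a local file, transparently decompressing .gz and .bz2."""
    open_fn: Callable = open
    if filename.endswith(".gz"):
        open_fn = gzip.open
    elif filename.endswith(".bz2"):
        open_fn = bz2.open
    return open_fn(filename, mode=mode, encoding=encoding, **kwargs)

with open_compressed("corpus.txt.gz") as f:
    print(f.readline())
```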
@@ -867,117 +840,3 @@ def _get_resource_size(path: str) -> int:
                 inodes.add(inode)
                 total_size += os.path.getsize(fp)
     return total_size
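The inode set visible in this context exists so a file reachable through several hard links is only counted once. A self-contained sketch of the same technique, assuming the elided body walks the tree with `os.walk` (`directory_size` is a hypothetical name, not this module's API):

```python
import os
from typing import Set

def directory_size(path: str) -> int:
    """Total size in bytes, counting each hard-linked file only once."""
    inodes: Set[int] = set()
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            inode = os.stat(fp).st_ino
            # Skip symlinks and inodes already seen (i.e. hard links).
            if not os.path.islink(fp) and inode not in inodes:
                inodes.add(inode)
                total_size += os.path.getsize(fp)
    return total_size
```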
-
-
-class _CacheEntry(NamedTuple):
-    regular_files: List[_Meta]
-    extraction_dirs: List[_Meta]
-
-
-def _find_entries(
-    patterns: List[str] = None,
-    cache_dir: Union[str, Path] = None,
-) -> Tuple[int, Dict[str, _CacheEntry]]:
-    """
-    Find all cache entries, filtering out ones that don't match any of the glob patterns given.
-
-    Returns the total size of the matching entries and a mapping of resource name to meta data.
-
-    The values in the returned mapping are tuples because we separate meta entries that
-    correspond to extraction directories vs regular cache entries.
-    """
-    cache_dir = os.path.expanduser(cache_dir or CACHE_DIRECTORY)
-
-    total_size: int = 0
-    cache_entries: Dict[str, _CacheEntry] = defaultdict(lambda: _CacheEntry([], []))
-    for meta_path in glob.glob(str(cache_dir) + "/*.json"):
-        meta = _Meta.from_path(meta_path)
-        if patterns and not any(fnmatch(meta.resource, p) for p in patterns):
-            continue
-        if meta.extraction_dir:
-            cache_entries[meta.resource].extraction_dirs.append(meta)
-        else:
-            cache_entries[meta.resource].regular_files.append(meta)
-        total_size += meta.size
-
-    # Sort entries for each resource by creation time, newest first.
-    for entry in cache_entries.values():
-        entry.regular_files.sort(key=lambda meta: meta.creation_time, reverse=True)
-        entry.extraction_dirs.sort(key=lambda meta: meta.creation_time, reverse=True)
-
-    return total_size, cache_entries
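Since `_find_entries` drives both `remove_cache_entries` and `inspect_cache` below, a hypothetical call shows the shape of what it returns (the pattern is an fnmatch-style glob matched against each resource URL):

```python
# Hypothetical: summarize cached entries from one host.
total, entries = _find_entries(patterns=["https://huggingface.co/*"])
for resource, entry in entries.items():
    print(
        resource,
        f"{len(entry.regular_files)} cached file(s),",
        f"{len(entry.extraction_dirs)} extraction dir(s)",
    )
print(f"{total} bytes matched")
```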
-
-
-def remove_cache_entries(patterns: List[str], cache_dir: Union[str, Path] = None) -> int:
-    """
-    Remove cache entries matching the given patterns.
-
-    Returns the total reclaimed space in bytes.
-    """
-    total_size, cache_entries = _find_entries(patterns=patterns, cache_dir=cache_dir)
-    for resource, entry in cache_entries.items():
-        for meta in entry.regular_files:
-            logger.info("Removing cached version of %s at %s", resource, meta.cached_path)
-            os.remove(meta.cached_path)
-            if os.path.exists(meta.cached_path + ".lock"):
-                os.remove(meta.cached_path + ".lock")
-            os.remove(meta.cached_path + ".json")
-        for meta in entry.extraction_dirs:
-            logger.info("Removing extracted version of %s at %s", resource, meta.cached_path)
-            shutil.rmtree(meta.cached_path)
-            if os.path.exists(meta.cached_path + ".lock"):
-                os.remove(meta.cached_path + ".lock")
-            os.remove(meta.cached_path + ".json")
-    return total_size
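For each matching entry this removes the payload (file or extraction directory), its `.lock` file if present, and its `.json` meta file. A hypothetical cleanup call:

```python
# Hypothetical: purge everything cached from one host and report savings.
reclaimed = remove_cache_entries(["https://huggingface.co/*"])
print(f"Reclaimed {reclaimed} bytes")
```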
-
-
-def inspect_cache(patterns: List[str] = None, cache_dir: Union[str, Path] = None):
-    """
-    Print out useful information about the cache directory.
-    """
-    from allennlp.common.util import format_timedelta, format_size
-
-    cache_dir = os.path.expanduser(cache_dir or CACHE_DIRECTORY)
-
-    # Gather cache entries by resource.
-    total_size, cache_entries = _find_entries(patterns=patterns, cache_dir=cache_dir)
-
-    if patterns:
-        print(f"Cached resources matching {patterns}:")
-    else:
-        print("Cached resources:")
-
-    for resource, entry in sorted(
-        cache_entries.items(),
-        # Sort by creation time, latest first.
-        key=lambda x: max(
-            0 if not x[1][0] else x[1][0][0].creation_time,
-            0 if not x[1][1] else x[1][1][0].creation_time,
-        ),
-        reverse=True,
-    ):
-        print("\n-", resource)
-        if entry.regular_files:
-            td = timedelta(seconds=time.time() - entry.regular_files[0].creation_time)
-            n_versions = len(entry.regular_files)
-            size = entry.regular_files[0].size
-            print(
-                f"  {n_versions} {'versions' if n_versions > 1 else 'version'} cached, "
-                f"latest {format_size(size)} from {format_timedelta(td)} ago"
-            )
-        if entry.extraction_dirs:
-            td = timedelta(seconds=time.time() - entry.extraction_dirs[0].creation_time)
-            n_versions = len(entry.extraction_dirs)
-            size = entry.extraction_dirs[0].size
-            print(
-                f"  {n_versions} {'versions' if n_versions > 1 else 'version'} extracted, "
-                f"latest {format_size(size)} from {format_timedelta(td)} ago"
-            )
-    print(f"\nTotal size: {format_size(total_size)}")
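`inspect_cache` is a reporting wrapper over `_find_entries`; hypothetical calls look like:

```python
# Print a summary of everything in the default cache directory...
inspect_cache()

# ...or only of resources matching fnmatch-style globs (hypothetical patterns).
inspect_cache(patterns=["https://huggingface.co/*", "*.tar.gz"])
```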
-
-
-SAFE_FILENAME_CHARS = frozenset("-_.%s%s" % (string.ascii_letters, string.digits))
-
-
-def filename_is_safe(filename: str) -> bool:
-    return all(c in SAFE_FILENAME_CHARS for c in filename)
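The allow-list admits only ASCII letters, digits, dash, underscore, and dot, so anything containing path separators or whitespace fails the check. Restated as a standalone demo:

```python
import string

SAFE_FILENAME_CHARS = frozenset("-_.%s%s" % (string.ascii_letters, string.digits))

def filename_is_safe(filename: str) -> bool:
    return all(c in SAFE_FILENAME_CHARS for c in filename)

assert filename_is_safe("model-v1.0_weights.bin")
assert not filename_is_safe("../etc/passwd")    # path separators rejected
assert not filename_is_safe("weights final")    # whitespace rejected
```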