Skip to content

Commit 9665857

Browse files
authored
Switch to rich for progress (#94)
* Switch to `rich` for progress
* CHANGELOG
* fix
* more fixes
* fix
* allow sized progress bars
* Clean up
* clean up
* fix
* fix
* mypy is awful
* Fix up the wrapper file class
1 parent c08c9e7 commit 9665857

20 files changed

+315
-214
lines changed

CHANGELOG.md

+13
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## Unreleased
99

10+
### Added
11+
12+
- Added `quiet` parameter to `cached_path()` for turning off progress displays, and `progress` parameter for customizing displays.
13+
- Added `SchemeClient.get_size()` method.
14+
15+
### Changed
16+
17+
- Switched to `rich` for progress displays, removed dependency on `tqdm`.
18+
19+
### Removed
20+
21+
- Removed `file_friendly_logging()` function.
22+
1023
## [v1.1.2](https://github.com/allenai/cached_path/releases/tag/v1.1.2) - 2022-04-08
1124

1225
## [v1.1.1](https://github.com/allenai/cached_path/releases/tag/v1.1.1) - 2022-03-25

cached_path/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
"""
1212

1313
from ._cached_path import cached_path
14-
from .common import file_friendly_logging, get_cache_dir, set_cache_dir
14+
from .common import get_cache_dir, set_cache_dir
15+
from .progress import get_download_progress
1516
from .schemes import SchemeClient, add_scheme_client
1617
from .util import (
1718
check_tarfile,

cached_path/_cached_path.py

+51-5
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import tarfile
55
import tempfile
66
from pathlib import Path
7-
from typing import Optional, Tuple
7+
from typing import TYPE_CHECKING, Optional, Tuple
88
from urllib.parse import urlparse
99
from zipfile import ZipFile, is_zipfile
1010

@@ -21,6 +21,9 @@
2121
resource_to_filename,
2222
)
2323

24+
if TYPE_CHECKING:
25+
from rich.progress import Progress
26+
2427
logger = logging.getLogger("cached_path")
2528

2629

@@ -29,6 +32,8 @@ def cached_path(
2932
cache_dir: Optional[PathOrStr] = None,
3033
extract_archive: bool = False,
3134
force_extract: bool = False,
35+
quiet: bool = False,
36+
progress: Optional["Progress"] = None,
3237
) -> Path:
3338
"""
3439
Given something that might be a URL or local path, determine which.
@@ -97,6 +102,13 @@ def cached_path(
97102
Use this flag with caution! This can lead to race conditions if used
98103
from multiple processes on the same file.
99104
105+
quiet :
106+
If ``True``, progress displays won't be printed.
107+
108+
progress :
109+
A custom progress display to use. If not set and ``quiet=False``, a default display
110+
from :func:`~cached_path.get_download_progress()` will be used.
111+
100112
Returns
101113
-------
102114
:class:`pathlib.Path`
@@ -133,7 +145,14 @@ def cached_path(
133145
file_name = url_or_filename[exclamation_index + 1 :]
134146

135147
# Call 'cached_path' recursively now to get the local path to the archive itself.
136-
cached_archive_path = cached_path(archive_path, cache_dir, True, force_extract)
148+
cached_archive_path = cached_path(
149+
archive_path,
150+
cache_dir=cache_dir,
151+
extract_archive=True,
152+
force_extract=force_extract,
153+
quiet=quiet,
154+
progress=progress,
155+
)
137156
if not cached_archive_path.is_dir():
138157
raise ValueError(
139158
f"{url_or_filename} uses the ! syntax, but does not specify an archive file."
@@ -151,7 +170,7 @@ def cached_path(
151170

152171
if parsed.scheme in get_supported_schemes():
153172
# URL, so get it from the cache (downloading if necessary)
154-
file_path, etag = get_from_cache(url_or_filename, cache_dir)
173+
file_path, etag = get_from_cache(url_or_filename, cache_dir, quiet=quiet, progress=progress)
155174

156175
if extract_archive and (is_zipfile(file_path) or tarfile.is_tarfile(file_path)):
157176
# This is the path the file should be extracted to.
@@ -243,7 +262,12 @@ def cached_path(
243262
return file_path
244263

245264

246-
def get_from_cache(url: str, cache_dir: Optional[PathOrStr] = None) -> Tuple[Path, Optional[str]]:
265+
def get_from_cache(
266+
url: str,
267+
cache_dir: Optional[PathOrStr] = None,
268+
quiet: bool = False,
269+
progress: Optional["Progress"] = None,
270+
) -> Tuple[Path, Optional[str]]:
247271
"""
248272
Given a URL, look for the corresponding dataset in the local cache.
249273
If it's not there, download it. Then return the path to the cached file and the ETag.
@@ -301,9 +325,31 @@ def get_from_cache(url: str, cache_dir: Optional[PathOrStr] = None) -> Tuple[Pat
301325
if os.path.exists(cache_path):
302326
logger.info("cache of %s is up-to-date", url)
303327
else:
328+
size = client.get_size()
304329
with CacheFile(cache_path) as cache_file:
305330
logger.info("%s not found in cache, downloading to %s", url, cache_path)
306-
client.get_resource(cache_file)
331+
332+
from .progress import BufferedWriterWithProgress, get_download_progress
333+
334+
start_and_cleanup = progress is None
335+
progress = progress or get_download_progress(quiet=quiet)
336+
337+
if start_and_cleanup:
338+
progress.start()
339+
340+
try:
341+
display_url = url if len(url) <= 50 else f"{url[:49]}\N{horizontal ellipsis}"
342+
task_id = progress.add_task(f"Downloading [cyan i]{display_url}[/]", total=size)
343+
writer_with_progress = BufferedWriterWithProgress(cache_file, progress, task_id)
344+
client.get_resource(writer_with_progress)
345+
progress.update(
346+
task_id,
347+
total=writer_with_progress.total_written,
348+
completed=writer_with_progress.total_written,
349+
)
350+
finally:
351+
if start_and_cleanup:
352+
progress.stop()
307353

308354
logger.debug("creating metadata file for %s", cache_path)
309355
meta = Meta.new(

cached_path/common.py

-21
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,6 @@
1414
"""
1515

1616

17-
def _parse_bool(value: Union[bool, str]) -> bool:
18-
if isinstance(value, bool):
19-
return value
20-
if value in {"1", "true", "True", "TRUE"}:
21-
return True
22-
return False
23-
24-
25-
FILE_FRIENDLY_LOGGING: bool = _parse_bool(os.environ.get("FILE_FRIENDLY_LOGGING", False))
26-
27-
2817
def _split_cloud_path(url: str, provider: str) -> Tuple[str, str]:
2918
"""Split a full s3 path into the bucket name and path."""
3019
parsed = urlparse(url)
@@ -51,13 +40,3 @@ def get_cache_dir() -> Path:
5140
Get the global default cache directory.
5241
"""
5342
return Path(CACHE_DIRECTORY)
54-
55-
56-
def file_friendly_logging(on: bool = True) -> None:
57-
"""
58-
Turn on (or off) file-friendly logging globally.
59-
60-
You can also control this through the environment variable `FILE_FRIENDLY_LOGGING`.
61-
"""
62-
global FILE_FRIENDLY_LOGGING
63-
FILE_FRIENDLY_LOGGING = on

cached_path/progress.py

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import io
2+
from typing import List, Optional
3+
4+
from rich.progress import BarColumn, DownloadColumn, Progress, TaskID, TimeElapsedColumn
5+
6+
7+
class BufferedWriterWithProgress(io.BufferedWriter):
    """
    A wrapper around an ``io.BufferedWriter`` that reports bytes written
    to a ``rich`` progress display.

    Every successful :meth:`write` advances the associated progress task by
    the number of bytes actually written, and :meth:`seek` re-syncs the
    task's ``completed`` count with the new file position. All other I/O
    operations are delegated verbatim to the wrapped ``handle``.

    .. note::
        This subclasses ``io.BufferedWriter`` so it passes ``isinstance``
        checks, but it deliberately never calls ``super().__init__()`` —
        every operation is forwarded to ``handle`` instead.
    """

    def __init__(self, handle: io.BufferedWriter, progress: "Progress", task_id: "TaskID"):
        # The wrapped writer that actually receives the bytes.
        self.handle = handle
        # The rich progress display and the task within it to advance.
        self.progress = progress
        self.task_id = task_id
        # Running total of bytes written through this wrapper.
        self.total_written = 0

    def __enter__(self) -> "BufferedWriterWithProgress":
        self.handle.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    @property
    def closed(self) -> bool:
        return self.handle.closed

    def close(self):
        self.handle.close()

    def fileno(self):
        return self.handle.fileno()

    def flush(self):
        self.handle.flush()

    def isatty(self) -> bool:
        return self.handle.isatty()

    def readable(self) -> bool:
        return self.handle.readable()

    def seekable(self) -> bool:
        return self.handle.seekable()

    def writable(self) -> bool:
        return True

    def read(self, size: Optional[int] = -1) -> bytes:
        return self.handle.read(size)

    def read1(self, size: Optional[int] = -1) -> bytes:
        # Fix: forward ``size`` to the underlying handle — it was previously
        # dropped, so explicit-size ``read1`` calls were silently ignored.
        return self.handle.read1(size)

    def readinto(self, b):
        return self.handle.readinto(b)

    def readinto1(self, b):
        return self.handle.readinto1(b)

    def readline(self, size: Optional[int] = -1) -> bytes:
        return self.handle.readline(size)

    def readlines(self, hint: int = -1) -> List[bytes]:
        return self.handle.readlines(hint)

    def write(self, b) -> int:
        n = self.handle.write(b)
        self.total_written += n
        # Advance the progress task by the bytes actually accepted.
        self.progress.advance(self.task_id, n)
        return n

    def writelines(self, lines):
        return self.handle.writelines(lines)

    def seek(self, offset: int, whence: int = 0) -> int:
        pos = self.handle.seek(offset, whence)
        # Keep the progress display consistent with the new file position.
        self.progress.update(self.task_id, completed=pos)
        return pos

    def tell(self) -> int:
        return self.handle.tell()

    @property
    def raw(self):
        return self.handle.raw

    def detach(self):
        return self.handle.detach()
87+
88+
89+
def get_download_progress(quiet: bool = False) -> Progress:
    """
    Build the default ``rich`` progress display used for downloads.

    :param quiet: when ``True`` the display is created disabled, so nothing
        is rendered to the terminal.
    """
    columns = (
        "[progress.description]{task.description}",
        BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        TimeElapsedColumn(),
        DownloadColumn(),
    )
    return Progress(*columns, disable=quiet)

cached_path/schemes/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def add_scheme_client(client: Type[SchemeClient]) -> None:
2727

2828

2929
for client in (HttpClient, S3Client, GsClient):
30-
add_scheme_client(client)
30+
add_scheme_client(client) # type: ignore
3131

3232

3333
def get_scheme_client(resource: str) -> SchemeClient:

cached_path/schemes/gs.py

+19-17
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Google Cloud Storage.
33
"""
44

5-
from typing import IO, Optional, Tuple
5+
import io
6+
from typing import Optional, Tuple
67

78
from google.api_core.exceptions import NotFound
89
from google.auth.exceptions import DefaultCredentialsError
@@ -11,7 +12,6 @@
1112

1213
from cached_path.common import _split_cloud_path
1314
from cached_path.schemes.scheme_client import SchemeClient
14-
from cached_path.tqdm import Tqdm
1515

1616

1717
class GsClient(SchemeClient):
@@ -20,25 +20,27 @@ class GsClient(SchemeClient):
2020
def __init__(self, resource: str) -> None:
2121
super().__init__(resource)
2222
self.blob = GsClient.get_gcs_blob(resource)
23+
self._loaded = False
24+
25+
def load(self):
26+
if not self._loaded:
27+
try:
28+
self.blob.reload()
29+
self._loaded = True
30+
except NotFound:
31+
raise FileNotFoundError(self.resource)
2332

2433
def get_etag(self) -> Optional[str]:
25-
try:
26-
self.blob.reload()
27-
except NotFound:
28-
raise FileNotFoundError(self.resource)
34+
self.load()
2935
return self.blob.etag or self.blob.md5_hash
3036

31-
def get_resource(self, temp_file: IO) -> None:
32-
with Tqdm.wrapattr(
33-
temp_file,
34-
"write",
35-
unit="iB",
36-
unit_scale=True,
37-
unit_divisor=1024,
38-
total=self.blob.size,
39-
desc="downloading",
40-
) as file_obj:
41-
self.blob.download_to_file(file_obj, checksum="md5", retry=DEFAULT_RETRY)
37+
def get_size(self) -> Optional[int]:
38+
self.load()
39+
return self.blob.size
40+
41+
def get_resource(self, temp_file: io.BufferedWriter) -> None:
42+
self.load()
43+
self.blob.download_to_file(temp_file, checksum="md5", retry=DEFAULT_RETRY)
4244

4345
@staticmethod
4446
def split_gcs_path(resource: str) -> Tuple[str, str]:

0 commit comments

Comments
 (0)