This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Hf hub #5052

Merged: 13 commits, Apr 20, 2021
Changes from 6 commits
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -9,14 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added support for the HuggingFace Hub as an alternative way to handle loading files.
Member:

If we go with huggingface:// or hf:// type urls, we should mention that here.

Contributor Author:

I went with hf://! I updated the changelog.

- The test for distributed metrics now takes a parameter specifying how often you want to run it.


## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14

### Added

- Ported the following Huggingface `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
- Ported the following HuggingFace `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
Member:

Ah, good catch! 😉

- Added new `sub_token_mode` parameter to `pretrained_transformer_mismatched_embedder` class to support first sub-token embedding
- Added a way to run a multi task model with a dataset reader as part of `allennlp predict`.
- Added new `eval_mode` in `PretrainedTransformerEmbedder`. If it is set to `True`, the transformer is _always_ run in evaluation mode, which, e.g., disables dropout and does not update batch normalization statistics.
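Following the hf:// discussion above, here is a minimal sketch of how a Hub-hosted file could be pulled through cached_path, assuming the behavior added in the file_utils.py diff below. The lysandre/test-elmo-tiny repo is the one used in this PR's tests; at the commits shown here the bare user/repo/filename form is what is implemented (the explicit hf:// prefix mentioned in the comment landed in a later commit).

```python
from allennlp.common.file_utils import cached_path

# Single file from a Hub repo: "user/repo/filename" resolves options.json from
# lysandre/test-elmo-tiny via hf_hub_url + cached_download and returns a local path.
options_file = cached_path("lysandre/test-elmo-tiny/options.json")

# With no filename, the whole repo is fetched with snapshot_download and the
# local snapshot directory is used instead.
model_dir = cached_path("lysandre/test-elmo-tiny")
```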
46 changes: 41 additions & 5 deletions allennlp/common/file_utils.py
@@ -50,10 +50,12 @@
from overrides import overrides
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError
from requests.exceptions import ConnectionError, HTTPError
from requests.packages.urllib3.util.retry import Retry
import lmdb
from torch import Tensor
from huggingface_hub import hf_hub_url, cached_download, snapshot_download
from allennlp.version import VERSION

from allennlp.common.tqdm import Tqdm

Expand Down Expand Up @@ -233,9 +235,45 @@ def cached_path(
    cache_dir = os.path.expanduser(cache_dir)
    os.makedirs(cache_dir, exist_ok=True)

    extraction_path: Optional[str] = None

    if not isinstance(url_or_filename, str):
        url_or_filename = str(url_or_filename)

    if not os.path.isfile(url_or_filename) and not urlparse(url_or_filename).scheme in ("http", "https", "s3"):
Member:

I am a little worried about this. It's not transparent to a user that the behavior of this hinges on the presence or absence of a local file in their current working directory. It also makes the AllenNLP configuration files a lot less self-contained.

Do you think we could treat this like a URL, and make it huggingface://lysandre/pair-classification-roberta-mnli or hf://lysandre/pair-classification-roberta-mnli or even hub://lysandre/pair-classification-roberta-mnli?

Contributor Author:

Yes, we could definitely treat it like a URL and have it preceded by any of the three prefixes you proposed. Is there one you favor?

Member:

I think hub is too generic, so I vote for either huggingface://, hf://, or both.
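Since an explicit scheme was ultimately adopted (hf://, per the changelog thread above), here is a hypothetical sketch of how such a prefix could be split into repo id, revision, and filename with urlparse. parse_hf_url is an invented helper for illustration only, not code from this PR:

```python
from typing import Optional, Tuple
from urllib.parse import urlparse


def parse_hf_url(url: str) -> Tuple[str, Optional[str], Optional[str]]:
    """Split 'hf://user/model[@revision][/filename]' into (repo_id, revision, filename)."""
    parsed = urlparse(url)
    if parsed.scheme not in ("hf", "huggingface"):
        raise ValueError(f"Not a Hugging Face Hub URL: {url}")
    # The user/org lands in netloc and the model name (plus optional filename) in path.
    parts = (parsed.netloc + parsed.path).split("/")
    repo_id = "/".join(parts[:2])
    filename = "/".join(parts[2:]) or None
    revision: Optional[str] = None
    if "@" in repo_id:
        repo_id, revision = repo_id.split("@", 1)
    return repo_id, revision, filename


print(parse_hf_url("hf://lysandre/pair-classification-roberta-mnli@main"))
# ('lysandre/pair-classification-roberta-mnli', 'main', None)
```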

        # If can't be resolved to a path or a URL,
        # let's try to find it on Hugging Face model hub
        # e.g. lysandre/pair-classification-roberta-mnli is a valid model id
        # and lysandre/pair-classification-roberta-mnli@main supports specifying a commit/branch/tag.
        if len(url_or_filename.split("/")) > 2:
            filename = url_or_filename.split("/")[2]
            url_or_filename = "/".join(url_or_filename.split("/")[:2])
        else:
            filename = None

        if "@" in url_or_filename:
            repo_id = url_or_filename.split("@")[0]
            revision = url_or_filename.split("@")[1]
        else:
            repo_id = url_or_filename
            revision = None

        try:
            if filename is not None:
                url = hf_hub_url(
                    repo_id=repo_id, filename=filename, revision=revision
                )
                url_or_filename = cached_download(
Member:

This is a bit confusing to me.

We set url_or_filename (along with filename) on line 250, but then override it to something else on line 266.

                    url=url,
                    library_name="allennlp",
                    library_version=VERSION,
                    cache_dir=CACHE_DIRECTORY,
                )
            else:
                extraction_path = Path(snapshot_download(repo_id, revision=revision, cache_dir=CACHE_DIRECTORY))
        except HTTPError as e:
            logger.warning(f"Tried to download from Hugging Face Hub but failed with {e}")
Member:

Why warn here and not just let the exception propagate?

I think it would be better to emit a logger.info() statement above that says "treating url_or_filename as a HuggingFace Hub resource" or something, and then just let this error propagate if it happens.
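A minimal sketch of the shape the reviewer is suggesting, reusing the hf_hub_url and cached_download calls from the diff above; download_from_hub is an invented wrapper, not part of the PR:

```python
import logging
from typing import Optional

from huggingface_hub import cached_download, hf_hub_url

logger = logging.getLogger(__name__)


def download_from_hub(
    repo_id: str, filename: str, revision: Optional[str] = None, cache_dir: Optional[str] = None
) -> str:
    # Announce the fallback up front instead of warning after a failure ...
    logger.info("Treating %s/%s as a Hugging Face Hub resource", repo_id, filename)
    url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
    # ... and let any HTTPError raised by the Hub propagate to the caller unchanged.
    return cached_download(url=url, library_name="allennlp", cache_dir=cache_dir)
```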


    file_path: str

    # If we're using the /a/b/foo.zip!c/d/file.txt syntax, handle it here.
@@ -261,9 +299,7 @@ def cached_path(

    parsed = urlparse(url_or_filename)

    extraction_path: Optional[str] = None

    if parsed.scheme in ("http", "https", "s3"):
    if parsed.scheme in ("http", "https", "s3") and extraction_path is None:
        # URL, so get it from the cache (downloading if necessary)
        file_path = get_from_cache(url_or_filename, cache_dir)

@@ -272,7 +308,7 @@
        # For example ~/.allennlp/cache/234234.21341 -> ~/.allennlp/cache/234234.21341-extracted
        extraction_path = file_path + "-extracted"

    else:
    elif extraction_path is None:
        url_or_filename = os.path.expanduser(url_or_filename)

        if os.path.exists(url_or_filename):
1 change: 1 addition & 0 deletions setup.py
@@ -72,6 +72,7 @@
        "lmdb",
        "more-itertools",
        "wandb>=0.10.0,<0.11.0",
        "huggingface_hub>=0.0.8"
    ],
    entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]},
    include_package_data=True,
21 changes: 21 additions & 0 deletions tests/common/file_utils_test.py
@@ -28,7 +28,10 @@
    LocalCacheResource,
    TensorCache,
)
from allennlp.common import Params
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.common.testing import AllenNlpTestCase
from allennlp.predictors import Predictor


def set_up_glove(url: str, byt: bytes, change_etag_every: int = 1000):
@@ -563,3 +566,21 @@ def test_tensor_cache(self):
        with pytest.warns(UserWarning, match="cache will be read-only"):
            cache = TensorCache(self.TEST_DIR / "cache")
        assert cache.read_only


class TestHFHubDownload(AllenNlpTestCase):
    def test_cached_download(self):
        params = Params(
            {
                "options_file": "lysandre/test-elmo-tiny/options.json",
                "weight_file": "lysandre/test-elmo-tiny/lm_weights.hdf5",
            }
        )
        embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)

        assert isinstance(embedding_layer, ElmoTokenEmbedder), "Embedding layer badly instantiated from HF Hub."
        assert embedding_layer.get_output_dim() == 32, "Embedding layer badly instantiated from HF Hub."

    def test_snapshot_download(self):
        predictor = Predictor.from_path("lysandre/test-simple-tagger-tiny")
        assert predictor._dataset_reader._token_indexers["tokens"].namespace == "test_tokens"
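The parsing added to cached_path also accepts a pinned revision through the "@" syntax noted in the diff's comments. A minimal sketch reusing the test repos from this PR, assuming cached_path returns the snapshot directory when no filename is given (as the snapshot_download branch suggests); "main" is an illustrative revision:

```python
from allennlp.common.file_utils import cached_path

# Single file pinned to a branch/tag/commit: the "@main" suffix becomes the
# revision passed to hf_hub_url and cached_download.
options_file = cached_path("lysandre/test-elmo-tiny@main/options.json")

# Whole-repo snapshot at a pinned revision, via snapshot_download.
model_dir = cached_path("lysandre/test-simple-tagger-tiny@main")
```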