This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Hf hub #5052

Merged: 13 commits, Apr 20, 2021
Changes from 6 commits
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -9,14 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Added support for the HuggingFace Hub as an alternative way to handle loading files.
Member:

If we go with huggingface:// or hf:// type urls, we should mention that here.

Contributor Author:

I went with hf://! I updated the changelog.

- The test for distributed metrics now takes a parameter specifying how often you want to run it.


## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14

### Added

- Ported the following Huggingface `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
- Ported the following HuggingFace `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
Member:

Ah, good catch! 😉

- Added new `sub_token_mode` parameter to `pretrained_transformer_mismatched_embedder` class to support first sub-token embedding
- Added a way to run a multi task model with a dataset reader as part of `allennlp predict`.
- Added new `eval_mode` in `PretrainedTransformerEmbedder`. If it is set to `True`, the transformer is _always_ run in evaluation mode, which, e.g., disables dropout and does not update batch normalization statistics.
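Following the hf:// discussion above, here is a minimal sketch of how a Hub-hosted file could be pulled through cached_path, assuming the behavior added in the file_utils.py diff below. The lysandre/test-elmo-tiny repo is the one used in this PR's tests; at the commits shown here the bare user/repo/filename form is what is implemented (the explicit hf:// prefix mentioned in the comment landed in a later commit).

```python
from allennlp.common.file_utils import cached_path

# Single file from a Hub repo: "user/repo/filename" resolves options.json from
# lysandre/test-elmo-tiny via hf_hub_url + cached_download and returns a local path.
options_file = cached_path("lysandre/test-elmo-tiny/options.json")

# With no filename, the whole repo is fetched with snapshot_download and the
# local snapshot directory is used instead.
model_dir = cached_path("lysandre/test-elmo-tiny")
```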
46 changes: 41 additions & 5 deletions allennlp/common/file_utils.py
@@ -50,10 +50,12 @@
from overrides import overrides
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError
from requests.exceptions import ConnectionError, HTTPError
from requests.packages.urllib3.util.retry import Retry
import lmdb
from torch import Tensor
from huggingface_hub import hf_hub_url, cached_download, snapshot_download
from allennlp.version import VERSION

from allennlp.common.tqdm import Tqdm

Expand Down Expand Up @@ -233,9 +235,45 @@ def cached_path(
    cache_dir = os.path.expanduser(cache_dir)
    os.makedirs(cache_dir, exist_ok=True)

    extraction_path: Optional[str] = None

    if not isinstance(url_or_filename, str):
        url_or_filename = str(url_or_filename)

    if not os.path.isfile(url_or_filename) and not urlparse(url_or_filename).scheme in ("http", "https", "s3"):
Member:

I am a little worried about this. It's not transparent to a user that the behavior of this hinges on the presence or absence of a local file in their current working directory. It also makes the AllenNLP configuration files a lot less self-contained.

Do you think we could treat this like a URL, and make it huggingface://lysandre/pair-classification-roberta-mnli or hf://lysandre/pair-classification-roberta-mnli or even hub://lysandre/pair-classification-roberta-mnli?

Contributor Author:

Yes, we could definitely treat it like a URL and have it preceded by any of the three prefixes you proposed. Is there one you favor?

Member:

I think hub is too generic, so I vote for either huggingface://, hf://, or both.
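Since an explicit scheme was ultimately adopted (hf://, per the changelog thread above), here is a hypothetical sketch of how such a prefix could be split into repo id, revision, and filename with urlparse. parse_hf_url is an invented helper for illustration only, not code from this PR:

```python
from typing import Optional, Tuple
from urllib.parse import urlparse


def parse_hf_url(url: str) -> Tuple[str, Optional[str], Optional[str]]:
    """Split 'hf://user/model[@revision][/filename]' into (repo_id, revision, filename)."""
    parsed = urlparse(url)
    if parsed.scheme not in ("hf", "huggingface"):
        raise ValueError(f"Not a Hugging Face Hub URL: {url}")
    # The user/org lands in netloc and the model name (plus optional filename) in path.
    parts = (parsed.netloc + parsed.path).split("/")
    repo_id = "/".join(parts[:2])
    filename = "/".join(parts[2:]) or None
    revision: Optional[str] = None
    if "@" in repo_id:
        repo_id, revision = repo_id.split("@", 1)
    return repo_id, revision, filename


print(parse_hf_url("hf://lysandre/pair-classification-roberta-mnli@main"))
# ('lysandre/pair-classification-roberta-mnli', 'main', None)
```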

        # If can't be resolved to a path or a URL,
        # let's try to find it on Hugging Face model hub
        # e.g. lysandre/pair-classification-roberta-mnli is a valid model id
        # and lysandre/pair-classification-roberta-mnli@main supports specifying a commit/branch/tag.
        if len(url_or_filename.split("/")) > 2:
            filename = url_or_filename.split("/")[2]
            url_or_filename = "/".join(url_or_filename.split("/")[:2])
        else:
            filename = None

        if "@" in url_or_filename:
            repo_id = url_or_filename.split("@")[0]
            revision = url_or_filename.split("@")[1]
        else:
            repo_id = url_or_filename
            revision = None

        try:
            if filename is not None:
                url = hf_hub_url(
                    repo_id=repo_id, filename=filename, revision=revision
                )
                url_or_filename = cached_download(
Member:

This is a bit confusing to me.

We set url_or_filename (along with filename) on line 250, but then override it to something else on line 266.

                    url=url,
                    library_name="allennlp",
                    library_version=VERSION,
                    cache_dir=CACHE_DIRECTORY,
                )
            else:
                extraction_path = Path(snapshot_download(repo_id, revision=revision, cache_dir=CACHE_DIRECTORY))
        except HTTPError as e:
            logger.warning(f"Tried to download from Hugging Face Hub but failed with {e}")
Member:

Why warn here and not just let the exception propagate?

I think it would be better to emit a logger.info() statement above that says "treating url_or_filename as a HuggingFace Hub resource" or something, and then just let this error propagate if it happens.
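A minimal sketch of the shape the reviewer is suggesting, reusing the hf_hub_url and cached_download calls from the diff above; download_from_hub is an invented wrapper, not part of the PR:

```python
import logging
from typing import Optional

from huggingface_hub import cached_download, hf_hub_url

logger = logging.getLogger(__name__)


def download_from_hub(
    repo_id: str, filename: str, revision: Optional[str] = None, cache_dir: Optional[str] = None
) -> str:
    # Announce the fallback up front instead of warning after a failure ...
    logger.info("Treating %s/%s as a Hugging Face Hub resource", repo_id, filename)
    url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
    # ... and let any HTTPError raised by the Hub propagate to the caller unchanged.
    return cached_download(url=url, library_name="allennlp", cache_dir=cache_dir)
```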


    file_path: str

    # If we're using the /a/b/foo.zip!c/d/file.txt syntax, handle it here.
@@ -261,9 +299,7 @@ def cached_path(

    parsed = urlparse(url_or_filename)

    extraction_path: Optional[str] = None

    if parsed.scheme in ("http", "https", "s3"):
    if parsed.scheme in ("http", "https", "s3") and extraction_path is None:
        # URL, so get it from the cache (downloading if necessary)
        file_path = get_from_cache(url_or_filename, cache_dir)

@@ -272,7 +308,7 @@
        # For example ~/.allennlp/cache/234234.21341 -> ~/.allennlp/cache/234234.21341-extracted
        extraction_path = file_path + "-extracted"

    else:
    elif extraction_path is None:
        url_or_filename = os.path.expanduser(url_or_filename)

        if os.path.exists(url_or_filename):
1 change: 1 addition & 0 deletions setup.py
@@ -72,6 +72,7 @@
        "lmdb",
        "more-itertools",
        "wandb>=0.10.0,<0.11.0",
        "huggingface_hub>=0.0.8"
    ],
    entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]},
    include_package_data=True,
21 changes: 21 additions & 0 deletions tests/common/file_utils_test.py
@@ -28,7 +28,10 @@
    LocalCacheResource,
    TensorCache,
)
from allennlp.common import Params
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.common.testing import AllenNlpTestCase
from allennlp.predictors import Predictor


def set_up_glove(url: str, byt: bytes, change_etag_every: int = 1000):
@@ -563,3 +566,21 @@ def test_tensor_cache(self):
        with pytest.warns(UserWarning, match="cache will be read-only"):
            cache = TensorCache(self.TEST_DIR / "cache")
        assert cache.read_only


class TestHFHubDownload(AllenNlpTestCase):
    def test_cached_download(self):
        params = Params(
            {
                "options_file": "lysandre/test-elmo-tiny/options.json",
                "weight_file": "lysandre/test-elmo-tiny/lm_weights.hdf5",
            }
        )
        embedding_layer = ElmoTokenEmbedder.from_params(vocab=None, params=params)

        assert isinstance(embedding_layer, ElmoTokenEmbedder), "Embedding layer badly instantiated from HF Hub."
        assert embedding_layer.get_output_dim() == 32, "Embedding layer badly instantiated from HF Hub."

    def test_snapshot_download(self):
        predictor = Predictor.from_path("lysandre/test-simple-tagger-tiny")
        assert predictor._dataset_reader._token_indexers["tokens"].namespace == "test_tokens"
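The parsing added to cached_path also accepts a pinned revision through the "@" syntax noted in the diff's comments. A minimal sketch reusing the test repos from this PR, assuming cached_path returns the snapshot directory when no filename is given (as the snapshot_download branch suggests); "main" is an illustrative revision:

```python
from allennlp.common.file_utils import cached_path

# Single file pinned to a branch/tag/commit: the "@main" suffix becomes the
# revision passed to hf_hub_url and cached_download.
options_file = cached_path("lysandre/test-elmo-tiny@main/options.json")

# Whole-repo snapshot at a pinned revision, via snapshot_download.
model_dir = cached_path("lysandre/test-simple-tagger-tiny@main")
```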