This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 659bf25

epwalsh authored and joelgrus committed
Allow files to be downloaded from S3 (#1620)
* handle s3 files
* pull in to file object
* get s3 etag
* add some tests
* better unit tests
* added test for etag
* add a few more tests
* add status code to error msg
* fix unit tests
* add a few comments
1 parent 76a65a8 commit 659bf25

File tree

4 files changed: +164 −18 lines changed

allennlp/common/file_utils.py (+79 −17)

@@ -9,9 +9,12 @@
 import json
 from urllib.parse import urlparse
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Optional, Tuple, Union, IO, Callable
 from hashlib import sha256
+from functools import wraps
 
+import boto3
+from botocore.exceptions import ClientError
 import requests
 
 from allennlp.common.tqdm import Tqdm
@@ -78,7 +81,7 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
 
     parsed = urlparse(url_or_filename)
 
-    if parsed.scheme in ('http', 'https'):
+    if parsed.scheme in ('http', 'https', 's3'):
         # URL, so get it from the cache (downloading if necessary)
         return get_from_cache(url_or_filename, cache_dir)
     elif os.path.exists(url_or_filename):
@@ -92,6 +95,67 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str
     raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
 
 
+def split_s3_path(url: str) -> Tuple[str, str]:
+    """Split a full s3 path into the bucket name and path."""
+    parsed = urlparse(url)
+    if not parsed.netloc or not parsed.path:
+        raise ValueError("bad s3 path {}".format(url))
+    bucket_name = parsed.netloc
+    s3_path = parsed.path
+    # Remove '/' at beginning of path.
+    if s3_path.startswith("/"):
+        s3_path = s3_path[1:]
+    return bucket_name, s3_path
+
+
+def s3_request(func: Callable):
+    """
+    Wrapper function for s3 requests in order to create more helpful error
+    messages.
+    """
+
+    @wraps(func)
+    def wrapper(url: str, *args, **kwargs):
+        try:
+            return func(url, *args, **kwargs)
+        except ClientError as exc:
+            if int(exc.response["Error"]["Code"]) == 404:
+                raise FileNotFoundError("file {} not found".format(url))
+            else:
+                raise
+
+    return wrapper
+
+
+@s3_request
+def s3_etag(url: str) -> Optional[str]:
+    """Check ETag on S3 object."""
+    s3_resource = boto3.resource("s3")
+    bucket_name, s3_path = split_s3_path(url)
+    s3_object = s3_resource.Object(bucket_name, s3_path)
+    return s3_object.e_tag
+
+
+@s3_request
+def s3_get(url: str, temp_file: IO) -> None:
+    """Pull a file directly from S3."""
+    s3_resource = boto3.resource("s3")
+    bucket_name, s3_path = split_s3_path(url)
+    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
+
+
+def http_get(url: str, temp_file: IO) -> None:
+    req = requests.get(url, stream=True)
+    content_length = req.headers.get('Content-Length')
+    total = int(content_length) if content_length is not None else None
+    progress = Tqdm.tqdm(unit="B", total=total)
+    for chunk in req.iter_content(chunk_size=1024):
+        if chunk: # filter out keep-alive new chunks
+            progress.update(len(chunk))
+            temp_file.write(chunk)
+    progress.close()
+
+
 # TODO(joelgrus): do we want to do checksums or anything like that?
 def get_from_cache(url: str, cache_dir: str = None) -> str:
     """
@@ -103,13 +167,16 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
 
     os.makedirs(cache_dir, exist_ok=True)
 
-    # make HEAD request to check ETag
-    response = requests.head(url, allow_redirects=True)
-    if response.status_code != 200:
-        raise IOError("HEAD request failed for url {}".format(url))
+    # Get eTag to add to filename, if it exists.
+    if url.startswith("s3://"):
+        etag = s3_etag(url)
+    else:
+        response = requests.head(url, allow_redirects=True)
+        if response.status_code != 200:
+            raise IOError("HEAD request failed for url {} with status code {}"
+                          .format(url, response.status_code))
+        etag = response.headers.get("ETag")
 
-    # add ETag to filename if it exists
-    etag = response.headers.get("ETag")
     filename = url_to_filename(url, etag)
 
     # get cache path to put the file
@@ -122,15 +189,10 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
             logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
 
             # GET file object
-            req = requests.get(url, stream=True)
-            content_length = req.headers.get('Content-Length')
-            total = int(content_length) if content_length is not None else None
-            progress = Tqdm.tqdm(unit="B", total=total)
-            for chunk in req.iter_content(chunk_size=1024):
-                if chunk: # filter out keep-alive new chunks
-                    progress.update(len(chunk))
-                    temp_file.write(chunk)
-            progress.close()
+            if url.startswith("s3://"):
+                s3_get(url, temp_file)
+            else:
+                http_get(url, temp_file)
 
             # we are copying the file before closing it, so flush to avoid truncation
             temp_file.flush()
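With this change, cached_path accepts s3:// URLs alongside http(s) URLs: the object's ETag is folded into the cache filename, and the object is streamed into the local cache on a miss. A minimal usage sketch follows; the bucket and key are hypothetical, and boto3 is assumed to be able to find AWS credentials (environment variables, ~/.aws/credentials, an IAM role, etc.).

    from allennlp.common.file_utils import cached_path

    # Hypothetical bucket and key, purely for illustration.
    local_path = cached_path("s3://my-bucket/datasets/train.jsonl")

    # The returned path points at the cached copy, keyed on the URL plus the
    # object's ETag, so repeated calls reuse the download until the object
    # changes on S3.
    with open(local_path) as data_file:
        first_line = data_file.readline()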

allennlp/tests/common/file_utils_test.py (+77 −1)

@@ -3,11 +3,17 @@
 import os
 import pathlib
 import json
+import tempfile
+from typing import List, Tuple
 
+import boto3
+from moto import mock_s3
 import pytest
 import responses
 
-from allennlp.common.file_utils import url_to_filename, filename_to_url, get_from_cache, cached_path
+from allennlp.common.file_utils import (
+        url_to_filename, filename_to_url, get_from_cache, cached_path, split_s3_path,
+        s3_request, s3_etag, s3_get)
 from allennlp.common.testing import AllenNlpTestCase
 
 
@@ -47,6 +53,14 @@ def head_callback(_):
     )
 
 
+def set_up_s3_bucket(bucket_name: str = "my-bucket", s3_objects: List[Tuple[str, str]] = None):
+    """Creates a mock s3 bucket optionally with objects uploaded from local files."""
+    s3_client = boto3.client("s3")
+    s3_client.create_bucket(Bucket=bucket_name)
+    for filename, key in s3_objects or []:
+        s3_client.upload_file(Filename=filename, Bucket=bucket_name, Key=key)
+
+
 class TestFileUtils(AllenNlpTestCase):
     def setUp(self):
         super().setUp()
@@ -97,6 +111,68 @@ def test_url_to_filename_with_etags_eliminates_quotes(self):
         assert back_to_url == url
         assert etag == "mytag"
 
+    def test_split_s3_path(self):
+        # Test splitting good urls.
+        assert split_s3_path("s3://my-bucket/subdir/file.txt") == ("my-bucket", "subdir/file.txt")
+        assert split_s3_path("s3://my-bucket/file.txt") == ("my-bucket", "file.txt")
+
+        # Test splitting bad urls.
+        with pytest.raises(ValueError):
+            split_s3_path("s3://")
+            split_s3_path("s3://myfile.txt")
+            split_s3_path("myfile.txt")
+
+    @mock_s3
+    def test_s3_bucket(self):
+        """This just ensures the bucket gets set up correctly."""
+        set_up_s3_bucket()
+        s3_client = boto3.client("s3")
+        buckets = s3_client.list_buckets()["Buckets"]
+        assert len(buckets) == 1
+        assert buckets[0]["Name"] == "my-bucket"
+
+    @mock_s3
+    def test_s3_request_wrapper(self):
+        set_up_s3_bucket(s3_objects=[(str(self.glove_file), "embeddings/glove.txt.gz")])
+        s3_resource = boto3.resource("s3")
+
+        @s3_request
+        def get_file_info(url):
+            bucket_name, s3_path = split_s3_path(url)
+            return s3_resource.Object(bucket_name, s3_path).content_type
+
+        # Good request, should work.
+        assert get_file_info("s3://my-bucket/embeddings/glove.txt.gz") == "text/plain"
+
+        # File missing, should raise FileNotFoundError.
+        with pytest.raises(FileNotFoundError):
+            get_file_info("s3://my-bucket/missing_file.txt")
+
+    @mock_s3
+    def test_s3_etag(self):
+        set_up_s3_bucket(s3_objects=[(str(self.glove_file), "embeddings/glove.txt.gz")])
+        # Ensure we can get the etag for an s3 object and that it looks as expected.
+        etag = s3_etag("s3://my-bucket/embeddings/glove.txt.gz")
+        assert isinstance(etag, str)
+        assert etag.startswith("'") or etag.startswith('"')
+
+        # Should raise FileNotFoundError if the file does not exist on the bucket.
+        with pytest.raises(FileNotFoundError):
+            s3_etag("s3://my-bucket/missing_file.txt")
+
+    @mock_s3
+    def test_s3_get(self):
+        set_up_s3_bucket(s3_objects=[(str(self.glove_file), "embeddings/glove.txt.gz")])
+
+        with tempfile.NamedTemporaryFile() as temp_file:
+            s3_get("s3://my-bucket/embeddings/glove.txt.gz", temp_file)
+            assert os.stat(temp_file.name).st_size != 0
+
+        # Should raise FileNotFoundError if the file does not exist on the bucket.
+        with pytest.raises(FileNotFoundError):
+            with tempfile.NamedTemporaryFile() as temp_file:
+                s3_get("s3://my-bucket/missing_file.txt", temp_file)
+
     @responses.activate
     def test_get_from_cache(self):
         url = 'http://fake.datastore.com/glove.txt.gz'
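These tests lean on moto's mock_s3 decorator, so they never touch real AWS or require credentials. The same helpers can be exercised outside the test class in much the same way; below is a rough standalone sketch under that assumption, with a made-up bucket name and key.

    import tempfile

    import boto3
    from moto import mock_s3

    from allennlp.common.file_utils import s3_etag, s3_get


    @mock_s3
    def demo() -> None:
        # All boto3 calls below hit moto's in-memory S3, not real AWS.
        s3_client = boto3.client("s3")
        s3_client.create_bucket(Bucket="my-bucket")
        s3_client.put_object(Bucket="my-bucket", Key="hello.txt", Body=b"hello world\n")

        # A quoted, MD5-style ETag, as asserted in test_s3_etag above.
        print(s3_etag("s3://my-bucket/hello.txt"))

        with tempfile.NamedTemporaryFile() as temp_file:
            s3_get("s3://my-bucket/hello.txt", temp_file)
            temp_file.seek(0)
            print(temp_file.read())  # b"hello world\n"


    demo()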

requirements.txt (+6 −0)

@@ -43,6 +43,9 @@ cffi==1.11.2
 # aws commandline tools for running on Docker remotely.
 awscli>=1.11.91
 
+# Accessing files from S3 directly.
+boto3
+
 # REST interface for models
 flask==0.12.1
 flask-cors==3.0.3
@@ -106,6 +109,9 @@ codecov
 # Required to run sanic tests
 aiohttp
 
+# For mocking s3.
+moto==1.3.4
+
 #### DOC-RELATED PACKAGES ####
 
 # Builds our documentation.

setup.py (+2 −0)

@@ -111,6 +111,8 @@
         'tensorboardX==1.2',
         'cffi==1.11.2',
         'awscli>=1.11.91',
+        'boto3',
+        'moto==1.3.4',
         'flask==0.12.1',
         'flask-cors==3.0.3',
         'gevent==1.3.5',

0 commit comments