Skip to content
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit b764bef

Browse files
authored
simplify dataset classes, fix multi-process lazy loading (#4344)
simplify dataset classes, fix multi-process lazy loading (#4344) * simplify dataset classes, fix multi-process lazy loading * remove unnecessary overrides decorators * warn about tokenizers deadlock * clean up * fix race conditions * fix type hint * remove outdated docstring * fixes * fix caching again * issue warning when can't write to cache safely * comments * update CHANGELOG * update docstring of _read * revert generic type * test and fix 'multi_worker_islice * clean up * no more tuples :( * revert, revert, revert * revert * revert * non-lazy locking * add another test * revert * make mypy happy * update CHANGELOG * improvements * fix comment * doc fixes * update CHANGELOG * tweak docs * improve caching system * improve caching * add test * add another logging statement * warnings * add UserWarning about manual sharding
1 parent 884a614 commit b764bef

File tree

12 files changed

+645
-231
lines changed

12 files changed

+645
-231
lines changed

CHANGELOG.md

+9-1
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## Unreleased
99

10-
## Fixed
10+
### Fixed
11+
12+
- Lazy dataset readers now work correctly with multi-process data loading.
13+
- Fixed race conditions that could occur when using a dataset cache.
14+
15+
### Added
1116

1217
- A bug where all datasets would be loaded for vocab creation even if not needed.
18+
- A parameter to the `DatasetReader` class: `manual_multi_process_sharding`. This is similar
19+
to the `manual_distributed_sharding` parameter, but applies when using a multi-process
20+
`DataLoader`.
1321

1422
## [v1.0.0rc6](https://github.com/allenai/allennlp/releases/tag/v1.0.0rc6) - 2020-06-11
1523

allennlp/common/file_utils.py

+49-23
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import glob
66
import os
77
import logging
8-
import shutil
98
import tempfile
109
import json
1110
from urllib.parse import urlparse
@@ -243,6 +242,46 @@ def _find_latest_cached(url: str, cache_dir: str) -> Optional[str]:
243242
return None
244243

245244

245+
class CacheFile:
    """
    This is a context manager that makes robust caching easier.

    On `__enter__`, an IO handle to a temporary file is returned, which can
    be treated as if it's the actual cache file.

    On `__exit__`, the temporary file is renamed to the cache file. If anything
    goes wrong while writing to the temporary file, it will be removed.
    """

    def __init__(self, cache_filename: Union[Path, str], mode: str = "w+b") -> None:
        # Normalize to a `Path` so downstream filesystem operations are consistent.
        self.cache_filename = (
            cache_filename if isinstance(cache_filename, Path) else Path(cache_filename)
        )
        self.cache_directory = os.path.dirname(self.cache_filename)
        self.mode = mode
        # The temp file is created in the same directory as the final cache file
        # so that the `os.replace` in `__exit__` is a same-filesystem (atomic) rename.
        # `delete=False` because we rename it ourselves on success.
        self.temp_file = tempfile.NamedTemporaryFile(
            self.mode, dir=self.cache_directory, delete=False, suffix=".tmp"
        )

    def __enter__(self):
        # Callers write to this handle as if it were the cache file itself.
        return self.temp_file

    def __exit__(self, exc_type, exc_value, traceback):
        self.temp_file.close()
        if exc_value is None:
            # Success.
            logger.info(
                "Renaming temp file %s to cache at %s", self.temp_file.name, self.cache_filename
            )
            # Rename the temp file to the actual cache filename.
            os.replace(self.temp_file.name, self.cache_filename)
            return True
        # Something went wrong while writing: remove the temp file so a partial
        # write never becomes a corrupt cache entry, and let the exception propagate.
        logger.info("removing temp file %s", self.temp_file.name)
        os.remove(self.temp_file.name)
        return False
283+
284+
246285
# TODO(joelgrus): do we want to do checksums or anything like that?
247286
def get_from_cache(url: str, cache_dir: str = None) -> str:
248287
"""
@@ -303,33 +342,20 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
303342
if os.path.exists(cache_path):
304343
logger.info("cache of %s is up-to-date", url)
305344
else:
306-
# Download to temporary file, then copy to cache dir once finished.
307-
# Otherwise you get corrupt cache entries if the download gets interrupted.
308-
with tempfile.NamedTemporaryFile() as temp_file:
309-
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
345+
with CacheFile(cache_path) as cache_file:
346+
logger.info("%s not found in cache, downloading to %s", url, cache_file.name)
310347

311348
# GET file object
312349
if url.startswith("s3://"):
313-
_s3_get(url, temp_file)
350+
_s3_get(url, cache_file)
314351
else:
315-
_http_get(url, temp_file)
316-
317-
# we are copying the file before closing it, so flush to avoid truncation
318-
temp_file.flush()
319-
# shutil.copyfileobj() starts at the current position, so go to the start
320-
temp_file.seek(0)
321-
322-
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
323-
with open(cache_path, "wb") as cache_file:
324-
shutil.copyfileobj(temp_file, cache_file) # type: ignore
325-
326-
logger.info("creating metadata file for %s", cache_path)
327-
meta = {"url": url, "etag": etag}
328-
meta_path = cache_path + ".json"
329-
with open(meta_path, "w") as meta_file:
330-
json.dump(meta, meta_file)
352+
_http_get(url, cache_file)
331353

332-
logger.info("removing temp file %s", temp_file.name)
354+
logger.info("creating metadata file for %s", cache_path)
355+
meta = {"url": url, "etag": etag}
356+
meta_path = cache_path + ".json"
357+
with open(meta_path, "w") as meta_file:
358+
json.dump(meta, meta_file)
333359

334360
return cache_path
335361

allennlp/data/__init__.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from allennlp.data.dataloader import DataLoader, allennlp_collate
2-
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
2+
from allennlp.data.dataset_readers.dataset_reader import (
3+
DatasetReader,
4+
AllennlpDataset,
5+
AllennlpLazyDataset,
6+
)
37
from allennlp.data.fields.field import DataArray, Field
48
from allennlp.data.fields.text_field import TextFieldTensors
59
from allennlp.data.instance import Instance

allennlp/data/dataloader.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from typing import List, Dict, Union
2+
import warnings
23

34
import torch
45
from torch.utils import data
56

67
from allennlp.common.registrable import Registrable
7-
from allennlp.data.instance import Instance
8-
98
from allennlp.common.lazy import Lazy
9+
from allennlp.data.instance import Instance
10+
from allennlp.data.dataset_readers.dataset_reader import AllennlpLazyDataset
1011
from allennlp.data.batch import Batch
1112
from allennlp.data.samplers import Sampler, BatchSampler
1213

@@ -65,6 +66,13 @@ def __init__(
6566
multiprocessing_context: str = None,
6667
batches_per_epoch: int = None,
6768
):
69+
if num_workers and isinstance(dataset, AllennlpLazyDataset):
70+
warnings.warn(
71+
"Using multi-process data loading with a lazy dataset could lead to "
72+
"deadlocks with certain tokenizers. See:\n"
73+
" https://github.com/allenai/allennlp/issues/4330\n",
74+
UserWarning,
75+
)
6876
super().__init__(
6977
dataset=dataset,
7078
batch_size=batch_size,

allennlp/data/dataset_readers/__init__.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88

99

1010
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
11-
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
11+
from allennlp.data.dataset_readers.dataset_reader import (
12+
DatasetReader,
13+
AllennlpDataset,
14+
AllennlpLazyDataset,
15+
)
1216
from allennlp.data.dataset_readers.interleaving_dataset_reader import InterleavingDatasetReader
1317
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
1418
from allennlp.data.dataset_readers.sharded_dataset_reader import ShardedDatasetReader

0 commit comments

Comments
 (0)