Skip to content
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit 24ec7db

Browse files
authored
fix #5132 (#5134)
1 parent 2526674 commit 24ec7db

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1717

1818
- Updated CONTRIBUTING.md to remind reader to upgrade pip setuptools to avoid spaCy installation issues.
1919

20+
### Fixed
21+
22+
- Fixed a bug with the `ShardedDatasetReader` when used with multi-process data loading (https://github.com/allenai/allennlp/issues/5132).
23+
2024

2125
## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14
2226

allennlp/data/dataset_readers/sharded_dataset_reader.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import os
44
from typing import Iterable
55

6+
from overrides import overrides
7+
68
from allennlp.common.checks import ConfigurationError
79
from allennlp.common.file_utils import cached_path
810
from allennlp.data.dataset_readers.dataset_reader import DatasetReader, PathOrStr
@@ -46,12 +48,18 @@ def __init__(self, base_reader: DatasetReader, **kwargs) -> None:
4648
self.reader._set_worker_info(None)
4749
self.reader._set_distributed_info(None)
4850

51+
@overrides
4952
def text_to_instance(self, *args, **kwargs) -> Instance:
5053
"""
5154
Just delegate to the base reader text_to_instance.
5255
"""
5356
return self.reader.text_to_instance(*args, **kwargs) # type: ignore
5457

58+
@overrides
59+
def apply_token_indexers(self, instance: Instance) -> None:
60+
self.reader.apply_token_indexers(instance)
61+
62+
@overrides
5563
def _read(self, file_path: PathOrStr) -> Iterable[Instance]:
5664
try:
5765
maybe_extracted_archive = cached_path(file_path, extract_archive=True)
@@ -76,5 +84,7 @@ def _read(self, file_path: PathOrStr) -> Iterable[Instance]:
7684

7785
for shard in self.shard_iterable(shards):
7886
logger.info(f"reading instances from {shard}")
79-
for instance in self.reader.read(shard):
87+
# We call `self.reader._read()` here instead of `self.reader.read()` because `.read()`
88+
# will prematurely call `self.reader.apply_token_indexers()`.
89+
for instance in self.reader._read(shard):
8090
yield instance

tests/data/dataset_readers/sharded_dataset_reader_test.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import Tuple
66

77
from allennlp.common.testing import AllenNlpTestCase
8+
from allennlp.data.data_loaders import MultiProcessDataLoader
89
from allennlp.data.dataset_readers import (
910
SequenceTaggingDatasetReader,
1011
ShardedDatasetReader,
@@ -51,9 +52,12 @@ def setup_method(self) -> None:
5152

5253
self.reader = ShardedDatasetReader(base_reader=self.base_reader)
5354

54-
def read_and_check_instances(self, filepath: str):
55+
def read_and_check_instances(self, filepath: str, num_workers: int = 0):
56+
data_loader = MultiProcessDataLoader(
57+
self.reader, filepath, num_workers=num_workers, batch_size=1
58+
)
5559
all_instances = []
56-
for instance in self.reader.read(filepath):
60+
for instance in data_loader.iter_instances():
5761
all_instances.append(instance)
5862

5963
# 100 files * 4 sentences / file
@@ -71,5 +75,8 @@ def read_and_check_instances(self, filepath: str):
7175
def test_sharded_read_glob(self):
7276
self.read_and_check_instances(self.identical_files_glob)
7377

78+
def test_sharded_read_with_multiprocess_loader(self):
79+
self.read_and_check_instances(self.identical_files_glob, num_workers=2)
80+
7481
def test_sharded_read_archive(self):
7582
self.read_and_check_instances(str(self.archive_filename))

0 commit comments

Comments
 (0)