Commit bd3b80b

chore: fix typos (#7436)
1 parent d5a2af1 commit bd3b80b

File tree

11 files changed (+29, -29 lines)


docs/source/about_dataset_load.mdx (+1 -1)

@@ -97,7 +97,7 @@ To ensure a dataset is complete, [`load_dataset`] will perform a series of tests
 - The number of splits in the generated `DatasetDict`.
 - The number of samples in each split of the generated `DatasetDict`.
 - The list of downloaded files.
-- The SHA256 checksums of the downloaded files (disabled by defaut).
+- The SHA256 checksums of the downloaded files (disabled by default).

 If the dataset doesn't pass the verifications, it is likely that the original host of the dataset made some changes in the data files.
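
For context, a minimal sketch of turning the checksum verification mentioned above back on (the dataset name is only illustrative; `verification_mode` follows the current `load_dataset` API):

```python
from datasets import load_dataset

# "all_checks" also verifies the SHA256 checksums of the downloaded files,
# which load_dataset skips by default.
ds = load_dataset("imdb", verification_mode="all_checks")
```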

src/datasets/arrow_dataset.py (+7 -7)

@@ -1488,7 +1488,7 @@ def save_to_disk(
 parent_cache_files_paths = {
     Path(cache_filename["filename"]).resolve().parent for cache_filename in self.cache_files
 }
-# Check that the dataset doesn't overwrite iself. It can cause a permission error on Windows and a segfault on linux.
+# Check that the dataset doesn't overwrite itself. It can cause a permission error on Windows and a segfault on linux.
 if Path(dataset_path).expanduser().resolve() in parent_cache_files_paths:
     raise PermissionError(
         f"Tried to overwrite {Path(dataset_path).expanduser().resolve()} but a dataset can't overwrite itself."

@@ -2867,7 +2867,7 @@ def map(
 Note that the last batch may have less than `n` examples.
 A batch is a dictionary, e.g. a batch of `n` examples is `{"text": ["Hello there !"] * n}`.

-If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simulatenous calls.
+If the function is asynchronous, then `map` will run your function in parallel, with up to one thousand simultaneous calls.
 It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

 Args:

@@ -3475,7 +3475,7 @@ def iter_outputs(shard_iterable):
 yield i, apply_function(example, i, offset=offset)

 num_examples_progress_update = 0
-# If `update_data` is True after processing the first example/batch, initalize these resources with `init_buffer_and_writer`
+# If `update_data` is True after processing the first example/batch, initialize these resources with `init_buffer_and_writer`
 buf_writer, writer, tmp_file = None, None, None

 # Check if Polars is available and import it if so

@@ -3659,7 +3659,7 @@ def filter(
 """Apply a filter function to all the elements in the table in batches
 and update the table so that the dataset only includes examples according to the filter function.

-If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simulatenous calls (configurable).
+If the function is asynchronous, then `filter` will run your function in parallel, with up to one thousand simultaneous calls (configurable).
 It is recommended to use a `asyncio.Semaphore` in your function if you want to set a maximum number of operations that can run at the same time.

 Args:

@@ -4277,7 +4277,7 @@ def sort(
 f"Column '{column}' not found in the dataset. Please provide a column selected in: {self._data.column_names}"
 )

-# Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatability
+# Change null_placement to conform to pyarrow's sort_indices() while ensuring backwards compatibility
 if null_placement not in ["at_start", "at_end"]:
     if null_placement == "first":
         null_placement = "at_start"

@@ -5345,7 +5345,7 @@ def _push_parquet_shards_to_hub(
 Returns:
     additions (`List[CommitOperation]`): list of the `CommitOperationAdd` of the uploaded shards
     uploaded_size (`int`): number of uploaded bytes to the repository
-    dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset afer uncompression
+    dataset_nbytes (`int`): approximate size in bytes of the uploaded dataset after uncompression
 """
 # Find decodable columns, because if there are any, we need to:
 # embed the bytes from the files in the shards

@@ -6178,7 +6178,7 @@ def _concatenate_map_style_datasets(
 # Return first dataset if all datasets are empty
 return dsets[0]

-# Perform checks (and a potentional cast if axis=0)
+# Perform checks (and a potential cast if axis=0)
 if axis == 0:
     _check_if_features_can_be_aligned([dset.features for dset in dsets])
 else:
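
The async `map`/`filter` behaviour described in the docstrings above pairs naturally with a semaphore. A minimal sketch, where the concurrency limit and the body of the async function are made up:

```python
import asyncio
from datasets import Dataset

# Cap concurrency well below map's ~1000 simultaneous calls (illustrative limit).
sem = asyncio.Semaphore(16)

async def annotate(example):
    async with sem:
        await asyncio.sleep(0.01)  # stand-in for an async API call
        return {"length": len(example["text"])}

ds = Dataset.from_dict({"text": ["Hello there !"] * 8})
ds = ds.map(annotate)  # runs the async function concurrently
print(ds[0])
```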

src/datasets/arrow_writer.py (+4 -4)

@@ -173,7 +173,7 @@ def get_inferred_type(self) -> FeatureType:
 def _infer_custom_type_and_encode(data: Iterable) -> tuple[Iterable, Optional[FeatureType]]:
     """Implement type inference for custom objects like PIL.Image.Image -> Image type.

-    This function is only used for custom python objects that can't be direclty passed to build
+    This function is only used for custom python objects that can't be directly passed to build
     an Arrow array. In such cases is infers the feature type to use, and it encodes the data so
     that they can be passed to an Arrow array.

@@ -492,7 +492,7 @@ def write_examples_on_file(self):
 batch_examples = {}
 for col in cols:
     # We use row[0][col] since current_examples contains (example, key) tuples.
-    # Morever, examples could be Arrow arrays of 1 element.
+    # Moreover, examples could be Arrow arrays of 1 element.
     # This can happen in `.map()` when we want to re-write the same Arrow data
     if all(isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) for row in self.current_examples):
         arrays = [row[0][col] for row in self.current_examples]

@@ -546,7 +546,7 @@ def write(
 if writer_batch_size is not None and len(self.current_examples) >= writer_batch_size:
     if self._check_duplicates:
         self.check_duplicate_keys()
-        # Re-intializing to empty list for next batch
+        # Re-initializing to empty list for next batch
         self.hkey_record = []

     self.write_examples_on_file()

@@ -652,7 +652,7 @@ def finalize(self, close_stream=True):
 # In case current_examples < writer_batch_size, but user uses finalize()
 if self._check_duplicates:
     self.check_duplicate_keys()
-    # Re-intializing to empty list for next batch
+    # Re-initializing to empty list for next batch
     self.hkey_record = []
 self.write_examples_on_file()
 # If schema is known, infer features even if no examples were written
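
As a side note on the type-inference docstring above, a small sketch of the behaviour it enables (assumes Pillow is installed; the image is made up):

```python
from datasets import Dataset
from PIL import Image

# A PIL image can't be passed directly to an Arrow array, so the writer
# infers the `Image` feature type and encodes the data first.
img = Image.new("RGB", (4, 4))
ds = Dataset.from_dict({"image": [img, img]})
print(ds.features)  # {'image': Image(...)}
```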

src/datasets/config.py (+1 -1)

@@ -265,5 +265,5 @@
 # Maximum number of uploaded files per commit
 UPLOADS_MAX_NUMBER_PER_COMMIT = 50

-# Backward compatibiliy
+# Backward compatibility
 MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30

src/datasets/data_files.py (+6 -6)

@@ -113,7 +113,7 @@ class EmptyDatasetError(FileNotFoundError):


 def contains_wildcards(pattern: str) -> bool:
-    return any(wilcard_character in pattern for wilcard_character in WILDCARD_CHARACTERS)
+    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)


 def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:

@@ -156,7 +156,7 @@ def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[

 def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
     """
-    When a path matches a pattern, we additionnally check if it's inside a special directory
+    When a path matches a pattern, we additionally check if it's inside a special directory
     we ignore by default (if it starts with a double underscore).

     Users can still explicitly request a filepath inside such a directory if "__pycache__" is

@@ -179,7 +179,7 @@ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> b
     >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
     False
     """
-    # We just need to check if every special directories from the path is present explicly in the pattern.
+    # We just need to check if every special directories from the path is present explicitly in the pattern.
     # Since we assume that the path matches the pattern, it's equivalent to counting that both
     # the parent path and the parent pattern have the same number of special directories.
     data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]

@@ -189,7 +189,7 @@ def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> b

 def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
     """
-    When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
+    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
     a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

     Users can still explicitly request a filepath that is hidden or is inside a hidden directory

@@ -237,7 +237,7 @@ def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_
     >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
     False
     """
-    # We just need to check if every hidden part from the path is present explicly in the pattern.
+    # We just need to check if every hidden part from the path is present explicitly in the pattern.
     # Since we assume that the path matches the pattern, it's equivalent to counting that both
     # the path and the pattern have the same number of hidden parts.
     hidden_directories_in_path = [

@@ -318,7 +318,7 @@ def resolve_pattern(

     Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
     The same applies to special directories that start with a double underscore like "__pycache__".
-    You can still include one if the pattern explicilty mentions it:
+    You can still include one if the pattern explicitly mentions it:
     - to include a hidden file: "*/.hidden.txt" or "*/.*"
     - to include a hidden directory: ".hidden/*" or ".*/*"
     - to include a special directory: "__special__/*" or "__*/*"
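
A small sketch of how those pattern rules surface through `load_dataset` (the local paths and file names are hypothetical):

```python
from datasets import load_dataset

# Hidden files/dirs and "__*" dirs are only picked up when the pattern
# names them explicitly, as the docstrings above describe.
ds = load_dataset("csv", data_files={
    "train": "data/*.csv",             # skips data/.backup.csv and data/__pycache__/*
    "test": "data/.hidden_dir/*.csv",  # explicitly requests a hidden directory
})
```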

src/datasets/download/download_config.py (+1 -1)

@@ -15,7 +15,7 @@ class DownloadConfig:
     Specify a cache directory to save the file to (overwrite the
     default cache dir).
 force_download (`bool`, defaults to `False`):
-    If `True`, re-dowload the file even if it's already cached in
+    If `True`, re-download the file even if it's already cached in
     the cache dir.
 resume_download (`bool`, defaults to `False`):
     If `True`, resume the download if an incompletely received file is
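
For illustration, forcing a re-download through this config (the dataset name is only an example):

```python
from datasets import DownloadConfig, load_dataset

# Re-download files even if they are already present in the cache dir.
cfg = DownloadConfig(force_download=True)
ds = load_dataset("squad", download_config=cfg)
```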

src/datasets/features/translation.py (+2 -2)

@@ -11,7 +11,7 @@
 @dataclass
 class Translation:
     """`Feature` for translations with fixed languages per example.
-    Here for compatiblity with tfds.
+    Here for compatibility with tfds.

     Args:
         languages (`dict`):

@@ -51,7 +51,7 @@ def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
 @dataclass
 class TranslationVariableLanguages:
     """`Feature` for translations with variable languages per example.
-    Here for compatiblity with tfds.
+    Here for compatibility with tfds.

     Args:
         languages (`dict`):
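
A minimal usage sketch of the fixed-languages feature above (the languages and sentences are made up):

```python
from datasets import Dataset, Features
from datasets.features import Translation

features = Features({"translation": Translation(languages=["en", "fr", "de"])})
ds = Dataset.from_dict(
    {"translation": [{"en": "the cat", "fr": "le chat", "de": "die Katze"}]},
    features=features,
)
print(ds[0]["translation"])
```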

src/datasets/formatting/__init__.py (+1 -1)

@@ -124,7 +124,7 @@ def get_formatter(format_type: Optional[str], **format_kwargs) -> Formatter:
     """
     Factory function to get a Formatter given its type name and keyword arguments.
     A formatter is an object that extracts and formats data from pyarrow table.
-    It defines the formatting for rows, colums and batches.
+    It defines the formatting for rows, columns and batches.
     If the formatter for a given type name doesn't exist or is not available, an error is raised.
     """
     format_type = get_format_type_from_alias(format_type)
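
Formatters are usually reached through the public `with_format` API; a small sketch of the row/batch formatting mentioned above:

```python
from datasets import Dataset

# Rows and batches come back as numpy objects instead of plain python ones.
ds = Dataset.from_dict({"a": [1, 2, 3]}).with_format("numpy")
print(type(ds[0]["a"]))   # row query -> numpy scalar
print(type(ds[:2]["a"]))  # batch query -> numpy array
```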

src/datasets/formatting/formatting.py (+1 -1)

@@ -494,7 +494,7 @@ class CustomFormatter(Formatter[dict, ColumnFormat, dict]):
     The transform must take as input a batch of data extracted for an arrow table using the python extractor,
     and return a batch.
     If the output batch is not a dict, then output_all_columns won't work.
-    If the ouput batch has several fields, then querying a single column won't work since we don't know which field
+    If the output batch has several fields, then querying a single column won't work since we don't know which field
     to return.
     """
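
`CustomFormatter` is what backs `with_transform`/`set_transform`; a minimal sketch of a transform that takes a batch and returns a batch, as the docstring above requires:

```python
from datasets import Dataset

def upper_case(batch):
    # receives a batch (dict of lists) and must return a batch
    return {"text": [t.upper() for t in batch["text"]]}

ds = Dataset.from_dict({"text": ["hello", "world"]}).with_transform(upper_case)
print(ds[0])  # {'text': 'HELLO'}
```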

src/datasets/search.py (+4 -4)

@@ -74,7 +74,7 @@ def search_batch(self, queries, k: int = 10, **kwargs) -> BatchedSearchResults:
     queries (`Union[List[str], np.ndarray]`): The queries as a list of strings if `column` is a text index or as a numpy array if `column` is a vector index.
     k (`int`): The number of examples to retrieve per query.

-Ouput:
+Output:
     total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
     total_indices (`List[List[int]]`): The indices of the retrieved examples per query.
 """

@@ -186,7 +186,7 @@ def search(self, query: str, k=10, **kwargs) -> SearchResults:
     query (`str`): The query as a string.
     k (`int`): The number of examples to retrieve.

-Ouput:
+Output:
     scores (`List[List[float]`): The retrieval scores of the retrieved examples.
     indices (`List[List[int]]`): The indices of the retrieved examples.
 """

@@ -353,7 +353,7 @@ def search(self, query: np.array, k=10, **kwargs) -> SearchResults:
     query (`np.array`): The query as a numpy array.
     k (`int`): The number of examples to retrieve.

-Ouput:
+Output:
     scores (`List[List[float]`): The retrieval scores of the retrieved examples.
     indices (`List[List[int]]`): The indices of the retrieved examples.
 """

@@ -373,7 +373,7 @@ def search_batch(self, queries: np.array, k=10, **kwargs) -> BatchedSearchResult
     queries (`np.array`): The queries as a numpy array.
     k (`int`): The number of examples to retrieve.

-Ouput:
+Output:
     total_scores (`List[List[float]`): The retrieval scores of the retrieved examples per query.
     total_indices (`List[List[int]]`): The indices of the retrieved examples per query.

src/datasets/table.py (+1 -1)

@@ -53,7 +53,7 @@ def _memory_mapped_record_batch_reader_from_file(filename: str) -> pa.RecordBatc
 def read_schema_from_file(filename: str) -> pa.Schema:
     """
     Infer arrow table schema from file without loading whole file into memory.
-    Usefull especially while having very big files.
+    Useful especially while having very big files.
     """
     with pa.memory_map(filename) as memory_mapped_stream:
         schema = pa.ipc.open_stream(memory_mapped_stream).schema
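
A quick sketch of using this helper to peek at a saved dataset's schema (the path is hypothetical):

```python
from datasets.table import read_schema_from_file

# Reads only the Arrow stream header via memory mapping, not the whole file.
schema = read_schema_from_file("my_dataset/train/data-00000-of-00001.arrow")
print(schema.names)
```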
