Commit b275462

Remove beam (#6987)
* Delete beam tests
* Delete BeamBasedBuilder
* Delete BeamWriter
* Delete RunBeamCommand
* Delete DownloadManager.ship_files_with_pipeline
* Delete beam_utils
* Delete require_beam
* Delete config beam variables
* Delete apache-beam extras require
* Update setup.py without dependency conflicting with apache-beam
* Delete Beam from docs
* Delete Beam from comments and docstrings
* Delete tests of HF GCP
1 parent 637246b commit b275462

19 files changed: +9 -1069 lines

docs/source/_toctree.yml (-2)

@@ -50,8 +50,6 @@
   title: CLI
 - local: how_to_metrics
   title: Metrics
-- local: beam
-  title: Beam Datasets
 - local: troubleshoot
   title: Troubleshooting
 title: "General usage"

docs/source/beam.mdx (-52)

This file was deleted.

docs/source/cli.mdx (+1 -2)

@@ -8,12 +8,11 @@ You can check the available commands:
 usage: datasets-cli <command> [<args>]

 positional arguments:
-  {convert,env,test,run_beam,dummy_data,convert_to_parquet}
+  {convert,env,test,dummy_data,convert_to_parquet}
                         datasets-cli command helpers
     convert             Convert a TensorFlow Datasets dataset to a HuggingFace Datasets dataset.
     env                 Print relevant system environment info.
     test                Test dataset implementation.
-    run_beam            Run a Beam dataset processing pipeline
     dummy_data          Generate dummy data.
     convert_to_parquet  Convert dataset to Parquet
     delete_from_hub     Delete dataset config from the Hub
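
With `run_beam` gone from the CLI, a script-based dataset is prepared through the regular (non-Beam) path. A minimal sketch of the programmatic equivalent, assuming a local dataset script at a hypothetical path:

```python
from datasets import load_dataset_builder

# Hypothetical local dataset script; a Hub dataset id works the same way.
builder = load_dataset_builder("path/to/my_dataset_script.py")

# The former Beam-specific preparation (datasets-cli run_beam) is replaced by the
# standard download-and-prepare flow.
builder.download_and_prepare()
print(builder.info.splits)
```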

docs/source/package_reference/builder_classes.mdx (-2)

@@ -8,8 +8,6 @@
 [[autodoc]] datasets.GeneratorBasedBuilder

-[[autodoc]] datasets.BeamBasedBuilder
-
 [[autodoc]] datasets.ArrowBasedBuilder

 [[autodoc]] datasets.BuilderConfig
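
With `BeamBasedBuilder` dropped from the builder reference, datasets that subclassed it would be rewritten against one of the remaining builders. A minimal `GeneratorBasedBuilder` sketch; the class name, URL, and schema below are illustrative, not taken from the commit:

```python
import datasets


class MyDataset(datasets.GeneratorBasedBuilder):
    """Illustrative replacement for a builder that previously subclassed BeamBasedBuilder."""

    def _info(self):
        return datasets.DatasetInfo(
            features=datasets.Features({"text": datasets.Value("string")}),
        )

    def _split_generators(self, dl_manager):
        # Placeholder URL; dl_manager.download returns a local path to the downloaded file.
        path = dl_manager.download("https://example.com/data.txt")
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"path": path})]

    def _generate_examples(self, path):
        # Yield (key, example) pairs, one per line of the downloaded file.
        with open(path, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                yield idx, {"text": line.strip()}
```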

setup.py (-2)

@@ -180,7 +180,6 @@
     "torch>=2.0.0",
     "soundfile>=0.12.1",
     "transformers",
-    "typing-extensions>=4.6.1",  # due to conflict between apache-beam and pydantic
     "zstandard",
     "polars[timezone]>=0.20.0",
 ]
@@ -230,7 +229,6 @@
 EXTRAS_REQUIRE = {
     "audio": AUDIO_REQUIRE,
     "vision": VISION_REQUIRE,
-    "apache-beam": ["apache-beam>=2.26.0"],
     "tensorflow": [
         "tensorflow>=2.6.0",
     ],

src/datasets/__init__.py (+1 -1)

@@ -16,7 +16,7 @@
 from .arrow_dataset import Dataset
 from .arrow_reader import ReadInstruction
-from .builder import ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
+from .builder import ArrowBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder
 from .combine import concatenate_datasets, interleave_datasets
 from .dataset_dict import DatasetDict, IterableDatasetDict
 from .download import *
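
Downstream code that still imports the removed class can guard for it. This sketch is illustrative and not part of the commit:

```python
# BeamBasedBuilder is no longer exported from the top-level package after this change.
try:
    from datasets import BeamBasedBuilder  # available only on older datasets versions
except ImportError:
    BeamBasedBuilder = None  # removed; migrate to GeneratorBasedBuilder or ArrowBasedBuilder

from datasets import ArrowBasedBuilder, DatasetBuilder, GeneratorBasedBuilder
```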

src/datasets/arrow_writer.py (-132)

@@ -13,11 +13,8 @@
 # Lint as: python3
 """To write records into Parquet files."""

-import errno
 import json
-import os
 import sys
-from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import fsspec
@@ -43,8 +40,6 @@
 from .keyhash import DuplicatedKeysError, KeyHasher
 from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
 from .utils import logging
-from .utils import tqdm as hf_tqdm
-from .utils.file_utils import hash_url_to_filename
 from .utils.py_utils import asdict, first_non_null_value


@@ -617,130 +612,3 @@ def finalize(self, close_stream=True):

 class ParquetWriter(ArrowWriter):
     _WRITER_CLASS = pq.ParquetWriter
-
-
-class BeamWriter:
-    """
-    Shuffles and writes Examples to Arrow files.
-    The Arrow files are converted from Parquet files that are the output of Apache Beam pipelines.
-    """
-
-    def __init__(
-        self,
-        features: Optional[Features] = None,
-        schema: Optional[pa.Schema] = None,
-        path: Optional[str] = None,
-        namespace: Optional[str] = None,
-        cache_dir: Optional[str] = None,
-    ):
-        if features is None and schema is None:
-            raise ValueError("At least one of features and schema must be provided.")
-        if path is None:
-            raise ValueError("Path must be provided.")
-
-        if features is not None:
-            self._features: Features = features
-            self._schema: pa.Schema = features.arrow_schema
-        else:
-            self._schema: pa.Schema = schema
-            self._features: Features = Features.from_arrow_schema(schema)
-
-        self._path = path
-        self._parquet_path = os.path.splitext(path)[0]  # remove extension
-        self._namespace = namespace or "default"
-        self._num_examples = None
-        self._cache_dir = cache_dir or config.HF_DATASETS_CACHE
-
-    def write_from_pcollection(self, pcoll_examples):
-        """Add the final steps of the beam pipeline: write to parquet files."""
-        import apache_beam as beam
-
-        def inc_num_examples(example):
-            beam.metrics.Metrics.counter(self._namespace, "num_examples").inc()
-
-        # count examples
-        _ = pcoll_examples | "Count N. Examples" >> beam.Map(inc_num_examples)
-
-        # save dataset
-        return (
-            pcoll_examples
-            | "Get values" >> beam.Values()
-            | "Save to parquet"
-            >> beam.io.parquetio.WriteToParquet(
-                self._parquet_path, self._schema, shard_name_template="-SSSSS-of-NNNNN.parquet"
-            )
-        )
-
-    def finalize(self, metrics_query_result: dict):
-        """
-        Run after the pipeline has finished.
-        It converts the resulting parquet files to arrow and it completes the info from the pipeline metrics.
-
-        Args:
-            metrics_query_result: `dict` obtained from pipeline_results.metrics().query(m_filter). Make sure
-                that the filter keeps only the metrics for the considered split, under the namespace `split_name`.
-        """
-
-        # Beam FileSystems require the system's path separator in the older versions
-        fs, parquet_path = url_to_fs(self._parquet_path)
-        parquet_path = str(Path(parquet_path)) if not is_remote_filesystem(fs) else fs.unstrip_protocol(parquet_path)
-
-        shards = fs.glob(parquet_path + "*.parquet")
-        num_bytes = sum(fs.sizes(shards))
-        shard_lengths = get_parquet_lengths(shards)
-
-        # Convert to arrow
-        if self._path.endswith(".arrow"):
-            logger.info(f"Converting parquet files {self._parquet_path} to arrow {self._path}")
-            try:  # stream conversion
-                num_bytes = 0
-                for shard in hf_tqdm(shards, unit="shards"):
-                    with fs.open(shard, "rb") as source:
-                        with fs.open(shard.replace(".parquet", ".arrow"), "wb") as destination:
-                            shard_num_bytes, _ = parquet_to_arrow(source, destination)
-                            num_bytes += shard_num_bytes
-            except OSError as e:  # broken pipe can happen if the connection is unstable, do local conversion instead
-                if e.errno != errno.EPIPE:  # not a broken pipe
-                    raise
-                logger.warning(
-                    "Broken Pipe during stream conversion from parquet to arrow. Using local convert instead"
-                )
-                local_convert_dir = os.path.join(self._cache_dir, "beam_convert")
-                os.makedirs(local_convert_dir, exist_ok=True)
-                num_bytes = 0
-                for shard in hf_tqdm(shards, unit="shards"):
-                    local_parquet_path = os.path.join(local_convert_dir, hash_url_to_filename(shard) + ".parquet")
-                    fs.download(shard, local_parquet_path)
-                    local_arrow_path = local_parquet_path.replace(".parquet", ".arrow")
-                    shard_num_bytes, _ = parquet_to_arrow(local_parquet_path, local_arrow_path)
-                    num_bytes += shard_num_bytes
-                    remote_arrow_path = shard.replace(".parquet", ".arrow")
-                    fs.upload(local_arrow_path, remote_arrow_path)
-
-        # Save metrics
-        counters_dict = {metric.key.metric.name: metric.result for metric in metrics_query_result["counters"]}
-        self._num_examples = counters_dict["num_examples"]
-        self._num_bytes = num_bytes
-        self._shard_lengths = shard_lengths
-        return self._num_examples, self._num_bytes
-
-
-def get_parquet_lengths(sources) -> List[int]:
-    shard_lengths = []
-    for source in hf_tqdm(sources, unit="parquet files"):
-        parquet_file = pa.parquet.ParquetFile(source)
-        shard_lengths.append(parquet_file.metadata.num_rows)
-    return shard_lengths
-
-
-def parquet_to_arrow(source, destination) -> List[int]:
-    """Convert parquet file to arrow file. Inputs can be str paths or file-like objects"""
-    stream = None if isinstance(destination, str) else destination
-    parquet_file = pa.parquet.ParquetFile(source)
-    # Beam can create empty Parquet files, so we need to pass the source Parquet file's schema
-    with ArrowWriter(schema=parquet_file.schema_arrow, path=destination, stream=stream) as writer:
-        for record_batch in parquet_file.iter_batches():
-            pa_table = pa.Table.from_batches([record_batch])
-            writer.write_table(pa_table)
-        num_bytes, num_examples = writer.finalize()
-    return num_bytes, num_examples
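
The deleted `parquet_to_arrow` helper wrapped the internal `ArrowWriter`. For anyone who still needs the same shard conversion outside `datasets`, a rough plain-pyarrow equivalent is sketched below; the paths are placeholders, and using the Arrow IPC stream format is an assumption about what `ArrowWriter` emits:

```python
import pyarrow as pa
import pyarrow.parquet as pq


def parquet_shard_to_arrow(source: str, destination: str) -> int:
    """Convert one Parquet shard to an Arrow IPC stream file, batch by batch.

    Returns the number of rows written. Paths are local-file placeholders.
    """
    parquet_file = pq.ParquetFile(source)
    num_rows = 0
    # Pass the Parquet file's schema explicitly so empty shards still yield a valid Arrow file.
    with pa.OSFile(destination, "wb") as sink:
        with pa.ipc.new_stream(sink, parquet_file.schema_arrow) as writer:
            for record_batch in parquet_file.iter_batches():
                writer.write_batch(record_batch)
                num_rows += record_batch.num_rows
    return num_rows
```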
