Skip to content

Commit a3bc89d

Browse files
Add CLI function to convert script-dataset to Parquet (#6795)
* Add CLI function to convert script-dataset to Parquet
* Refactor code
* Fix quality
* Add trust_remote_code argument
1 parent 828aff9 commit a3bc89d

File tree

2 files changed

+156
-0
lines changed

2 files changed

+156
-0
lines changed
+154
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import time
2+
from argparse import ArgumentParser
3+
from typing import Optional
4+
5+
from huggingface_hub import HfApi, get_repo_discussions
6+
7+
from datasets import get_dataset_config_names, get_dataset_default_config_name, load_dataset
8+
from datasets.commands import BaseDatasetsCLICommand
9+
10+
11+
def _command_factory(args):
    """Instantiate a :class:`ConvertToParquetCommand` from parsed ``argparse`` arguments."""
    return ConvertToParquetCommand(
        args.dataset_id,
        token=args.token,
        revision=args.revision,
        trust_remote_code=args.trust_remote_code,
    )
18+
19+
20+
class ConvertToParquetCommand(BaseDatasetsCLICommand):
    """CLI command that converts a script-based Hub dataset to Parquet.

    Loads every configuration of the source dataset, pushes each one back to
    the Hub as Parquet data files in a single pull request, then deletes the
    loading script and legacy files from the PR branch.
    """

    @staticmethod
    def register_subcommand(parser):
        """Register the ``convert_to_parquet`` subcommand on the main CLI parser."""
        parser: ArgumentParser = parser.add_parser("convert_to_parquet", help="Convert dataset to Parquet")
        parser.add_argument("dataset_id", help="source dataset ID")
        parser.add_argument("--token", help="access token to the Hugging Face Hub")
        parser.add_argument("--revision", help="source revision")
        parser.add_argument(
            "--trust_remote_code", action="store_true", help="whether to trust the code execution of the load script"
        )
        parser.set_defaults(func=_command_factory)

    def __init__(
        self,
        dataset_id: str,
        token: Optional[str],
        revision: Optional[str],
        trust_remote_code: bool,
    ):
        self._dataset_id = dataset_id
        self._token = token
        self._revision = revision
        self._trust_remote_code = trust_remote_code

    def run(self) -> None:
        """Convert the dataset to Parquet and open a pull request on the Hub.

        The first config is pushed with ``create_pr=True`` (opening the PR);
        the remaining configs are pushed to that same PR branch. Finally the
        loading script and legacy files are deleted from the PR branch.
        """
        dataset_id = self._dataset_id
        token = self._token
        revision = self._revision
        trust_remote_code = self._trust_remote_code
        print(dataset_id)
        configs = get_dataset_config_names(
            dataset_id, token=token, revision=revision, trust_remote_code=trust_remote_code
        )
        print(f"{configs = }")
        default_config = get_dataset_default_config_name(
            dataset_id, token=token, revision=revision, trust_remote_code=trust_remote_code
        )
        print(f"{default_config = }")
        # Push the default config first (if any) so the PR starts with it;
        # otherwise use the first listed config.
        if default_config:
            config = default_config
            configs.remove(default_config)
        else:
            config = configs.pop(0)
        print(f"{config = }")
        dataset = load_dataset(dataset_id, config, revision=revision, trust_remote_code=trust_remote_code)
        commit_info = dataset.push_to_hub(
            dataset_id,
            config_name=config,
            commit_message="Convert dataset to Parquet",
            commit_description="Convert dataset to Parquet.",
            create_pr=True,
            token=token,
            set_default=default_config is not None,
        )
        time.sleep(5)
        if commit_info:
            pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
        else:
            # Fallback: push_to_hub returned nothing usable, so locate the
            # freshly opened PR by listing the repo's open discussions.
            pr_revision, pr_url = infer_pr(dataset_id, token=token)
        for config in configs:
            print(f"{config = }")
            dataset = load_dataset(dataset_id, config, revision=revision, trust_remote_code=trust_remote_code)
            dataset.push_to_hub(
                dataset_id,
                config_name=config,
                commit_message=f"Add {config} data files",
                revision=pr_revision,
                token=token,
            )
            # Brief pause between pushes to avoid hammering the Hub API.
            time.sleep(5)
        delete_files(dataset_id, revision=pr_revision, token=token)
        print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")
92+
93+
94+
def infer_pr(dataset_id, token=None):
    """Return ``(git_reference, url)`` of the most recent open PR on the dataset repo.

    Used as a fallback when ``push_to_hub`` does not return commit info for the
    pull request it created.

    Raises:
        ValueError: if the repo has no open pull request.
    """
    discussions = get_repo_discussions(dataset_id, repo_type="dataset", token=token)
    prs = [discussion for discussion in discussions if discussion.is_pull_request and discussion.status == "open"]
    if not prs:
        raise ValueError(f"No open pull request found for dataset '{dataset_id}'")
    # Highest discussion number == most recently opened PR.
    pr = max(prs, key=lambda pr: pr.num)
    return pr.git_reference, pr.url
99+
100+
101+
def delete_files(dataset_id, revision=None, token=None):
    """Delete the loading script, auxiliary ``.py`` files, legacy
    ``dataset_infos.json`` and remaining data files from the dataset repo.

    Files are listed on the repo's default branch but deleted on ``revision``
    (presumably the PR branch created by the conversion — the newly pushed
    Parquet files live only there and are therefore not listed; TODO confirm).
    ``.gitattributes`` and ``README.md`` are always kept.
    """
    dataset_name = dataset_id.split("/")[-1]
    hf_api = HfApi(token=token)

    def _delete(filename, commit_message):
        # One commit per deleted file, targeting the PR revision.
        hf_api.delete_file(
            filename,
            dataset_id,
            repo_type="dataset",
            revision=revision,
            commit_message=commit_message,
        )

    repo_files = hf_api.list_repo_files(
        dataset_id,
        repo_type="dataset",
    )
    if repo_files:
        legacy_json_file = []
        python_files = []
        data_files = []
        for filename in repo_files:
            if filename in {".gitattributes", "README.md"}:
                continue  # keep repo metadata and the dataset card
            elif filename == f"{dataset_name}.py":
                _delete(filename, "Delete loading script")
            elif filename == "dataset_infos.json":
                legacy_json_file.append(filename)
            elif filename.endswith(".py"):
                python_files.append(filename)
            else:
                data_files.append(filename)
        if legacy_json_file:
            _delete("dataset_infos.json", "Delete legacy dataset_infos.json")
        for filename in python_files:
            _delete(filename, "Delete loading script auxiliary file")
        for filename in data_files:
            _delete(filename, "Delete data file")

src/datasets/commands/datasets_cli.py

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from argparse import ArgumentParser
33

44
from datasets.commands.convert import ConvertCommand
5+
from datasets.commands.convert_to_parquet import ConvertToParquetCommand
56
from datasets.commands.dummy_data import DummyDataCommand
67
from datasets.commands.env import EnvironmentCommand
78
from datasets.commands.run_beam import RunBeamCommand
@@ -26,6 +27,7 @@ def main():
2627
TestCommand.register_subcommand(commands_parser)
2728
RunBeamCommand.register_subcommand(commands_parser)
2829
DummyDataCommand.register_subcommand(commands_parser)
30+
ConvertToParquetCommand.register_subcommand(commands_parser)
2931

3032
# Parse args
3133
args, unknown_args = parser.parse_known_args()

0 commit comments

Comments
 (0)