From 58377dd4e9ecf769973dac631ed0db9e929c8ed7 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Apr 2024 16:44:19 +0200
Subject: [PATCH 1/4] Add CLI function to convert script-dataset to Parquet

---
 src/datasets/commands/convert_to_parquet.py | 151 ++++++++++++++++++++
 src/datasets/commands/datasets_cli.py       |   2 +
 2 files changed, 153 insertions(+)
 create mode 100644 src/datasets/commands/convert_to_parquet.py

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
new file mode 100644
index 00000000000..17f5baa748c
--- /dev/null
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -0,0 +1,151 @@
+import time
+from argparse import ArgumentParser
+from typing import Optional
+
+from huggingface_hub import HfApi, get_repo_discussions
+from huggingface_hub.utils import EntryNotFoundError
+
+from datasets import get_dataset_config_names, get_dataset_default_config_name, load_dataset
+from datasets.commands import BaseDatasetsCLICommand
+
+
+def _command_factory(args):
+    return ConvertToParquetCommand(
+        args.dataset_id,
+        args.token,
+        args.revision,
+    )
+
+
+class ConvertToParquetCommand(BaseDatasetsCLICommand):
+    @staticmethod
+    def register_subcommand(parser):
+        parser: ArgumentParser = parser.add_parser("convert_to_parquet", help="Convert dataset to Parquet")
+        parser.add_argument("dataset_id", help="source dataset ID")
+        parser.add_argument("--token", help="access token to the Hugging Face Hub")
+        parser.add_argument("--revision", help="source revision")
+        parser.set_defaults(func=_command_factory)
+
+    def __init__(
+        self,
+        dataset_id: str,
+        token: Optional[str],
+        revision: Optional[str],
+    ):
+        self._dataset_id = dataset_id
+        self._token = token
+        self._revision = revision
+
+    def run(self) -> None:
+        dataset_id = self._dataset_id
+        token = self._token
+        revision = self._revision
+        print(f"{dataset_id}")
+        configs = get_dataset_config_names(dataset_id, token=token, revision=revision)
+        print(f"{configs = }")
+        default_config = get_dataset_default_config_name(dataset_id, token=token, revision=revision)
+        print(f"{default_config = }")
+        if default_config:
+            config = default_config
+            configs.remove(default_config)
+        else:
+            config = configs.pop(0)
+        print(f"{config = }")
+        dataset = load_dataset(dataset_id, config, revision=revision)
+        commit_info = dataset.push_to_hub(
+            dataset_id,
+            config_name=config,
+            commit_message="Convert dataset to Parquet",
+            commit_description="Convert dataset to Parquet.",
+            create_pr=True,
+            token=token,
+            set_default=default_config is not None,
+        )
+        time.sleep(5)
+        if commit_info:
+            pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
+        else:
+            pr_revision, pr_url = infer_pr(dataset_id, token=token)
+        for config in configs:
+            print(f"{config = }")
+            dataset = load_dataset(dataset_id, config, revision=revision)
+            dataset.push_to_hub(
+                dataset_id,
+                config_name=config,
+                commit_message=f"Add {config} data files",
+                revision=pr_revision,
+                token=token,
+            )
+            time.sleep(5)
+        delete_files(dataset_id, revision=pr_revision, token=token)
+        print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")
+
+
+def infer_pr(dataset_id, token=None):
+    discussions = get_repo_discussions(dataset_id, repo_type="dataset", token=token)
+    prs = [discussion for discussion in discussions if discussion.is_pull_request and discussion.status == "open"]
+    pr = sorted(prs, key=lambda pr: pr.num)[-1]
+    return pr.git_reference, pr.url
+
+
+def delete_files(dataset_id, revision=None, token=None):
+    dataset_name = dataset_id.split("/")[-1]
+    hf_api = HfApi(token=token)
+    try:
+        hf_api.delete_file(
+            f"{dataset_name}.py",
+            dataset_id,
+            repo_type="dataset",
+            revision=revision,
+            commit_message="Delete loading script",
+        )
+    except EntryNotFoundError:
+        pass
+    try:
+        hf_api.delete_file(
+            "dataset_infos.json",
+            dataset_id,
+            repo_type="dataset",
+            revision=revision,
+            commit_message="Delete legacy dataset_infos.json",
+        )
+    except EntryNotFoundError:
+        pass
+    repo_files = hf_api.list_repo_files(
+        dataset_id,
+        repo_type="dataset",
+    )
+    if ".gitattributes" in repo_files:
+        repo_files.remove(".gitattributes")
+    if "README.md" in repo_files:
+        repo_files.remove("README.md")
+    if f"{dataset_name}.py" in repo_files:
+        repo_files.remove(f"{dataset_name}.py")
+    if "dataset_infos.json" in repo_files:
+        repo_files.remove("dataset_infos.json")
+    if repo_files:
+        python_files = []
+        data_files = []
+        for filename in repo_files:
+            if filename.endswith(".py"):
+                python_files.append(filename)
+            else:
+                data_files.append(filename)
+        if python_files:
+            for filename in python_files:
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete loading script auxiliary file",
+                )
+        if data_files:
+            for filename in data_files:
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete data file",
+                )
diff --git a/src/datasets/commands/datasets_cli.py b/src/datasets/commands/datasets_cli.py
index 927518e311c..23afce216fa 100644
--- a/src/datasets/commands/datasets_cli.py
+++ b/src/datasets/commands/datasets_cli.py
@@ -2,6 +2,7 @@
 from argparse import ArgumentParser
 
 from datasets.commands.convert import ConvertCommand
+from datasets.commands.convert_to_parquet import ConvertToParquetCommand
 from datasets.commands.dummy_data import DummyDataCommand
 from datasets.commands.env import EnvironmentCommand
 from datasets.commands.run_beam import RunBeamCommand
@@ -26,6 +27,7 @@ def main():
     TestCommand.register_subcommand(commands_parser)
     RunBeamCommand.register_subcommand(commands_parser)
     DummyDataCommand.register_subcommand(commands_parser)
+    ConvertToParquetCommand.register_subcommand(commands_parser)
 
     # Parse args
     args, unknown_args = parser.parse_known_args()

From 999f3da19b6be9b59c4b2a6f64f191f0ddb043c0 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Apr 2024 18:02:46 +0200
Subject: [PATCH 2/4] Refactor code

---
 src/datasets/commands/convert_to_parquet.py | 51 +++++++++------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
index 17f5baa748c..d5855547752 100644
--- a/src/datasets/commands/convert_to_parquet.py
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -91,46 +91,39 @@ def infer_pr(dataset_id, token=None):
 def delete_files(dataset_id, revision=None, token=None):
     dataset_name = dataset_id.split("/")[-1]
     hf_api = HfApi(token=token)
-    try:
-        hf_api.delete_file(
-            f"{dataset_name}.py",
-            dataset_id,
-            repo_type="dataset",
-            revision=revision,
-            commit_message="Delete loading script",
-        )
-    except EntryNotFoundError:
-        pass
-    try:
-        hf_api.delete_file(
-            "dataset_infos.json",
-            dataset_id,
-            repo_type="dataset",
-            revision=revision,
-            commit_message="Delete legacy dataset_infos.json",
-        )
-    except EntryNotFoundError:
-        pass
     repo_files = hf_api.list_repo_files(
         dataset_id,
         repo_type="dataset",
     )
-    if ".gitattributes" in repo_files:
-        repo_files.remove(".gitattributes")
-    if "README.md" in repo_files:
-        repo_files.remove("README.md")
-    if f"{dataset_name}.py" in repo_files:
-        repo_files.remove(f"{dataset_name}.py")
-    if "dataset_infos.json" in repo_files:
-        repo_files.remove("dataset_infos.json")
     if repo_files:
+        legacy_json_file = []
         python_files = []
         data_files = []
         for filename in repo_files:
-            if filename.endswith(".py"):
+            if filename in {".gitattributes", "README.md"}:
+                continue
+            elif filename == f"{dataset_name}.py":
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete loading script",
+                )
+            elif filename == "dataset_infos.json":
+                legacy_json_file.append(filename)
+            elif filename.endswith(".py"):
                 python_files.append(filename)
             else:
                 data_files.append(filename)
+        if legacy_json_file:
+            hf_api.delete_file(
+                "dataset_infos.json",
+                dataset_id,
+                repo_type="dataset",
+                revision=revision,
+                commit_message="Delete legacy dataset_infos.json",
+            )
         if python_files:
             for filename in python_files:
                 hf_api.delete_file(

From 70d5ec65f89ef4259ddce17c868674f3831dc16a Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Apr 2024 09:17:51 +0200
Subject: [PATCH 3/4] Fix quality

---
 src/datasets/commands/convert_to_parquet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
index d5855547752..604307476a3 100644
--- a/src/datasets/commands/convert_to_parquet.py
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -3,7 +3,6 @@
 from typing import Optional
 
 from huggingface_hub import HfApi, get_repo_discussions
-from huggingface_hub.utils import EntryNotFoundError
 
 from datasets import get_dataset_config_names, get_dataset_default_config_name, load_dataset
 from datasets.commands import BaseDatasetsCLICommand

From f9421e55c1fc495746ee9c26078e0720ad0db0e4 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 12 Apr 2024 17:04:30 +0200
Subject: [PATCH 4/4] Add trust_remote_code argument

---
 src/datasets/commands/convert_to_parquet.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
index 604307476a3..cbe154dd469 100644
--- a/src/datasets/commands/convert_to_parquet.py
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -13,6 +13,7 @@ def _command_factory(args):
         args.dataset_id,
         args.token,
         args.revision,
+        args.trust_remote_code,
     )
 
 
@@ -23,6 +24,9 @@ def register_subcommand(parser):
         parser.add_argument("dataset_id", help="source dataset ID")
        parser.add_argument("--token", help="access token to the Hugging Face Hub")
         parser.add_argument("--revision", help="source revision")
+        parser.add_argument(
+            "--trust_remote_code", action="store_true", help="whether to trust the code execution of the load script"
+        )
         parser.set_defaults(func=_command_factory)
 
     def __init__(
@@ -30,19 +34,26 @@ def __init__(
         dataset_id: str,
         token: Optional[str],
         revision: Optional[str],
+        trust_remote_code: bool,
     ):
         self._dataset_id = dataset_id
         self._token = token
         self._revision = revision
+        self._trust_remote_code = trust_remote_code
 
     def run(self) -> None:
         dataset_id = self._dataset_id
         token = self._token
         revision = self._revision
+        trust_remote_code = self._trust_remote_code
         print(f"{dataset_id}")
-        configs = get_dataset_config_names(dataset_id, token=token, revision=revision)
+        configs = get_dataset_config_names(
+            dataset_id, token=token, revision=revision, trust_remote_code=trust_remote_code
+        )
         print(f"{configs = }")
-        default_config = get_dataset_default_config_name(dataset_id, token=token, revision=revision)
+        default_config = get_dataset_default_config_name(
+            dataset_id, token=token, revision=revision, trust_remote_code=trust_remote_code
+        )
         print(f"{default_config = }")
         if default_config:
             config = default_config
@@ -50,7 +61,7 @@ def run(self) -> None:
         else:
             config = configs.pop(0)
         print(f"{config = }")
-        dataset = load_dataset(dataset_id, config, revision=revision)
+        dataset = load_dataset(dataset_id, config, revision=revision, trust_remote_code=trust_remote_code)
         commit_info = dataset.push_to_hub(
             dataset_id,
             config_name=config,
@@ -67,7 +78,7 @@ def run(self) -> None:
             pr_revision, pr_url = infer_pr(dataset_id, token=token)
         for config in configs:
             print(f"{config = }")
-            dataset = load_dataset(dataset_id, config, revision=revision)
+            dataset = load_dataset(dataset_id, config, revision=revision, trust_remote_code=trust_remote_code)
             dataset.push_to_hub(
                 dataset_id,
                 config_name=config,
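
Usage sketch: after applying this series, the subcommand can be invoked as `datasets-cli convert_to_parquet <dataset_id> [--token ...] [--revision ...] [--trust_remote_code]`, assuming the `datasets-cli` console entry point that dispatches through `datasets_cli.py`. It can also be driven programmatically with the class added above; the dataset ID below is a placeholder:

    from datasets.commands.convert_to_parquet import ConvertToParquetCommand

    command = ConvertToParquetCommand(
        dataset_id="USERNAME/DATASET_NAME",  # placeholder: a script-based dataset on the Hub
        token=None,                          # or a Hugging Face access token with write access
        revision=None,                       # source revision; None uses the default branch
        trust_remote_code=True,              # allow executing the dataset's loading script
    )
    command.run()  # opens a PR converting every configuration to Parquet and prints its URL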