From 58377dd4e9ecf769973dac631ed0db9e929c8ed7 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Apr 2024 16:44:19 +0200
Subject: [PATCH 1/4] Add CLI function to convert script-dataset to Parquet

---
 src/datasets/commands/convert_to_parquet.py | 151 ++++++++++++++++++++
 src/datasets/commands/datasets_cli.py       |   2 +
 2 files changed, 153 insertions(+)
 create mode 100644 src/datasets/commands/convert_to_parquet.py

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
new file mode 100644
index 00000000000..17f5baa748c
--- /dev/null
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -0,0 +1,151 @@
+import time
+from argparse import ArgumentParser
+from typing import Optional
+
+from huggingface_hub import HfApi, get_repo_discussions
+from huggingface_hub.utils import EntryNotFoundError
+
+from datasets import get_dataset_config_names, get_dataset_default_config_name, load_dataset
+from datasets.commands import BaseDatasetsCLICommand
+
+
+def _command_factory(args):
+    return ConvertToParquetCommand(
+        args.dataset_id,
+        args.token,
+        args.revision,
+    )
+
+
+class ConvertToParquetCommand(BaseDatasetsCLICommand):
+    @staticmethod
+    def register_subcommand(parser):
+        parser: ArgumentParser = parser.add_parser("convert_to_parquet", help="Convert dataset to Parquet")
+        parser.add_argument("dataset_id", help="source dataset ID")
+        parser.add_argument("--token", help="access token to the Hugging Face Hub")
+        parser.add_argument("--revision", help="source revision")
+        parser.set_defaults(func=_command_factory)
+
+    def __init__(
+        self,
+        dataset_id: str,
+        token: Optional[str],
+        revision: Optional[str],
+    ):
+        self._dataset_id = dataset_id
+        self._token = token
+        self._revision = revision
+
+    def run(self) -> None:
+        dataset_id = self._dataset_id
+        token = self._token
+        revision = self._revision
+        print(f"{dataset_id}")
+        configs = get_dataset_config_names(dataset_id, token=token, revision=revision)
+        print(f"{configs = }")
+        default_config = get_dataset_default_config_name(dataset_id, token=token, revision=revision)
+        print(f"{default_config = }")
+        if default_config:
+            config = default_config
+            configs.remove(default_config)
+        else:
+            config = configs.pop(0)
+        print(f"{config = }")
+        dataset = load_dataset(dataset_id, config, revision=revision)
+        commit_info = dataset.push_to_hub(
+            dataset_id,
+            config_name=config,
+            commit_message="Convert dataset to Parquet",
+            commit_description="Convert dataset to Parquet.",
+            create_pr=True,
+            token=token,
+            set_default=default_config is not None,
+        )
+        time.sleep(5)
+        if commit_info:
+            pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
+        else:
+            pr_revision, pr_url = infer_pr(dataset_id, token=token)
+        for config in configs:
+            print(f"{config = }")
+            dataset = load_dataset(dataset_id, config, revision=revision)
+            dataset.push_to_hub(
+                dataset_id,
+                config_name=config,
+                commit_message=f"Add {config} data files",
+                revision=pr_revision,
+                token=token,
+            )
+            time.sleep(5)
+        delete_files(dataset_id, revision=pr_revision, token=token)
+        print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")
+
+
+def infer_pr(dataset_id, token=None):
+    discussions = get_repo_discussions(dataset_id, repo_type="dataset", token=token)
+    prs = [discussion for discussion in discussions if discussion.is_pull_request and discussion.status == "open"]
+    pr = sorted(prs, key=lambda pr: pr.num)[-1]
+    return pr.git_reference, pr.url
+
+
+def delete_files(dataset_id, revision=None, token=None):
+    dataset_name = dataset_id.split("/")[-1]
+    hf_api = HfApi(token=token)
+    try:
+        hf_api.delete_file(
+            f"{dataset_name}.py",
+            dataset_id,
+            repo_type="dataset",
+            revision=revision,
+            commit_message="Delete loading script",
+        )
+    except EntryNotFoundError:
+        pass
+    try:
+        hf_api.delete_file(
+            "dataset_infos.json",
+            dataset_id,
+            repo_type="dataset",
+            revision=revision,
+            commit_message="Delete legacy dataset_infos.json",
+        )
+    except EntryNotFoundError:
+        pass
+    repo_files = hf_api.list_repo_files(
+        dataset_id,
+        repo_type="dataset",
+    )
+    if ".gitattributes" in repo_files:
+        repo_files.remove(".gitattributes")
+    if "README.md" in repo_files:
+        repo_files.remove("README.md")
+    if f"{dataset_name}.py" in repo_files:
+        repo_files.remove(f"{dataset_name}.py")
+    if "dataset_infos.json" in repo_files:
+        repo_files.remove("dataset_infos.json")
+    if repo_files:
+        python_files = []
+        data_files = []
+        for filename in repo_files:
+            if filename.endswith(".py"):
+                python_files.append(filename)
+            else:
+                data_files.append(filename)
+        if python_files:
+            for filename in python_files:
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete loading script auxiliary file",
+                )
+        if data_files:
+            for filename in data_files:
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete data file",
+                )
diff --git a/src/datasets/commands/datasets_cli.py b/src/datasets/commands/datasets_cli.py
index 927518e311c..23afce216fa 100644
--- a/src/datasets/commands/datasets_cli.py
+++ b/src/datasets/commands/datasets_cli.py
@@ -2,6 +2,7 @@
 from argparse import ArgumentParser
 
 from datasets.commands.convert import ConvertCommand
+from datasets.commands.convert_to_parquet import ConvertToParquetCommand
 from datasets.commands.dummy_data import DummyDataCommand
 from datasets.commands.env import EnvironmentCommand
 from datasets.commands.run_beam import RunBeamCommand
@@ -26,6 +27,7 @@ def main():
     TestCommand.register_subcommand(commands_parser)
     RunBeamCommand.register_subcommand(commands_parser)
     DummyDataCommand.register_subcommand(commands_parser)
+    ConvertToParquetCommand.register_subcommand(commands_parser)
 
     # Parse args
     args, unknown_args = parser.parse_known_args()

From 999f3da19b6be9b59c4b2a6f64f191f0ddb043c0 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Apr 2024 18:02:46 +0200
Subject: [PATCH 2/4] Refactor code

---
 src/datasets/commands/convert_to_parquet.py | 51 +++++++++------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
index 17f5baa748c..d5855547752 100644
--- a/src/datasets/commands/convert_to_parquet.py
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -91,46 +91,39 @@ def infer_pr(dataset_id, token=None):
 def delete_files(dataset_id, revision=None, token=None):
     dataset_name = dataset_id.split("/")[-1]
     hf_api = HfApi(token=token)
-    try:
-        hf_api.delete_file(
-            f"{dataset_name}.py",
-            dataset_id,
-            repo_type="dataset",
-            revision=revision,
-            commit_message="Delete loading script",
-        )
-    except EntryNotFoundError:
-        pass
-    try:
-        hf_api.delete_file(
-            "dataset_infos.json",
-            dataset_id,
-            repo_type="dataset",
-            revision=revision,
-            commit_message="Delete legacy dataset_infos.json",
-        )
-    except EntryNotFoundError:
-        pass
     repo_files = hf_api.list_repo_files(
         dataset_id,
         repo_type="dataset",
     )
-    if ".gitattributes" in repo_files:
-        repo_files.remove(".gitattributes")
-    if "README.md" in repo_files:
-        repo_files.remove("README.md")
-    if f"{dataset_name}.py" in repo_files:
-        repo_files.remove(f"{dataset_name}.py")
-    if "dataset_infos.json" in repo_files:
-        repo_files.remove("dataset_infos.json")
     if repo_files:
+        legacy_json_file = []
         python_files = []
         data_files = []
         for filename in repo_files:
-            if filename.endswith(".py"):
+            if filename in {".gitattributes", "README.md"}:
+                continue
+            elif filename == f"{dataset_name}.py":
+                hf_api.delete_file(
+                    filename,
+                    dataset_id,
+                    repo_type="dataset",
+                    revision=revision,
+                    commit_message="Delete loading script",
+                )
+            elif filename == "dataset_infos.json":
+                legacy_json_file.append(filename)
+            elif filename.endswith(".py"):
                 python_files.append(filename)
             else:
                 data_files.append(filename)
+        if legacy_json_file:
+            hf_api.delete_file(
+                "dataset_infos.json",
+                dataset_id,
+                repo_type="dataset",
+                revision=revision,
+                commit_message="Delete legacy dataset_infos.json",
+            )
         if python_files:
             for filename in python_files:
                 hf_api.delete_file(

From 70d5ec65f89ef4259ddce17c868674f3831dc16a Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Apr 2024 09:17:51 +0200
Subject: [PATCH 3/4] Fix quality

---
 src/datasets/commands/convert_to_parquet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
index d5855547752..604307476a3 100644
--- a/src/datasets/commands/convert_to_parquet.py
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -3,7 +3,6 @@
 from typing import Optional
 
 from huggingface_hub import HfApi, get_repo_discussions
-from huggingface_hub.utils import EntryNotFoundError
 
 from datasets import get_dataset_config_names, get_dataset_default_config_name, load_dataset
 from datasets.commands import BaseDatasetsCLICommand

From f9421e55c1fc495746ee9c26078e0720ad0db0e4 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Fri, 12 Apr 2024 17:04:30 +0200
Subject: [PATCH 4/4] Add trust_remote_code argument

---
 src/datasets/commands/convert_to_parquet.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/datasets/commands/convert_to_parquet.py b/src/datasets/commands/convert_to_parquet.py
index 604307476a3..cbe154dd469 100644
--- a/src/datasets/commands/convert_to_parquet.py
+++ b/src/datasets/commands/convert_to_parquet.py
@@ -13,6 +13,7 @@ def _command_factory(args):
         args.dataset_id,
         args.token,
         args.revision,
+        args.trust_remote_code,
     )
 
 
@@ -23,6 +24,9 @@ def register_subcommand(parser):
         parser.add_argument("dataset_id", help="source dataset ID")
        parser.add_argument("--token", help="access token to the Hugging Face Hub")
         parser.add_argument("--revision", help="source revision")
+        parser.add_argument(
+            "--trust_remote_code", action="store_true", help="whether to trust the code execution of the load script"
+        )
         parser.set_defaults(func=_command_factory)
 
     def __init__(
@@ -30,19 +34,26 @@ def __init__(
         dataset_id: str,
         token: Optional[str],
         revision: Optional[str],
+        trust_remote_code: bool,
     ):
         self._dataset_id = dataset_id
         self._token = token
         self._revision = revision
+        self._trust_remote_code = trust_remote_code
 
     def run(self) -> None:
         dataset_id = self._dataset_id
         token = self._token
         revision = self._revision
+        trust_remote_code = self._trust_remote_code
         print(f"{dataset_id}")
-        configs = get_dataset_config_names(dataset_id, token=token, revision=revision)
+        configs = get_dataset_config_names(
+            dataset_id, token=token, revision=revision, trust_remote_code=trust_remote_code
+        )
         print(f"{configs = }")
-        default_config = get_dataset_default_config_name(dataset_id, token=token, revision=revision)
+        default_config = get_dataset_default_config_name(
+            dataset_id, token=token, revision=revision, trust_remote_code=trust_remote_code
+        )
         print(f"{default_config = }")
         if default_config:
             config = default_config
@@ -50,7 +61,7 @@ def run(self) -> None:
         else:
             config = configs.pop(0)
         print(f"{config = }")
-        dataset = load_dataset(dataset_id, config, revision=revision)
+        dataset = load_dataset(dataset_id, config, revision=revision, trust_remote_code=trust_remote_code)
         commit_info = dataset.push_to_hub(
             dataset_id,
             config_name=config,
@@ -67,7 +78,7 @@ def run(self) -> None:
             pr_revision, pr_url = infer_pr(dataset_id, token=token)
         for config in configs:
             print(f"{config = }")
-            dataset = load_dataset(dataset_id, config, revision=revision)
+            dataset = load_dataset(dataset_id, config, revision=revision, trust_remote_code=trust_remote_code)
             dataset.push_to_hub(
                 dataset_id,
                 config_name=config,
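
Usage sketch: after applying this series, the subcommand can be invoked as `datasets-cli convert_to_parquet <dataset_id> [--token ...] [--revision ...] [--trust_remote_code]`, assuming the `datasets-cli` console entry point that dispatches through `datasets_cli.py`. It can also be driven programmatically with the class added above; the dataset ID below is a placeholder:

    from datasets.commands.convert_to_parquet import ConvertToParquetCommand

    command = ConvertToParquetCommand(
        dataset_id="USERNAME/DATASET_NAME",  # placeholder: a script-based dataset on the Hub
        token=None,                          # or a Hugging Face access token with write access
        revision=None,                       # source revision; None uses the default branch
        trust_remote_code=True,              # allow executing the dataset's loading script
    )
    command.run()  # opens a PR converting every configuration to Parquet and prints its URL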