Skip to content

Commit 5fa0103

Browse files
committed
refactor(duckdb): remove the pyarrow read_parquet fallback
1 parent b15f0c3 commit 5fa0103

File tree

8 files changed

+16
-101
lines changed

8 files changed

+16
-101
lines changed

conda/environment-arm64-flink.yml

-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ dependencies:
5959
- pytest-benchmark >=3.4.1,<5
6060
- pytest-clarity >=1.0.1,<2
6161
- pytest-cov >=3.0.0,<5
62-
- pytest-httpserver >=1.0.5,<2
6362
- pytest-mock >=3.6.1,<4
6463
- pytest-randomly >=3.10.1,<4
6564
- pytest-repeat >=0.9.1,<0.10

conda/environment-arm64.yml

-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ dependencies:
5858
- pytest-benchmark >=3.4.1,<5
5959
- pytest-clarity >=1.0.1,<2
6060
- pytest-cov >=3.0.0,<5
61-
- pytest-httpserver >=1.0.5,<2
6261
- pytest-mock >=3.6.1,<4
6362
- pytest-randomly >=3.10.1,<4
6463
- pytest-repeat >=0.9.1,<0.10

conda/environment.yml

-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ dependencies:
6161
- pytest-benchmark >=3.4.1,<5
6262
- pytest-clarity >=1.0.1,<2
6363
- pytest-cov >=3.0.0,<5
64-
- pytest-httpserver >=1.0.5,<2
6564
- pytest-mock >=3.6.1,<4
6665
- pytest-randomly >=3.10.1,<4
6766
- pytest-repeat >=0.9.1,<0.10

ibis/backends/duckdb/__init__.py

+15-40
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
from typing import TYPE_CHECKING, Any, Literal
1212

1313
import duckdb
14-
import pyarrow as pa
15-
import pyarrow_hotfix # noqa: F401
1614
import sqlglot as sg
1715
import sqlglot.expressions as sge
1816
from packaging.version import parse as vparse
@@ -26,7 +24,6 @@
2624
import ibis.expr.types as ir
2725
from ibis import util
2826
from ibis.backends import CanCreateDatabase, UrlFromPath
29-
from ibis.backends.duckdb.converter import DuckDBPandasData, DuckDBPyArrowData
3027
from ibis.backends.sql import SQLBackend
3128
from ibis.backends.sql.compilers.base import STAR, AlterTable, C, RenameTable
3229
from ibis.common.dispatch import lazy_singledispatch
@@ -37,6 +34,8 @@
3734

3835
import pandas as pd
3936
import polars as pl
37+
import pyarrow as pa
38+
import pyarrow_hotfix # noqa: F401
4039
import torch
4140
from fsspec import AbstractFileSystem
4241

@@ -783,48 +782,17 @@ def read_parquet(
783782

784783
table_name = table_name or util.gen_name("read_parquet")
785784

786-
# Default to using the native duckdb parquet reader
787-
# If that fails because of auth issues, fall back to ingesting via
788-
# pyarrow dataset
789-
try:
790-
self._read_parquet_duckdb_native(paths, table_name, **kwargs)
791-
except duckdb.IOException:
792-
self._read_parquet_pyarrow_dataset(paths, table_name, **kwargs)
793-
794-
return self.table(table_name)
795-
796-
def _read_parquet_duckdb_native(
797-
self, source_list: str | Iterable[str], table_name: str, **kwargs: Any
798-
) -> None:
799-
if any(
800-
source.startswith(("http://", "https://", "s3://"))
801-
for source in source_list
802-
):
785+
if any(path.startswith(("http://", "https://", "s3://")) for path in paths):
803786
self._load_extensions(["httpfs"])
804787

805788
options = [
806789
sg.to_identifier(key).eq(sge.convert(val)) for key, val in kwargs.items()
807790
]
808791
self._create_temp_view(
809792
table_name,
810-
sg.select(STAR).from_(self.compiler.f.read_parquet(source_list, *options)),
793+
sg.select(STAR).from_(self.compiler.f.read_parquet(paths, *options)),
811794
)
812-
813-
def _read_parquet_pyarrow_dataset(
814-
self, source_list: str | Iterable[str], table_name: str, **kwargs: Any
815-
) -> None:
816-
import pyarrow.dataset as ds
817-
818-
dataset = ds.dataset(list(map(ds.dataset, source_list)), **kwargs)
819-
self._load_extensions(["httpfs"])
820-
# We don't create a view since DuckDB special cases Arrow Datasets
821-
# so if we also create a view we end up with both a "lazy table"
822-
# and a view with the same name
823-
self.con.register(table_name, dataset)
824-
# DuckDB normally auto-detects Arrow Datasets that are defined
825-
# in local variables but the `dataset` variable won't be local
826-
# by the time we execute against this so we register it
827-
# explicitly.
795+
return self.table(table_name)
828796

829797
def read_delta(
830798
self, path: str | Path, /, *, table_name: str | None = None, **kwargs: Any
@@ -1288,6 +1256,9 @@ def to_pyarrow_batches(
12881256
chunk_size
12891257
The number of rows to fetch per batch
12901258
"""
1259+
import pyarrow as pa
1260+
import pyarrow_hotfix # noqa: F401
1261+
12911262
self._run_pre_execute_hooks(expr)
12921263
table = expr.as_table()
12931264
sql = self.compile(table, limit=limit, params=params)
@@ -1309,6 +1280,8 @@ def to_pyarrow(
13091280
limit: int | str | None = None,
13101281
**kwargs: Any,
13111282
) -> pa.Table:
1283+
from ibis.backends.duckdb.converter import DuckDBPyArrowData
1284+
13121285
table = self._to_duckdb_relation(
13131286
expr, params=params, limit=limit, **kwargs
13141287
).arrow()
@@ -1326,10 +1299,12 @@ def execute(
13261299
"""Execute an expression."""
13271300
import pandas as pd
13281301
import pyarrow.types as pat
1302+
import pyarrow_hotfix # noqa: F401
13291303

1330-
table = self._to_duckdb_relation(
1331-
expr, params=params, limit=limit, **kwargs
1332-
).arrow()
1304+
from ibis.backends.duckdb.converter import DuckDBPandasData
1305+
1306+
rel = self._to_duckdb_relation(expr, params=params, limit=limit, **kwargs)
1307+
table = rel.arrow()
13331308

13341309
df = pd.DataFrame(
13351310
{

ibis/backends/duckdb/tests/test_io.py

+1-29
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import ibis
1616
import ibis.expr.datatypes as dt
17-
from ibis.conftest import ARM64, LINUX, MACOS, SANDBOXED
17+
from ibis.conftest import LINUX, SANDBOXED
1818
from ibis.util import gen_name
1919

2020

@@ -344,34 +344,6 @@ def test_temp_dir_set(tmp_path, database):
344344
assert con.settings["temp_directory"] == str(temp_directory)
345345

346346

347-
@pytest.mark.xfail(
348-
SANDBOXED and LINUX,
349-
reason=(
350-
"nix on linux cannot download duckdb extensions or data due to sandboxing; "
351-
"duckdb will try to automatically install and load read_parquet"
352-
),
353-
raises=(duckdb.Error, duckdb.IOException),
354-
)
355-
@pytest.mark.skipif(
356-
SANDBOXED and MACOS and ARM64, reason="raises a RuntimeError on nix macos arm64"
357-
)
358-
def test_s3_403_fallback(con, httpserver, monkeypatch):
359-
# monkeypatch to avoid downloading extensions in tests
360-
monkeypatch.setattr(con, "_load_extensions", lambda _: True)
361-
362-
# Throw a 403 to trigger fallback to pyarrow.dataset
363-
path = "/invalid.parquet"
364-
httpserver.expect_request(path).respond_with_data(
365-
status=403, content_type="application/vnd.apache.parquet"
366-
)
367-
368-
# Since the URI is nonsense to pyarrow, expect an error, but raises from
369-
# pyarrow, which indicates the fallback worked
370-
url = httpserver.url_for(path)
371-
with pytest.raises(pa.lib.ArrowInvalid):
372-
con.read_parquet(url)
373-
374-
375347
def test_register_numpy_str(con):
376348
data = pd.DataFrame({"a": [np.str_("xyz"), None]})
377349
result = ibis.memtable(data)

pyproject.toml

-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,6 @@ tests = [
247247
"pytest-deadfixtures>=2.2.1,<3",
248248
"pytest-clarity>=1.0.1,<2",
249249
"pytest-cov>=5,<7",
250-
"pytest-httpserver>=1.0.5,<2",
251250
"pytest-mock>=3.6.1,<4",
252251
"pytest-randomly>=3.10.1,<4",
253252
"pytest-repeat>=0.9.1,<0.10",

requirements-dev.txt

-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

uv.lock

-26
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)