Skip to content

Commit 5fa0103

Browse files
committed
refactor(duckdb): remove the pyarrow read_parquet fallback
1 parent b15f0c3 commit 5fa0103

File tree

8 files changed

+16
-101
lines changed

8 files changed

+16
-101
lines changed

conda/environment-arm64-flink.yml

-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ dependencies:
5959
- pytest-benchmark >=3.4.1,<5
6060
- pytest-clarity >=1.0.1,<2
6161
- pytest-cov >=3.0.0,<5
62-
- pytest-httpserver >=1.0.5,<2
6362
- pytest-mock >=3.6.1,<4
6463
- pytest-randomly >=3.10.1,<4
6564
- pytest-repeat >=0.9.1,<0.10

conda/environment-arm64.yml

-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ dependencies:
5858
- pytest-benchmark >=3.4.1,<5
5959
- pytest-clarity >=1.0.1,<2
6060
- pytest-cov >=3.0.0,<5
61-
- pytest-httpserver >=1.0.5,<2
6261
- pytest-mock >=3.6.1,<4
6362
- pytest-randomly >=3.10.1,<4
6463
- pytest-repeat >=0.9.1,<0.10

conda/environment.yml

-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ dependencies:
6161
- pytest-benchmark >=3.4.1,<5
6262
- pytest-clarity >=1.0.1,<2
6363
- pytest-cov >=3.0.0,<5
64-
- pytest-httpserver >=1.0.5,<2
6564
- pytest-mock >=3.6.1,<4
6665
- pytest-randomly >=3.10.1,<4
6766
- pytest-repeat >=0.9.1,<0.10

ibis/backends/duckdb/__init__.py

+15-40
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
from typing import TYPE_CHECKING, Any, Literal
1212

1313
import duckdb
14-
import pyarrow as pa
15-
import pyarrow_hotfix # noqa: F401
1614
import sqlglot as sg
1715
import sqlglot.expressions as sge
1816
from packaging.version import parse as vparse
@@ -26,7 +24,6 @@
2624
import ibis.expr.types as ir
2725
from ibis import util
2826
from ibis.backends import CanCreateDatabase, UrlFromPath
29-
from ibis.backends.duckdb.converter import DuckDBPandasData, DuckDBPyArrowData
3027
from ibis.backends.sql import SQLBackend
3128
from ibis.backends.sql.compilers.base import STAR, AlterTable, C, RenameTable
3229
from ibis.common.dispatch import lazy_singledispatch
@@ -37,6 +34,8 @@
3734

3835
import pandas as pd
3936
import polars as pl
37+
import pyarrow as pa
38+
import pyarrow_hotfix # noqa: F401
4039
import torch
4140
from fsspec import AbstractFileSystem
4241

@@ -783,48 +782,17 @@ def read_parquet(
783782

784783
table_name = table_name or util.gen_name("read_parquet")
785784

786-
# Default to using the native duckdb parquet reader
787-
# If that fails because of auth issues, fall back to ingesting via
788-
# pyarrow dataset
789-
try:
790-
self._read_parquet_duckdb_native(paths, table_name, **kwargs)
791-
except duckdb.IOException:
792-
self._read_parquet_pyarrow_dataset(paths, table_name, **kwargs)
793-
794-
return self.table(table_name)
795-
796-
def _read_parquet_duckdb_native(
797-
self, source_list: str | Iterable[str], table_name: str, **kwargs: Any
798-
) -> None:
799-
if any(
800-
source.startswith(("http://", "https://", "s3://"))
801-
for source in source_list
802-
):
785+
if any(path.startswith(("http://", "https://", "s3://")) for path in paths):
803786
self._load_extensions(["httpfs"])
804787

805788
options = [
806789
sg.to_identifier(key).eq(sge.convert(val)) for key, val in kwargs.items()
807790
]
808791
self._create_temp_view(
809792
table_name,
810-
sg.select(STAR).from_(self.compiler.f.read_parquet(source_list, *options)),
793+
sg.select(STAR).from_(self.compiler.f.read_parquet(paths, *options)),
811794
)
812-
813-
def _read_parquet_pyarrow_dataset(
814-
self, source_list: str | Iterable[str], table_name: str, **kwargs: Any
815-
) -> None:
816-
import pyarrow.dataset as ds
817-
818-
dataset = ds.dataset(list(map(ds.dataset, source_list)), **kwargs)
819-
self._load_extensions(["httpfs"])
820-
# We don't create a view since DuckDB special cases Arrow Datasets
821-
# so if we also create a view we end up with both a "lazy table"
822-
# and a view with the same name
823-
self.con.register(table_name, dataset)
824-
# DuckDB normally auto-detects Arrow Datasets that are defined
825-
# in local variables but the `dataset` variable won't be local
826-
# by the time we execute against this so we register it
827-
# explicitly.
795+
return self.table(table_name)
828796

829797
def read_delta(
830798
self, path: str | Path, /, *, table_name: str | None = None, **kwargs: Any
@@ -1288,6 +1256,9 @@ def to_pyarrow_batches(
12881256
chunk_size
12891257
The number of rows to fetch per batch
12901258
"""
1259+
import pyarrow as pa
1260+
import pyarrow_hotfix # noqa: F401
1261+
12911262
self._run_pre_execute_hooks(expr)
12921263
table = expr.as_table()
12931264
sql = self.compile(table, limit=limit, params=params)
@@ -1309,6 +1280,8 @@ def to_pyarrow(
13091280
limit: int | str | None = None,
13101281
**kwargs: Any,
13111282
) -> pa.Table:
1283+
from ibis.backends.duckdb.converter import DuckDBPyArrowData
1284+
13121285
table = self._to_duckdb_relation(
13131286
expr, params=params, limit=limit, **kwargs
13141287
).arrow()
@@ -1326,10 +1299,12 @@ def execute(
13261299
"""Execute an expression."""
13271300
import pandas as pd
13281301
import pyarrow.types as pat
1302+
import pyarrow_hotfix # noqa: F401
13291303

1330-
table = self._to_duckdb_relation(
1331-
expr, params=params, limit=limit, **kwargs
1332-
).arrow()
1304+
from ibis.backends.duckdb.converter import DuckDBPandasData
1305+
1306+
rel = self._to_duckdb_relation(expr, params=params, limit=limit, **kwargs)
1307+
table = rel.arrow()
13331308

13341309
df = pd.DataFrame(
13351310
{

ibis/backends/duckdb/tests/test_io.py

+1-29
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import ibis
1616
import ibis.expr.datatypes as dt
17-
from ibis.conftest import ARM64, LINUX, MACOS, SANDBOXED
17+
from ibis.conftest import LINUX, SANDBOXED
1818
from ibis.util import gen_name
1919

2020

@@ -344,34 +344,6 @@ def test_temp_dir_set(tmp_path, database):
344344
assert con.settings["temp_directory"] == str(temp_directory)
345345

346346

347-
@pytest.mark.xfail(
348-
SANDBOXED and LINUX,
349-
reason=(
350-
"nix on linux cannot download duckdb extensions or data due to sandboxing; "
351-
"duckdb will try to automatically install and load read_parquet"
352-
),
353-
raises=(duckdb.Error, duckdb.IOException),
354-
)
355-
@pytest.mark.skipif(
356-
SANDBOXED and MACOS and ARM64, reason="raises a RuntimeError on nix macos arm64"
357-
)
358-
def test_s3_403_fallback(con, httpserver, monkeypatch):
359-
# monkeypatch to avoid downloading extensions in tests
360-
monkeypatch.setattr(con, "_load_extensions", lambda _: True)
361-
362-
# Throw a 403 to trigger fallback to pyarrow.dataset
363-
path = "/invalid.parquet"
364-
httpserver.expect_request(path).respond_with_data(
365-
status=403, content_type="application/vnd.apache.parquet"
366-
)
367-
368-
# Since the URI is nonsense to pyarrow, expect an error, but raises from
369-
# pyarrow, which indicates the fallback worked
370-
url = httpserver.url_for(path)
371-
with pytest.raises(pa.lib.ArrowInvalid):
372-
con.read_parquet(url)
373-
374-
375347
def test_register_numpy_str(con):
376348
data = pd.DataFrame({"a": [np.str_("xyz"), None]})
377349
result = ibis.memtable(data)

pyproject.toml

-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,6 @@ tests = [
247247
"pytest-deadfixtures>=2.2.1,<3",
248248
"pytest-clarity>=1.0.1,<2",
249249
"pytest-cov>=5,<7",
250-
"pytest-httpserver>=1.0.5,<2",
251250
"pytest-mock>=3.6.1,<4",
252251
"pytest-randomly>=3.10.1,<4",
253252
"pytest-repeat>=0.9.1,<0.10",

requirements-dev.txt

-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

uv.lock

-26
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments (0)