
Commit 10373f4

fix(deps): make sure pyarrow is not an implicit dependency
1 parent 2831559 commit 10373f4
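
The change makes pyarrow a lazy dependency: imports move behind `typing.TYPE_CHECKING` blocks, into the functions that need them, or behind `pytest.importorskip`, so importing ibis or collecting its tests no longer requires pyarrow. A minimal sketch of that deferred-import pattern (the helper name and body are illustrative, not code from this commit):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed by the type checker; never imported at runtime.
    import pyarrow as pa


def rows_to_table(rows: dict) -> pa.Table:
    # Import inside the function so pyarrow is only required when this
    # code path actually runs.
    import pyarrow as pa

    return pa.table(rows)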

14 files changed: +104 -142 lines changed

ci/schema/clickhouse.sql

Lines changed: 12 additions & 46 deletions
@@ -1,55 +1,19 @@
-CREATE OR REPLACE TABLE diamonds (
-    carat Nullable(Float64),
-    cut Nullable(String),
-    color Nullable(String),
-    clarity Nullable(String),
-    depth Nullable(Float64),
-    `table` Nullable(Float64),
-    price Nullable(Int64),
-    x Nullable(Float64),
-    y Nullable(Float64),
-    z Nullable(Float64)
-) ENGINE = Memory;
+-- NB: The paths in this file are all relative to /var/lib/clickhouse/user_files

-CREATE OR REPLACE TABLE batting (
-    `playerID` Nullable(String),
-    `yearID` Nullable(Int64),
-    stint Nullable(Int64),
-    `teamID` Nullable(String),
-    `lgID` Nullable(String),
-    `G` Nullable(Int64),
-    `AB` Nullable(Int64),
-    `R` Nullable(Int64),
-    `H` Nullable(Int64),
-    `X2B` Nullable(Int64),
-    `X3B` Nullable(Int64),
-    `HR` Nullable(Int64),
-    `RBI` Nullable(Int64),
-    `SB` Nullable(Int64),
-    `CS` Nullable(Int64),
-    `BB` Nullable(Int64),
-    `SO` Nullable(Int64),
-    `IBB` Nullable(Int64),
-    `HBP` Nullable(Int64),
-    `SH` Nullable(Int64),
-    `SF` Nullable(Int64),
-    `GIDP` Nullable(Int64)
-) ENGINE = Memory;
+CREATE OR REPLACE TABLE diamonds ENGINE = Memory AS
+SELECT * FROM file('parquet/diamonds/diamonds.parquet', 'Parquet');

-CREATE OR REPLACE TABLE awards_players (
-    `playerID` Nullable(String),
-    `awardID` Nullable(String),
-    `yearID` Nullable(Int64),
-    `lgID` Nullable(String),
-    tie Nullable(String),
-    notes Nullable(String)
-) ENGINE = Memory;
+CREATE OR REPLACE TABLE batting ENGINE = Memory AS
+SELECT * FROM file('parquet/batting/batting.parquet', 'Parquet');
+
+CREATE OR REPLACE TABLE awards_players ENGINE = Memory AS
+SELECT * FROM file('parquet/awards_players/awards_players.parquet', 'Parquet');

 CREATE OR REPLACE TABLE functional_alltypes (
     `index` Nullable(Int64),
     `Unnamed: 0` Nullable(Int64),
     id Nullable(Int32),
-    bool_col Nullable(UInt8),
+    bool_col Nullable(Bool),
     tinyint_col Nullable(Int8),
     smallint_col Nullable(Int16),
     int_col Nullable(Int32),
@@ -58,10 +22,12 @@ CREATE OR REPLACE TABLE functional_alltypes (
     double_col Nullable(Float64),
     date_string_col Nullable(String),
     string_col Nullable(String),
+    -- TODO: clean this up when timestamp scale is supported
     timestamp_col Nullable(DateTime),
     year Nullable(Int32),
     month Nullable(Int32)
-) ENGINE = Memory;
+) ENGINE = Memory AS
+SELECT * FROM file('functional_alltypes.csv', 'CSVWithNames');

 CREATE OR REPLACE TABLE tzone (
     ts Nullable(DateTime),
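
The schema file now populates each table directly with ClickHouse's `file()` table function rather than leaving empty tables for the Python test fixture to fill. A sketch of how such a schema can be executed statement by statement, mirroring the conftest change further down (host, port, and the schema path are assumptions):

from pathlib import Path

from clickhouse_driver import Client

client = Client(host="localhost", port=9000)
schema = Path("ci/schema/clickhouse.sql").read_text()

# clickhouse_driver does not accept multi-statement strings, so run each
# non-empty statement separately.
for stmt in filter(None, map(str.strip, schema.split(";"))):
    client.execute(stmt)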

docker-compose.yml

Lines changed: 4 additions & 1 deletion
@@ -1,7 +1,10 @@
 version: "3.4"
 services:
   clickhouse:
-    image: clickhouse/clickhouse-server:22.12.2.25-alpine
+    build:
+      context: .
+      dockerfile: ./docker/clickhouse/Dockerfile
+    image: ibis-clickhouse
     ports:
       - 8123:8123
       - 9000:9000

docker/clickhouse/Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+FROM clickhouse/clickhouse-server:22.12.2.25-alpine
+COPY ./ci/ibis-testing-data /var/lib/clickhouse/user_files
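
The image bakes the testing data into /var/lib/clickhouse/user_files, which is where `file()` resolves the relative paths used in the schema above. A quick sanity check against a running container (connection details and the sample path are assumptions):

from clickhouse_driver import Client

client = Client(host="localhost", port=9000)

# Count rows read straight from a parquet file baked into the image.
[(count,)] = client.execute(
    "SELECT count() FROM file('parquet/diamonds/diamonds.parquet', 'Parquet')"
)
print(count)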

ibis/backends/clickhouse/tests/conftest.py

Lines changed: 0 additions & 5 deletions
@@ -8,7 +8,6 @@

 import ibis
 import ibis.expr.types as ir
-from ibis.backends.conftest import TEST_TABLES, read_tables
 from ibis.backends.tests.base import BackendTest, RoundHalfToEven, UnorderedComparator

 CLICKHOUSE_HOST = os.environ.get('IBIS_TEST_CLICKHOUSE_HOST', 'localhost')
@@ -67,10 +66,6 @@ def _load_data(
         for stmt in filter(None, map(str.strip, schema.read().split(";"))):
             client.execute(stmt)

-        for table, df in read_tables(TEST_TABLES, data_dir):
-            query = f"INSERT INTO {table} VALUES"
-            client.insert_dataframe(query, df.to_pandas(), settings={"use_numpy": True})
-
     @staticmethod
     def connect(data_directory: Path):
         pytest.importorskip("clickhouse_driver")

ibis/backends/conftest.py

Lines changed: 2 additions & 29 deletions
@@ -6,19 +6,15 @@
 import platform
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, TextIO
+from typing import Any, TextIO

 import _pytest
 import pandas as pd
+import pytest
 import sqlalchemy as sa
 from packaging.requirements import Requirement
 from packaging.version import parse as vparse

-if TYPE_CHECKING:
-    import pyarrow as pa
-
-import pytest
-
 import ibis
 from ibis import util
 from ibis.backends.base import _get_backend_names
@@ -190,29 +186,6 @@ def init_database(
     return engine


-def read_tables(
-    names: Iterable[str],
-    data_dir: Path,
-) -> Iterator[tuple[str, pa.Table]]:
-    """For each csv {names} in {data_dir} return a pyarrow.Table."""
-
-    import pyarrow.csv as pac
-
-    import ibis.backends.pyarrow.datatypes as pa_dt
-
-    for name in names:
-        schema = TEST_TABLES[name]
-        convert_options = pac.ConvertOptions(
-            column_types={
-                name: pa_dt.to_pyarrow_type(type) for name, type in schema.items()
-            }
-        )
-        yield name, pac.read_csv(
-            data_dir / f'{name}.csv',
-            convert_options=convert_options,
-        )
-
-
 def _random_identifier(suffix: str) -> str:
     return f"__ibis_test_{suffix}_{util.guid()}"

ibis/backends/duckdb/__init__.py

Lines changed: 11 additions & 33 deletions
@@ -9,8 +9,6 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Iterable, Iterator, Mapping, MutableMapping

-import pyarrow as pa
-import pyarrow.types as pat
 import sqlalchemy as sa
 import toolz

@@ -21,6 +19,7 @@
 if TYPE_CHECKING:
     import duckdb
     import pandas as pd
+    import pyarrow as pa

     import ibis.expr.schema as sch
     import ibis.expr.types as ir
@@ -394,20 +393,20 @@ def to_pyarrow_batches(
         limit: int | str | None = None,
         chunk_size: int = 1_000_000,
         **kwargs: Any,
-    ) -> IbisRecordBatchReader:
+    ) -> pa.ipc.RecordBatchReader:
+        # TODO: duckdb seems to not care about the `chunk_size` argument
+        # and returns batches in 1024 row chunks
         _ = self._import_pyarrow()
+
+        from ibis.backends.duckdb.pyarrow import IbisRecordBatchReader
+
         query_ast = self.compiler.to_ast_ensure_limit(expr, limit, params=params)
         sql = query_ast.compile()

         cursor = self.raw_sql(sql)

-        _reader = cursor.cursor.fetch_record_batch(chunk_size=chunk_size)
-        # Horrible hack to make sure cursor isn't garbage collected
-        # before batches are streamed out of the RecordBatchReader
-        batches = IbisRecordBatchReader(_reader, cursor)
-        return batches
-        # TODO: duckdb seems to not care about the `chunk_size` argument
-        # and returns batches in 1024 row chunks
+        reader = cursor.cursor.fetch_record_batch(chunk_size=chunk_size)
+        return IbisRecordBatchReader(reader, cursor)

     def to_pyarrow(
         self,
@@ -417,7 +416,7 @@ def to_pyarrow(
         limit: int | str | None = None,
         **kwargs: Any,
     ) -> pa.Table:
-        _ = self._import_pyarrow()
+        pa = self._import_pyarrow()
         query_ast = self.compiler.to_ast_ensure_limit(expr, limit, params=params)
         sql = query_ast.compile()

@@ -444,6 +443,7 @@ def fetch_from_cursor(
         schema: sch.Schema,
     ):
         import pandas as pd
+        import pyarrow.types as pat

         table = cursor.cursor.fetch_arrow_table()

@@ -525,25 +525,3 @@ def _get_temp_view_definition(
         definition: sa.sql.compiler.Compiled,
     ) -> str:
         return f"CREATE OR REPLACE TEMPORARY VIEW {name} AS {definition}"
-
-
-class IbisRecordBatchReader(pa.ipc.RecordBatchReader):
-    def __init__(self, reader, cursor):
-        self.reader = reader
-        self.cursor = cursor
-
-    def close(self):
-        self.reader.close()
-        del self.cursor
-
-    def read_all(self):
-        return self.reader.read_all()
-
-    def read_next_batch(self):
-        return self.reader.read_next_batch()
-
-    def read_pandas(self):
-        return self.reader.read_pandas()
-
-    def schema(self):
-        return self.reader.schema

ibis/backends/duckdb/pyarrow.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import pyarrow as pa
+
+
+class IbisRecordBatchReader(pa.ipc.RecordBatchReader):
+    """Hack to make sure the database cursor isn't garbage collected.
+
+    Without this hack batches are streamed out of the RecordBatchReader on a
+    closed cursor.
+    """
+
+    def __init__(self, reader, cursor):
+        self.reader = reader
+        self.cursor = cursor
+
+    def close(self):
+        self.reader.close()
+        del self.cursor
+
+    def read_all(self):
+        return self.reader.read_all()
+
+    def read_next_batch(self):
+        return self.reader.read_next_batch()
+
+    def read_pandas(self):
+        return self.reader.read_pandas()
+
+    @property
+    def schema(self):
+        return self.reader.schema
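
A usage sketch of the relocated wrapper: `to_pyarrow_batches` hands the DuckDB cursor to `IbisRecordBatchReader` so the cursor stays alive until the batches are consumed. The table created below is a placeholder, not part of the test suite:

import ibis

con = ibis.duckdb.connect()  # in-memory database
con.raw_sql("CREATE TABLE t AS SELECT range AS x FROM range(10000)")

reader = con.to_pyarrow_batches(con.table("t"), chunk_size=1_000_000)

# The wrapper holds a reference to the cursor, so consuming the batches is
# safe even though nothing else keeps the cursor alive.
table = reader.read_all()
print(table.num_rows)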

ibis/backends/impala/tests/conftest.py

Lines changed: 1 addition & 6 deletions
@@ -152,12 +152,7 @@ def connect(
         )

     def _get_original_column_names(self, tablename: str) -> list[str]:
-        import pyarrow.parquet as pq
-
-        pq_file = pq.ParquetFile(
-            self.data_directory / "parquet" / tablename / f"{tablename}.parquet"
-        )
-        return pq_file.schema.names
+        return list(TEST_TABLES[tablename].names)

     def _get_renamed_table(self, tablename: str) -> ir.Table:
         t = self.connection.table(tablename)

ibis/backends/pandas/tests/test_datatypes.py

Lines changed: 9 additions & 10 deletions
@@ -3,11 +3,9 @@

 import numpy as np
 import pandas as pd
-import pyarrow as pa
 import pytest
 from packaging.version import parse as vparse
 from pandas.api.types import CategoricalDtype, DatetimeTZDtype
-from pytest import param

 import ibis
 import ibis.expr.datatypes as dt
@@ -113,20 +111,19 @@ def test_infer_np_array(value, expected_dtypes):
         (np.double, dt.double),
         (np.str_, dt.string),
         (np.datetime64, dt.timestamp),
-        param(
-            np.timedelta64,
-            dt.interval,
-            marks=pytest.mark.skipif(
-                vparse(pa.__version__) < vparse("9"),
-                reason="pyarrow < 9 globally mutates the timedelta64 numpy dtype",
-            ),
-        ),
     ],
 )
 def test_numpy_dtype(numpy_dtype, ibis_dtype):
     assert dt.dtype(np.dtype(numpy_dtype)) == ibis_dtype


+def test_numpy_dtype_timedelta():
+    if vparse(pytest.importorskip("pyarrow").__version__) < vparse("9"):
+        pytest.skip("pyarrow < 9 globally mutates the timedelta64 numpy dtype")
+
+    assert dt.dtype(np.dtype(np.timedelta64)) == dt.interval
+
+
 @pytest.mark.parametrize(
     ('pandas_dtype', 'ibis_dtype'),
     [
@@ -224,5 +221,7 @@ def test_schema_infer(col_data, schema_type):


 def test_pyarrow_string():
+    pytest.importorskip("pyarrow")
+
     s = pd.Series([], dtype="string[pyarrow]")
     assert dt.dtype(s.dtype) == dt.String()

ibis/backends/tests/test_export.py

Lines changed: 3 additions & 2 deletions
@@ -1,11 +1,12 @@
 import sys

-import pyarrow as pa
 import pytest
 from pytest import param

+pa = pytest.importorskip("pyarrow")
+
 # Adds `to_pyarrow` to created schema objects
-from ibis.backends.pyarrow.datatypes import sch  # noqa: F401
+from ibis.backends.pyarrow.datatypes import sch as _  # noqa: F401, E402


 class PackageDiscarder:
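
The module-level `pytest.importorskip` makes pytest skip the whole file at collection time when pyarrow is missing instead of erroring on the import. The same pattern in an isolated, illustrative test module:

import pytest

# Skips every test in this module when pyarrow is not installed.
pa = pytest.importorskip("pyarrow")


def test_table_roundtrip():
    t = pa.table({"x": [1, 2, 3]})
    assert t.num_rows == 3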
