
Commit 10373f4

fix(deps): make sure pyarrow is not an implicit dependency
1 parent 2831559 commit 10373f4
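
The change makes pyarrow a lazy dependency: imports move behind `typing.TYPE_CHECKING` blocks, into the functions that need them, or behind `pytest.importorskip`, so importing ibis or collecting its tests no longer requires pyarrow. A minimal sketch of that deferred-import pattern (the helper name and body are illustrative, not code from this commit):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only needed by the type checker; never imported at runtime.
    import pyarrow as pa


def rows_to_table(rows: dict) -> pa.Table:
    # Import inside the function so pyarrow is only required when this
    # code path actually runs.
    import pyarrow as pa

    return pa.table(rows)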

14 files changed: +104 -142 lines changed

ci/schema/clickhouse.sql

Lines changed: 12 additions & 46 deletions
@@ -1,55 +1,19 @@
-CREATE OR REPLACE TABLE diamonds (
-    carat Nullable(Float64),
-    cut Nullable(String),
-    color Nullable(String),
-    clarity Nullable(String),
-    depth Nullable(Float64),
-    `table` Nullable(Float64),
-    price Nullable(Int64),
-    x Nullable(Float64),
-    y Nullable(Float64),
-    z Nullable(Float64)
-) ENGINE = Memory;
+-- NB: The paths in this file are all relative to /var/lib/clickhouse/user_files

-CREATE OR REPLACE TABLE batting (
-    `playerID` Nullable(String),
-    `yearID` Nullable(Int64),
-    stint Nullable(Int64),
-    `teamID` Nullable(String),
-    `lgID` Nullable(String),
-    `G` Nullable(Int64),
-    `AB` Nullable(Int64),
-    `R` Nullable(Int64),
-    `H` Nullable(Int64),
-    `X2B` Nullable(Int64),
-    `X3B` Nullable(Int64),
-    `HR` Nullable(Int64),
-    `RBI` Nullable(Int64),
-    `SB` Nullable(Int64),
-    `CS` Nullable(Int64),
-    `BB` Nullable(Int64),
-    `SO` Nullable(Int64),
-    `IBB` Nullable(Int64),
-    `HBP` Nullable(Int64),
-    `SH` Nullable(Int64),
-    `SF` Nullable(Int64),
-    `GIDP` Nullable(Int64)
-) ENGINE = Memory;
+CREATE OR REPLACE TABLE diamonds ENGINE = Memory AS
+SELECT * FROM file('parquet/diamonds/diamonds.parquet', 'Parquet');

-CREATE OR REPLACE TABLE awards_players (
-    `playerID` Nullable(String),
-    `awardID` Nullable(String),
-    `yearID` Nullable(Int64),
-    `lgID` Nullable(String),
-    tie Nullable(String),
-    notes Nullable(String)
-) ENGINE = Memory;
+CREATE OR REPLACE TABLE batting ENGINE = Memory AS
+SELECT * FROM file('parquet/batting/batting.parquet', 'Parquet');
+
+CREATE OR REPLACE TABLE awards_players ENGINE = Memory AS
+SELECT * FROM file('parquet/awards_players/awards_players.parquet', 'Parquet');

 CREATE OR REPLACE TABLE functional_alltypes (
     `index` Nullable(Int64),
     `Unnamed: 0` Nullable(Int64),
     id Nullable(Int32),
-    bool_col Nullable(UInt8),
+    bool_col Nullable(Bool),
     tinyint_col Nullable(Int8),
     smallint_col Nullable(Int16),
     int_col Nullable(Int32),
@@ -58,10 +22,12 @@ CREATE OR REPLACE TABLE functional_alltypes (
     double_col Nullable(Float64),
     date_string_col Nullable(String),
     string_col Nullable(String),
+    -- TODO: clean this up when timestamp scale is supported
     timestamp_col Nullable(DateTime),
     year Nullable(Int32),
     month Nullable(Int32)
-) ENGINE = Memory;
+) ENGINE = Memory AS
+SELECT * FROM file('functional_alltypes.csv', 'CSVWithNames');

 CREATE OR REPLACE TABLE tzone (
     ts Nullable(DateTime),
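
The schema file now populates each table directly with ClickHouse's `file()` table function rather than leaving empty tables for the Python test fixture to fill. A sketch of how such a schema can be executed statement by statement, mirroring the conftest change further down (host, port, and the schema path are assumptions):

from pathlib import Path

from clickhouse_driver import Client

client = Client(host="localhost", port=9000)
schema = Path("ci/schema/clickhouse.sql").read_text()

# clickhouse_driver does not accept multi-statement strings, so run each
# non-empty statement separately.
for stmt in filter(None, map(str.strip, schema.split(";"))):
    client.execute(stmt)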

docker-compose.yml

Lines changed: 4 additions & 1 deletion
@@ -1,7 +1,10 @@
 version: "3.4"
 services:
   clickhouse:
-    image: clickhouse/clickhouse-server:22.12.2.25-alpine
+    build:
+      context: .
+      dockerfile: ./docker/clickhouse/Dockerfile
+    image: ibis-clickhouse
     ports:
       - 8123:8123
       - 9000:9000

docker/clickhouse/Dockerfile

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+FROM clickhouse/clickhouse-server:22.12.2.25-alpine
+COPY ./ci/ibis-testing-data /var/lib/clickhouse/user_files
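
The image bakes the testing data into /var/lib/clickhouse/user_files, which is where `file()` resolves the relative paths used in the schema above. A quick sanity check against a running container (connection details and the sample path are assumptions):

from clickhouse_driver import Client

client = Client(host="localhost", port=9000)

# Count rows read straight from a parquet file baked into the image.
[(count,)] = client.execute(
    "SELECT count() FROM file('parquet/diamonds/diamonds.parquet', 'Parquet')"
)
print(count)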

ibis/backends/clickhouse/tests/conftest.py

Lines changed: 0 additions & 5 deletions
@@ -8,7 +8,6 @@

 import ibis
 import ibis.expr.types as ir
-from ibis.backends.conftest import TEST_TABLES, read_tables
 from ibis.backends.tests.base import BackendTest, RoundHalfToEven, UnorderedComparator

 CLICKHOUSE_HOST = os.environ.get('IBIS_TEST_CLICKHOUSE_HOST', 'localhost')
@@ -67,10 +66,6 @@ def _load_data(
         for stmt in filter(None, map(str.strip, schema.read().split(";"))):
             client.execute(stmt)

-        for table, df in read_tables(TEST_TABLES, data_dir):
-            query = f"INSERT INTO {table} VALUES"
-            client.insert_dataframe(query, df.to_pandas(), settings={"use_numpy": True})
-
     @staticmethod
     def connect(data_directory: Path):
         pytest.importorskip("clickhouse_driver")

ibis/backends/conftest.py

Lines changed: 2 additions & 29 deletions
@@ -6,19 +6,15 @@
 import platform
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, TextIO
+from typing import Any, TextIO

 import _pytest
 import pandas as pd
+import pytest
 import sqlalchemy as sa
 from packaging.requirements import Requirement
 from packaging.version import parse as vparse

-if TYPE_CHECKING:
-    import pyarrow as pa
-
-import pytest
-
 import ibis
 from ibis import util
 from ibis.backends.base import _get_backend_names
@@ -190,29 +186,6 @@ def init_database(
     return engine


-def read_tables(
-    names: Iterable[str],
-    data_dir: Path,
-) -> Iterator[tuple[str, pa.Table]]:
-    """For each csv {names} in {data_dir} return a pyarrow.Table."""
-
-    import pyarrow.csv as pac
-
-    import ibis.backends.pyarrow.datatypes as pa_dt
-
-    for name in names:
-        schema = TEST_TABLES[name]
-        convert_options = pac.ConvertOptions(
-            column_types={
-                name: pa_dt.to_pyarrow_type(type) for name, type in schema.items()
-            }
-        )
-        yield name, pac.read_csv(
-            data_dir / f'{name}.csv',
-            convert_options=convert_options,
-        )
-
-
 def _random_identifier(suffix: str) -> str:
     return f"__ibis_test_{suffix}_{util.guid()}"

ibis/backends/duckdb/__init__.py

Lines changed: 11 additions & 33 deletions
@@ -9,8 +9,6 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Iterable, Iterator, Mapping, MutableMapping

-import pyarrow as pa
-import pyarrow.types as pat
 import sqlalchemy as sa
 import toolz

@@ -21,6 +19,7 @@
 if TYPE_CHECKING:
     import duckdb
     import pandas as pd
+    import pyarrow as pa

     import ibis.expr.schema as sch
     import ibis.expr.types as ir
@@ -394,20 +393,20 @@ def to_pyarrow_batches(
         limit: int | str | None = None,
         chunk_size: int = 1_000_000,
         **kwargs: Any,
-    ) -> IbisRecordBatchReader:
+    ) -> pa.ipc.RecordBatchReader:
+        # TODO: duckdb seems to not care about the `chunk_size` argument
+        # and returns batches in 1024 row chunks
         _ = self._import_pyarrow()
+
+        from ibis.backends.duckdb.pyarrow import IbisRecordBatchReader
+
         query_ast = self.compiler.to_ast_ensure_limit(expr, limit, params=params)
         sql = query_ast.compile()

         cursor = self.raw_sql(sql)

-        _reader = cursor.cursor.fetch_record_batch(chunk_size=chunk_size)
-        # Horrible hack to make sure cursor isn't garbage collected
-        # before batches are streamed out of the RecordBatchReader
-        batches = IbisRecordBatchReader(_reader, cursor)
-        return batches
-        # TODO: duckdb seems to not care about the `chunk_size` argument
-        # and returns batches in 1024 row chunks
+        reader = cursor.cursor.fetch_record_batch(chunk_size=chunk_size)
+        return IbisRecordBatchReader(reader, cursor)

     def to_pyarrow(
         self,
@@ -417,7 +416,7 @@ def to_pyarrow(
         limit: int | str | None = None,
         **kwargs: Any,
     ) -> pa.Table:
-        _ = self._import_pyarrow()
+        pa = self._import_pyarrow()
         query_ast = self.compiler.to_ast_ensure_limit(expr, limit, params=params)
         sql = query_ast.compile()

@@ -444,6 +443,7 @@ def fetch_from_cursor(
         schema: sch.Schema,
     ):
         import pandas as pd
+        import pyarrow.types as pat

         table = cursor.cursor.fetch_arrow_table()

@@ -525,25 +525,3 @@ def _get_temp_view_definition(
         definition: sa.sql.compiler.Compiled,
     ) -> str:
         return f"CREATE OR REPLACE TEMPORARY VIEW {name} AS {definition}"
-
-
-class IbisRecordBatchReader(pa.ipc.RecordBatchReader):
-    def __init__(self, reader, cursor):
-        self.reader = reader
-        self.cursor = cursor
-
-    def close(self):
-        self.reader.close()
-        del self.cursor
-
-    def read_all(self):
-        return self.reader.read_all()
-
-    def read_next_batch(self):
-        return self.reader.read_next_batch()
-
-    def read_pandas(self):
-        return self.reader.read_pandas()
-
-    def schema(self):
-        return self.reader.schema

ibis/backends/duckdb/pyarrow.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import pyarrow as pa
+
+
+class IbisRecordBatchReader(pa.ipc.RecordBatchReader):
+    """Hack to make sure the database cursor isn't garbage collected.
+
+    Without this hack batches are streamed out of the RecordBatchReader on a
+    closed cursor.
+    """
+
+    def __init__(self, reader, cursor):
+        self.reader = reader
+        self.cursor = cursor
+
+    def close(self):
+        self.reader.close()
+        del self.cursor
+
+    def read_all(self):
+        return self.reader.read_all()
+
+    def read_next_batch(self):
+        return self.reader.read_next_batch()
+
+    def read_pandas(self):
+        return self.reader.read_pandas()
+
+    @property
+    def schema(self):
+        return self.reader.schema
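
A usage sketch of the relocated wrapper: `to_pyarrow_batches` hands the DuckDB cursor to `IbisRecordBatchReader` so the cursor stays alive until the batches are consumed. The table created below is a placeholder, not part of the test suite:

import ibis

con = ibis.duckdb.connect()  # in-memory database
con.raw_sql("CREATE TABLE t AS SELECT range AS x FROM range(10000)")

reader = con.to_pyarrow_batches(con.table("t"), chunk_size=1_000_000)

# The wrapper holds a reference to the cursor, so consuming the batches is
# safe even though nothing else keeps the cursor alive.
table = reader.read_all()
print(table.num_rows)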

ibis/backends/impala/tests/conftest.py

Lines changed: 1 addition & 6 deletions
@@ -152,12 +152,7 @@ def connect(
         )

     def _get_original_column_names(self, tablename: str) -> list[str]:
-        import pyarrow.parquet as pq
-
-        pq_file = pq.ParquetFile(
-            self.data_directory / "parquet" / tablename / f"{tablename}.parquet"
-        )
-        return pq_file.schema.names
+        return list(TEST_TABLES[tablename].names)

     def _get_renamed_table(self, tablename: str) -> ir.Table:
         t = self.connection.table(tablename)

ibis/backends/pandas/tests/test_datatypes.py

Lines changed: 9 additions & 10 deletions
@@ -3,11 +3,9 @@

 import numpy as np
 import pandas as pd
-import pyarrow as pa
 import pytest
 from packaging.version import parse as vparse
 from pandas.api.types import CategoricalDtype, DatetimeTZDtype
-from pytest import param

 import ibis
 import ibis.expr.datatypes as dt
@@ -113,20 +111,19 @@ def test_infer_np_array(value, expected_dtypes):
         (np.double, dt.double),
         (np.str_, dt.string),
         (np.datetime64, dt.timestamp),
-        param(
-            np.timedelta64,
-            dt.interval,
-            marks=pytest.mark.skipif(
-                vparse(pa.__version__) < vparse("9"),
-                reason="pyarrow < 9 globally mutates the timedelta64 numpy dtype",
-            ),
-        ),
     ],
 )
 def test_numpy_dtype(numpy_dtype, ibis_dtype):
     assert dt.dtype(np.dtype(numpy_dtype)) == ibis_dtype


+def test_numpy_dtype_timedelta():
+    if vparse(pytest.importorskip("pyarrow").__version__) < vparse("9"):
+        pytest.skip("pyarrow < 9 globally mutates the timedelta64 numpy dtype")
+
+    assert dt.dtype(np.dtype(np.timedelta64)) == dt.interval
+
+
 @pytest.mark.parametrize(
     ('pandas_dtype', 'ibis_dtype'),
     [
@@ -224,5 +221,7 @@ def test_schema_infer(col_data, schema_type):


 def test_pyarrow_string():
+    pytest.importorskip("pyarrow")
+
     s = pd.Series([], dtype="string[pyarrow]")
     assert dt.dtype(s.dtype) == dt.String()

ibis/backends/tests/test_export.py

Lines changed: 3 additions & 2 deletions
@@ -1,11 +1,12 @@
 import sys

-import pyarrow as pa
 import pytest
 from pytest import param

+pa = pytest.importorskip("pyarrow")
+
 # Adds `to_pyarrow` to created schema objects
-from ibis.backends.pyarrow.datatypes import sch  # noqa: F401
+from ibis.backends.pyarrow.datatypes import sch as _  # noqa: F401, E402


 class PackageDiscarder:
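
The module-level `pytest.importorskip` makes pytest skip the whole file at collection time when pyarrow is missing instead of erroring on the import. The same pattern in an isolated, illustrative test module:

import pytest

# Skips every test in this module when pyarrow is not installed.
pa = pytest.importorskip("pyarrow")


def test_table_roundtrip():
    t = pa.table({"x": [1, 2, 3]})
    assert t.num_rows == 3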
