fix(duckdb): ensure that duckdb columns argument to read_csv accepts duckdb syntax not ibis syntax (#10696)

cpcloud · web-flow · commit 83bed7422b24 · 2025-01-22T07:18:31.000-05:00
BREAKING CHANGE: The duckdb backend's `read_csv` method accepts only DuckDB types for the value components of the `columns` and `types` arguments. You may need to adjust existing code. For example, the string `"float64"` should be replaced with the string `"double"`.
diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
@@ -667,14 +667,18 @@ def read_csv(
         def make_struct_argument(obj: Mapping[str, str | dt.DataType]) -> sge.Struct:
             expressions = []
             geospatial = False
-            type_mapper = self.compiler.type_mapper
+            dialect = self.compiler.dialect
+            possible_geospatial_types = (
+                sge.DataType.Type.GEOGRAPHY,
+                sge.DataType.Type.GEOMETRY,
+            )
 
             for name, typ in obj.items():
-                typ = dt.dtype(typ)
-                geospatial |= typ.is_geospatial()
-                sgtype = type_mapper.from_ibis(typ)
+                sgtype = sg.parse_one(typ, read=dialect, into=sge.DataType)
+                geospatial |= sgtype.this in possible_geospatial_types
                 prop = sge.PropertyEQ(
-                    this=sge.to_identifier(name), expression=sge.convert(sgtype)
+                    this=sge.to_identifier(name),
+                    expression=sge.convert(sgtype.sql(dialect)),
                 )
                 expressions.append(prop)
 
diff --git a/ibis/backends/duckdb/tests/test_client.py b/ibis/backends/duckdb/tests/test_client.py
@@ -5,6 +5,7 @@
 import sys
 
 import duckdb
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
@@ -391,11 +392,12 @@ def test_multiple_tables_with_the_same_name(tmp_path):
 @pytest.mark.parametrize(
     "input",
     [
-        {"columns": {"lat": "float64", "lon": "float64", "geom": "geometry"}},
-        {"types": {"geom": "geometry"}},
+        {"columns": {"lat": "double", "lon": "float", "geom": "geometry"}},
+        {"types": {"geom": "geometry", "lon": "float"}},
     ],
+    ids=["columns", "types"],
 )
-@pytest.mark.parametrize("all_varchar", [True, False])
+@pytest.mark.parametrize("all_varchar", [True, False], ids=["varchar", "not_varchar"])
 @pytest.mark.xfail(
     LINUX and SANDBOXED,
     reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
@@ -423,8 +425,6 @@ def test_memtable_doesnt_leak(con):
 
 
 def test_pyarrow_batches_chunk_size(con):  # 10443
-    import numpy as np
-
     t = ibis.memtable(
         {
             "id": np.arange(10_000),
diff --git a/ibis/backends/duckdb/tests/test_io.py b/ibis/backends/duckdb/tests/test_io.py
@@ -15,6 +15,7 @@
 import ibis
 import ibis.expr.datatypes as dt
 from ibis.conftest import ARM64, LINUX, MACOS, SANDBOXED
+from ibis.util import gen_name
 
 
 def test_read_csv(con, data_dir):
@@ -461,3 +462,10 @@ def test_read_json_no_auto_detection(con, tmp_path):
 
     t = con.read_json(path, auto_detect=False, columns={"year": "varchar"})
     assert t.year.type() == dt.string
+
+
+def test_read_csv_with_duckdb_specific_types(con):
+    path = gen_name("duckdb")
+    columns = {"a": "STRUCT(a INTEGER)"}
+    with pytest.raises(duckdb.IOException, match="No files found"):
+        con.read_csv(path, columns=columns)