Skip to content

Commit 61dd7ea

Browse files
authored
feat(datatypes): add string length (#11045)
1 parent b83a88e commit 61dd7ea

File tree

19 files changed, with 189 additions and 73 deletions.

ibis/backends/clickhouse/tests/test_datatypes.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ def test_array_discovery_clickhouse(con):
125125
),
126126
param(
127127
"Array(FixedString(32))",
128-
dt.Array(dt.String(nullable=False), nullable=False),
128+
dt.Array(dt.String(length=32, nullable=False), nullable=False),
129129
id="array_fixed_string",
130130
),
131131
param(

ibis/backends/exasol/__init__.py

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -103,8 +103,8 @@ def do_connect(
103103
bigint_col int64
104104
float_col float64
105105
double_col float64
106-
date_string_col string
107-
string_col string
106+
date_string_col string(256)
107+
string_col string(256)
108108
timestamp_col timestamp(3)
109109
year int32
110110
month int32
@@ -267,16 +267,19 @@ def _get_schema_using_query(self, query: str) -> sch.Schema:
267267
drop_view = sg.exp.Drop(kind="VIEW", this=table).sql(dialect)
268268
describe = sg.exp.Describe(this=table).sql(dialect)
269269
type_mapper = self.compiler.type_mapper
270+
con = self.con
270271
with self._safe_raw_sql(create_view):
271272
try:
272-
return sch.Schema(
273-
{
274-
name: type_mapper.from_string(_VARCHAR_REGEX.sub(r"\1", typ))
275-
for name, typ, *_ in self.con.execute(describe).fetchall()
276-
}
277-
)
273+
rows = con.execute(describe).fetchall()
278274
finally:
279-
self.con.execute(drop_view)
275+
con.execute(drop_view)
276+
277+
return sch.Schema(
278+
{
279+
name: type_mapper.from_string(_VARCHAR_REGEX.sub(r"\1", typ))
280+
for name, typ, *_ in rows
281+
}
282+
)
280283

281284
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
282285
schema = op.schema

ibis/backends/mssql/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,7 @@ def get_schema(
265265
C.numeric_precision,
266266
C.numeric_scale,
267267
C.datetime_precision,
268+
C.character_maximum_length,
268269
)
269270
.from_(
270271
sg.table(
@@ -295,12 +296,20 @@ def get_schema(
295296
numeric_precision,
296297
numeric_scale,
297298
datetime_precision,
299+
character_maximum_length,
298300
) in meta:
299301
newtyp = self.compiler.type_mapper.from_string(
300302
typ, nullable=is_nullable == "YES"
301303
)
302304

303-
if typ == "float":
305+
if (
306+
typ.lower() != "hierarchyid"
307+
and character_maximum_length is not None
308+
and character_maximum_length != -1
309+
and newtyp.is_string()
310+
):
311+
newtyp = newtyp.copy(length=character_maximum_length)
312+
elif typ == "float":
304313
newcls = dt.Float64 if numeric_precision == 53 else dt.Float32
305314
newtyp = newcls(nullable=newtyp.nullable)
306315
elif newtyp.is_decimal():

ibis/backends/mssql/tests/test_client.py

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -49,12 +49,16 @@
4949
("DATETIMEOFFSET", dt.timestamp(scale=7, timezone="UTC")),
5050
("SMALLDATETIME", dt.Timestamp(scale=0)),
5151
("DATETIME", dt.Timestamp(scale=3)),
52-
# Characters strings
53-
("CHAR", dt.string),
54-
("VARCHAR", dt.string),
52+
# Character strings
53+
("CHAR", dt.String(length=1)),
54+
("VARCHAR", dt.String(length=1)),
55+
("CHAR(73)", dt.String(length=73)),
56+
("VARCHAR(73)", dt.String(length=73)),
5557
# Unicode character strings
56-
("NCHAR", dt.string),
57-
("NVARCHAR", dt.string),
58+
("NCHAR", dt.String(length=1)),
59+
("NVARCHAR", dt.String(length=1)),
60+
("NCHAR(42)", dt.String(length=42)),
61+
("NVARCHAR(42)", dt.String(length=42)),
5862
# Binary strings
5963
("BINARY", dt.binary),
6064
("VARBINARY", dt.binary),
@@ -259,7 +263,7 @@ def test_dot_sql_with_unnamed_columns(con):
259263

260264
assert schema.types == (
261265
dt.Timestamp(timezone="UTC", scale=7),
262-
dt.String(nullable=False),
266+
dt.String(nullable=False, length=2),
263267
dt.Int32(nullable=False),
264268
)
265269

ibis/backends/mysql/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -192,11 +192,16 @@ def list_databases(self, *, like: str | None = None) -> list[str]:
192192
def _get_schema_using_query(self, query: str) -> sch.Schema:
193193
from ibis.backends.mysql.datatypes import _type_from_cursor_info
194194

195+
char_set_info = self.con.get_character_set_info()
196+
multi_byte_maximum_length = char_set_info["mbmaxlen"]
197+
195198
sql = (
196199
sg.select(STAR)
197200
.from_(
198201
sg.parse_one(query, dialect=self.dialect).subquery(
199-
sg.to_identifier("tmp", quoted=self.compiler.quoted)
202+
sg.to_identifier(
203+
util.gen_name("query_schema"), quoted=self.compiler.quoted
204+
)
200205
)
201206
)
202207
.limit(0)
@@ -210,13 +215,13 @@ def _get_schema_using_query(self, query: str) -> sch.Schema:
210215
for (name, type_code, _, _, field_length, scale, _), raw_flags in zip(
211216
descr, flags
212217
):
213-
item = _type_from_cursor_info(
218+
items[name] = _type_from_cursor_info(
214219
flags=raw_flags,
215220
type_code=type_code,
216221
field_length=field_length,
217222
scale=scale,
223+
multi_byte_maximum_length=multi_byte_maximum_length,
218224
)
219-
items[name] = item
220225
return sch.Schema(items)
221226

222227
def get_schema(

ibis/backends/mysql/datatypes.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,9 @@
2020
)
2121

2222

23-
def _type_from_cursor_info(*, flags, type_code, field_length, scale) -> dt.DataType:
23+
def _type_from_cursor_info(
24+
*, flags, type_code, field_length, scale, multi_byte_maximum_length
25+
) -> dt.DataType:
2426
"""Construct an ibis type from MySQL field descr and field result metadata.
2527
2628
This method is complex because the MySQL protocol is complex.
@@ -60,7 +62,7 @@ def _type_from_cursor_info(*, flags, type_code, field_length, scale) -> dt.DataT
6062
if flags.is_binary:
6163
typ = dt.Binary
6264
else:
63-
typ = dt.String
65+
typ = partial(dt.String, length=field_length // multi_byte_maximum_length)
6466
elif flags.is_timestamp or typename == "TIMESTAMP":
6567
typ = partial(dt.Timestamp, timezone="UTC", scale=scale or None)
6668
elif typename == "DATETIME":

ibis/backends/mysql/tests/test_client.py

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -45,19 +45,16 @@
4545
param("time", dt.time, id="time"),
4646
param("datetime", dt.timestamp, id="datetime"),
4747
param("year", dt.uint8, id="year"),
48-
param("char(32)", dt.string, id="char"),
48+
param("char(32)", dt.String(length=32), id="char"),
4949
param("char byte", dt.binary, id="char_byte"),
50-
param("varchar(42)", dt.string, id="varchar"),
51-
param("mediumtext", dt.string, id="mediumtext"),
52-
param("text", dt.string, id="text"),
50+
param("varchar(42)", dt.String(length=42), id="varchar"),
5351
param("binary(42)", dt.binary, id="binary"),
5452
param("varbinary(42)", dt.binary, id="varbinary"),
5553
param("bit(1)", dt.int8, id="bit_1"),
5654
param("bit(9)", dt.int16, id="bit_9"),
5755
param("bit(17)", dt.int32, id="bit_17"),
5856
param("bit(33)", dt.int64, id="bit_33"),
5957
# mariadb doesn't have a distinct json type
60-
param("enum('small', 'medium', 'large')", dt.string, id="enum"),
6158
param("set('a', 'b', 'c', 'd')", dt.Array(dt.string), id="set"),
6259
param("mediumblob", dt.binary, id="mediumblob"),
6360
param("blob", dt.binary, id="blob"),
@@ -100,6 +97,14 @@ def test_get_schema_from_query(con, mysql_type, expected_type):
10097
param("json", dt.binary, dt.string, id="json"),
10198
param("inet6", dt.binary, dt.inet, id="inet"),
10299
param("uuid", dt.binary, dt.uuid, id="uuid"),
100+
param(
101+
"enum('small', 'medium', 'large')",
102+
dt.String(length=6),
103+
dt.string,
104+
id="enum",
105+
),
106+
param("mediumtext", dt.String(length=2**24 - 1), dt.string, id="mediumtext"),
107+
param("text", dt.String(length=2**16 - 1), dt.string, id="text"),
103108
],
104109
)
105110
def test_get_schema_from_query_special_cases(

ibis/backends/postgres/tests/test_client.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,8 @@ def test_create_and_drop_table(con, temp_table, params):
148148
for (pg_type, ibis_type) in [
149149
("boolean", dt.boolean),
150150
("bytea", dt.binary),
151-
("char", dt.string),
151+
("char", dt.String(length=1)),
152+
("char(42)", dt.String(length=42)),
152153
("bigint", dt.int64),
153154
("smallint", dt.int16),
154155
("integer", dt.int32),
@@ -162,8 +163,10 @@ def test_create_and_drop_table(con, temp_table, params):
162163
("macaddr", dt.macaddr),
163164
("macaddr8", dt.macaddr),
164165
("inet", dt.inet),
165-
("character", dt.string),
166+
("character", dt.String(length=1)),
166167
("character varying", dt.string),
168+
("character varying(73)", dt.String(length=73)),
169+
("varchar(37)", dt.String(length=37)),
167170
("date", dt.date),
168171
("time", dt.time),
169172
("time without time zone", dt.time),
@@ -308,13 +311,13 @@ def test_pgvector_type_load(con, vector_size):
308311
def test_name_dtype(con):
309312
expected_schema = ibis.schema(
310313
{
311-
"f_table_catalog": dt.String(nullable=True),
314+
"f_table_catalog": dt.String(length=256, nullable=True),
312315
"f_table_schema": dt.String(nullable=True),
313316
"f_table_name": dt.String(nullable=True),
314317
"f_geometry_column": dt.String(nullable=True),
315318
"coord_dimension": dt.Int32(nullable=True),
316319
"srid": dt.Int32(nullable=True),
317-
"type": dt.String(nullable=True),
320+
"type": dt.String(length=30, nullable=True),
318321
}
319322
)
320323

ibis/backends/snowflake/tests/test_datatypes.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -63,10 +63,10 @@ def con():
6363
("DOUBLE PRECISION", dt.float64),
6464
("REAL", dt.float64),
6565
("VARCHAR", dt.string),
66-
("VARCHAR(50)", dt.string),
67-
("CHAR", dt.string),
68-
("CHAR(5)", dt.string),
69-
("CHARACTER", dt.string),
66+
("VARCHAR(50)", dt.String(length=50)),
67+
("CHAR", dt.String(length=1)),
68+
("CHAR(5)", dt.String(length=5)),
69+
("CHARACTER", dt.String(length=1)),
7070
("STRING", dt.string),
7171
("TEXT", dt.string),
7272
("BINARY", dt.binary),

0 commit comments

Comments
 (0)