Skip to content

Commit c4f00dc

Browse files
feat: Use data types defined in the Rust Wren Engine to do type mapping (#913)
1 parent 9bfffc1 commit c4f00dc

File tree

10 files changed

+265
-204
lines changed

10 files changed

+265
-204
lines changed

ibis-server/app/model/metadata/bigquery.py

+32-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
Column,
55
Constraint,
66
ConstraintType,
7+
RustWrenEngineColumnType,
78
Table,
89
TableProperties,
910
)
@@ -17,6 +18,8 @@ def __init__(self, connection_info: BigQueryConnectionInfo):
1718

1819
def get_table_list(self) -> list[Table]:
1920
dataset_id = self.connection_info.dataset_id.get_secret_value()
21+
22+
# filter out columns with GEOGRAPHY & RANGE types
2023
sql = f"""
2124
SELECT
2225
c.table_catalog,
@@ -46,14 +49,11 @@ def get_table_list(self) -> list[Table]:
4649
AND cf.column_name = c.column_name
4750
LEFT JOIN {dataset_id}.INFORMATION_SCHEMA.TABLE_OPTIONS table_options
4851
ON c.table_name = table_options.table_name
52+
WHERE cf.data_type != 'GEOGRAPHY'
53+
AND cf.data_type NOT LIKE 'RANGE%'
4954
"""
5055
response = self.connection.sql(sql).to_pandas().to_dict(orient="records")
5156

52-
def get_data_type(data_type) -> str:
53-
if "STRUCT" in data_type:
54-
return "RECORD"
55-
return data_type
56-
5757
def get_column(row, nestedColumns=None) -> Column:
5858
return Column(
5959
# field_path supports both column & nested column
@@ -139,3 +139,30 @@ def get_constraints(self) -> list[Constraint]:
139139

140140
def get_version(self) -> str:
    """Return the version label reported for a BigQuery data source.

    The method returns a fixed descriptive string rather than a version
    number; it does not query the connection.
    """
    version_label = "Follow BigQuery release version"
    return version_label
142+
143+
def _transform_column_type(self, data_type):
    """Map a BigQuery column type name to a ``RustWrenEngineColumnType``.

    Args:
        data_type: The type name as reported by BigQuery, in any letter
            case (e.g. ``"INT64"``, ``"STRING(10)"``, ``"ARRAY<INT64>"``).

    Returns:
        The lower-cased ``data_type`` string unchanged when it starts with
        ``array`` or ``struct``; otherwise the matching
        ``RustWrenEngineColumnType`` member, or
        ``RustWrenEngineColumnType.UNKNOWN`` when the type is not mapped.
    """
    # Type names are matched case-insensitively.
    data_type = data_type.lower()

    # ARRAY<...> and STRUCT<...> are not mapped; pass them through as-is.
    if data_type.startswith(("array", "struct")):
        return data_type

    # Parameterized types such as STRING(10) or NUMERIC(10, 2) carry their
    # parameters in the type name; only the base name is used for lookup.
    base_type = data_type.split("(", 1)[0].strip()

    # Map BigQuery types to RustWrenEngineColumnType
    switcher = {
        # GEOGRAPHY and RANGE columns were filtered out
        "bool": RustWrenEngineColumnType.BOOL,
        "boolean": RustWrenEngineColumnType.BOOL,  # alias of BOOL
        "bytes": RustWrenEngineColumnType.BYTES,
        "date": RustWrenEngineColumnType.DATE,
        "datetime": RustWrenEngineColumnType.DATETIME,
        "interval": RustWrenEngineColumnType.INTERVAL,
        "json": RustWrenEngineColumnType.JSON,
        "int64": RustWrenEngineColumnType.INT64,
        "numeric": RustWrenEngineColumnType.NUMERIC,
        "bignumeric": RustWrenEngineColumnType.BIGNUMERIC,
        "float64": RustWrenEngineColumnType.FLOAT64,
        "string": RustWrenEngineColumnType.STRING,
        "time": RustWrenEngineColumnType.TIME,
        "timestamp": RustWrenEngineColumnType.TIMESTAMPTZ,
    }

    return switcher.get(base_type, RustWrenEngineColumnType.UNKNOWN)

ibis-server/app/model/metadata/canner.py

+45-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
from urllib.parse import urlparse
23

34
from gql import Client, gql
@@ -7,6 +8,7 @@
78
from app.model.metadata.dto import (
89
Column,
910
Constraint,
11+
RustWrenEngineColumnType,
1012
Table,
1113
TableProperties,
1214
)
@@ -181,7 +183,7 @@ def _build_columns(cls, columns: list[dict]) -> list[Column]:
181183
return [
182184
Column(
183185
name=column["originalColumn"]["name"],
184-
type=column["originalColumn"]["type"],
186+
type=cls._transform_column_type(column["originalColumn"]["type"]),
185187
notNull=column["originalColumn"]["properties"].get(
186188
"jdbc-nullable", False
187189
),
@@ -190,3 +192,45 @@ def _build_columns(cls, columns: list[dict]) -> list[Column]:
190192
)
191193
for column in columns
192194
]
195+
196+
@classmethod
def _transform_column_type(cls, data_type):
    """Map a Canner column type name to a ``RustWrenEngineColumnType``.

    Args:
        data_type: The column type string, optionally parameterized
            (e.g. ``"varchar(10)"``, ``"decimal(10,2)"``,
            ``"timestamp(3) with time zone"``), in any letter case.

    Returns:
        The matching ``RustWrenEngineColumnType`` member, or
        ``RustWrenEngineColumnType.UNKNOWN`` when the type is not mapped.
    """
    # all possible types listed here: https://trino.io/docs/current/language/types.html
    # Drop the parenthesized parameters, then normalize whitespace and case
    # so "TIMESTAMP(3)  WITH TIME ZONE" becomes "timestamp with time zone".
    stripped = re.sub(r"\(.*\)", "", data_type)
    type_name = " ".join(stripped.split()).lower()

    switcher = {
        # String Types (ignore Binary and Spatial Types for now)
        "char": RustWrenEngineColumnType.CHAR,
        "varchar": RustWrenEngineColumnType.VARCHAR,
        "tinytext": RustWrenEngineColumnType.TEXT,
        "text": RustWrenEngineColumnType.TEXT,
        "mediumtext": RustWrenEngineColumnType.TEXT,
        "longtext": RustWrenEngineColumnType.TEXT,
        "enum": RustWrenEngineColumnType.VARCHAR,
        "set": RustWrenEngineColumnType.VARCHAR,
        # Numeric Types(https://dev.mysql.com/doc/refman/8.4/en/numeric-types.html)
        "bit": RustWrenEngineColumnType.TINYINT,
        "tinyint": RustWrenEngineColumnType.TINYINT,
        "smallint": RustWrenEngineColumnType.SMALLINT,
        "mediumint": RustWrenEngineColumnType.INTEGER,
        "int": RustWrenEngineColumnType.INTEGER,
        "integer": RustWrenEngineColumnType.INTEGER,
        "bigint": RustWrenEngineColumnType.BIGINT,
        # boolean
        "bool": RustWrenEngineColumnType.BOOL,
        "boolean": RustWrenEngineColumnType.BOOL,
        # Decimal
        "real": RustWrenEngineColumnType.REAL,
        "float": RustWrenEngineColumnType.FLOAT8,
        "double": RustWrenEngineColumnType.DOUBLE,
        "decimal": RustWrenEngineColumnType.DECIMAL,
        "numeric": RustWrenEngineColumnType.NUMERIC,
        # Date and Time Types(https://dev.mysql.com/doc/refman/8.4/en/date-and-time-types.html)
        "date": RustWrenEngineColumnType.DATE,
        "time": RustWrenEngineColumnType.TIME,
        "datetime": RustWrenEngineColumnType.TIMESTAMP,
        "timestamp": RustWrenEngineColumnType.TIMESTAMPTZ,
        # What remains of "timestamp(p) with time zone" once the precision
        # has been stripped.
        "timestamp with time zone": RustWrenEngineColumnType.TIMESTAMPTZ,
        # JSON Type
        "json": RustWrenEngineColumnType.JSON,
    }

    return switcher.get(type_name, RustWrenEngineColumnType.UNKNOWN)

ibis-server/app/model/metadata/clickhouse.py

+24-24
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from app.model.metadata.dto import (
44
Column,
55
Constraint,
6+
RustWrenEngineColumnType,
67
Table,
78
TableProperties,
8-
WrenEngineColumnType,
99
)
1010
from app.model.metadata.metadata import Metadata
1111

@@ -80,29 +80,29 @@ def _transform_column_type(self, data_type):
8080
# lower case the data_type
8181
data_type = data_type.lower()
8282

83-
# Map ClickHouse types to WrenEngineColumnType
83+
# Map ClickHouse types to RustWrenEngineColumnType
8484
switcher = {
85-
"boolean": WrenEngineColumnType.BOOLEAN,
86-
"int8": WrenEngineColumnType.TINYINT,
87-
"uint8": WrenEngineColumnType.INT2,
88-
"int16": WrenEngineColumnType.INT2,
89-
"uint16": WrenEngineColumnType.INT2,
90-
"int32": WrenEngineColumnType.INT4,
91-
"uint32": WrenEngineColumnType.INT4,
92-
"int64": WrenEngineColumnType.INT8,
93-
"uint64": WrenEngineColumnType.INT8,
94-
"float32": WrenEngineColumnType.FLOAT4,
95-
"float64": WrenEngineColumnType.FLOAT8,
96-
"decimal": WrenEngineColumnType.DECIMAL,
97-
"date": WrenEngineColumnType.DATE,
98-
"datetime": WrenEngineColumnType.TIMESTAMP,
99-
"string": WrenEngineColumnType.VARCHAR,
100-
"fixedstring": WrenEngineColumnType.CHAR,
101-
"uuid": WrenEngineColumnType.UUID,
102-
"enum8": WrenEngineColumnType.STRING, # Enums can be mapped to strings
103-
"enum16": WrenEngineColumnType.STRING, # Enums can be mapped to strings
104-
"ipv4": WrenEngineColumnType.INET,
105-
"ipv6": WrenEngineColumnType.INET,
85+
"boolean": RustWrenEngineColumnType.BOOL,
86+
"int8": RustWrenEngineColumnType.TINYINT,
87+
"uint8": RustWrenEngineColumnType.INT2,
88+
"int16": RustWrenEngineColumnType.INT2,
89+
"uint16": RustWrenEngineColumnType.INT2,
90+
"int32": RustWrenEngineColumnType.INT4,
91+
"uint32": RustWrenEngineColumnType.INT4,
92+
"int64": RustWrenEngineColumnType.INT8,
93+
"uint64": RustWrenEngineColumnType.INT8,
94+
"float32": RustWrenEngineColumnType.FLOAT4,
95+
"float64": RustWrenEngineColumnType.FLOAT8,
96+
"decimal": RustWrenEngineColumnType.DECIMAL,
97+
"date": RustWrenEngineColumnType.DATE,
98+
"datetime": RustWrenEngineColumnType.TIMESTAMP,
99+
"string": RustWrenEngineColumnType.VARCHAR,
100+
"fixedstring": RustWrenEngineColumnType.CHAR,
101+
"uuid": RustWrenEngineColumnType.UUID,
102+
"enum8": RustWrenEngineColumnType.STRING, # Enums can be mapped to strings
103+
"enum16": RustWrenEngineColumnType.STRING, # Enums can be mapped to strings
104+
"ipv4": RustWrenEngineColumnType.INET,
105+
"ipv6": RustWrenEngineColumnType.INET,
106106
}
107107

108-
return switcher.get(data_type, WrenEngineColumnType.UNKNOWN)
108+
return switcher.get(data_type, RustWrenEngineColumnType.UNKNOWN)

ibis-server/app/model/metadata/dto.py

+23-36
Original file line numberDiff line numberDiff line change
@@ -10,59 +10,46 @@ class MetadataDTO(BaseModel):
1010
connection_info: ConnectionInfo = Field(alias="connectionInfo")
1111

1212

13-
class WrenEngineColumnType(Enum):
14-
# Boolean Types
15-
BOOLEAN = "BOOLEAN"
16-
17-
# Numeric Types
13+
class RustWrenEngineColumnType(Enum):
14+
BOOL = "BOOL"
1815
TINYINT = "TINYINT"
1916
INT2 = "INT2"
20-
SMALLINT = "SMALLINT" # alias for INT2
17+
SMALLINT = "SMALLINT"
2118
INT4 = "INT4"
22-
INTEGER = "INTEGER" # alias for INT4
19+
INT = "INT"
20+
INTEGER = "INTEGER"
2321
INT8 = "INT8"
24-
BIGINT = "BIGINT" # alias for INT8
22+
BIGINT = "BIGINT"
2523
NUMERIC = "NUMERIC"
2624
DECIMAL = "DECIMAL"
27-
28-
# Floating-Point Types
29-
FLOAT4 = "FLOAT4"
30-
REAL = "REAL" # alias for FLOAT4
31-
FLOAT8 = "FLOAT8"
32-
DOUBLE = "DOUBLE" # alias for FLOAT8
33-
34-
# Character Types
3525
VARCHAR = "VARCHAR"
3626
CHAR = "CHAR"
37-
BPCHAR = "BPCHAR" # BPCHAR is fixed-length blank padded string
38-
TEXT = "TEXT" # alias for VARCHAR
39-
STRING = "STRING" # alias for VARCHAR
40-
NAME = "NAME" # alias for VARCHAR
41-
42-
# Date/Time Types
27+
BPCHAR = "BPCHAR"
28+
TEXT = "TEXT"
29+
STRING = "STRING"
30+
NAME = "NAME"
31+
FLOAT4 = "FLOAT4"
32+
REAL = "REAL"
33+
FLOAT = "FLOAT"
34+
FLOAT8 = "FLOAT8"
35+
DOUBLE = "DOUBLE"
4336
TIMESTAMP = "TIMESTAMP"
44-
TIMESTAMPTZ = "TIMESTAMP WITH TIME ZONE"
37+
TIMESTAMPTZ = "TIMESTAMPTZ"
4538
DATE = "DATE"
4639
INTERVAL = "INTERVAL"
47-
48-
# JSON Types
4940
JSON = "JSON"
50-
51-
# Object identifiers (OIDs) are used internally by PostgreSQL as primary keys for various system tables.
52-
# https:#www.postgresql.org/docs/current/datatype-oid.html
5341
OID = "OID"
54-
55-
# Binary Data Types
5642
BYTEA = "BYTEA"
57-
58-
# UUID Type
5943
UUID = "UUID"
60-
61-
# Network Address Types
6244
INET = "INET"
63-
64-
# Unknown Type
6545
UNKNOWN = "UNKNOWN"
46+
BIGNUMERIC = "BIGNUMERIC"
47+
BYTES = "BYTES"
48+
DATETIME = "DATETIME"
49+
FLOAT64 = "FLOAT64"
50+
INT64 = "INT64"
51+
TIME = "TIME"
52+
NULL = "NULL"
6653

6754

6855
class Column(BaseModel):

ibis-server/app/model/metadata/mssql.py

+28-28
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
Column,
55
Constraint,
66
ConstraintType,
7+
RustWrenEngineColumnType,
78
Table,
89
TableProperties,
9-
WrenEngineColumnType,
1010
)
1111
from app.model.metadata.metadata import Metadata
1212

@@ -173,40 +173,40 @@ def _format_constraint_name(
173173
return f"{table_name}_{column_name}_{referenced_table_name}_{referenced_column_name}"
174174

175175
def _transform_column_type(self, data_type):
176-
# Define the mapping of MSSQL data types to WrenEngineColumnType
176+
# Define the mapping of MSSQL data types to RustWrenEngineColumnType
177177
# ref: https://learn.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver15#exact-numerics
178178
switcher = {
179179
# String Types
180-
"char": WrenEngineColumnType.CHAR,
181-
"varchar": WrenEngineColumnType.VARCHAR,
182-
"text": WrenEngineColumnType.TEXT,
183-
"nchar": WrenEngineColumnType.CHAR,
184-
"nvarchar": WrenEngineColumnType.VARCHAR,
185-
"ntext": WrenEngineColumnType.TEXT,
180+
"char": RustWrenEngineColumnType.CHAR,
181+
"varchar": RustWrenEngineColumnType.VARCHAR,
182+
"text": RustWrenEngineColumnType.TEXT,
183+
"nchar": RustWrenEngineColumnType.CHAR,
184+
"nvarchar": RustWrenEngineColumnType.VARCHAR,
185+
"ntext": RustWrenEngineColumnType.TEXT,
186186
# Numeric Types
187-
"bit": WrenEngineColumnType.TINYINT,
188-
"tinyint": WrenEngineColumnType.TINYINT,
189-
"smallint": WrenEngineColumnType.SMALLINT,
190-
"int": WrenEngineColumnType.INTEGER,
191-
"bigint": WrenEngineColumnType.BIGINT,
187+
"bit": RustWrenEngineColumnType.TINYINT,
188+
"tinyint": RustWrenEngineColumnType.TINYINT,
189+
"smallint": RustWrenEngineColumnType.SMALLINT,
190+
"int": RustWrenEngineColumnType.INTEGER,
191+
"bigint": RustWrenEngineColumnType.BIGINT,
192192
# Boolean
193-
"boolean": WrenEngineColumnType.BOOLEAN,
193+
"boolean": RustWrenEngineColumnType.BOOL,
194194
# Decimal
195-
"float": WrenEngineColumnType.FLOAT8,
196-
"real": WrenEngineColumnType.FLOAT8,
197-
"decimal": WrenEngineColumnType.DECIMAL,
198-
"numeric": WrenEngineColumnType.NUMERIC,
199-
"money": WrenEngineColumnType.DECIMAL,
200-
"smallmoney": WrenEngineColumnType.DECIMAL,
195+
"float": RustWrenEngineColumnType.FLOAT8,
196+
"real": RustWrenEngineColumnType.FLOAT8,
197+
"decimal": RustWrenEngineColumnType.DECIMAL,
198+
"numeric": RustWrenEngineColumnType.NUMERIC,
199+
"money": RustWrenEngineColumnType.DECIMAL,
200+
"smallmoney": RustWrenEngineColumnType.DECIMAL,
201201
# Date and Time Types
202-
"date": WrenEngineColumnType.DATE,
203-
"datetime": WrenEngineColumnType.TIMESTAMP,
204-
"datetime2": WrenEngineColumnType.TIMESTAMPTZ,
205-
"smalldatetime": WrenEngineColumnType.TIMESTAMP,
206-
"time": WrenEngineColumnType.INTERVAL,
207-
"datetimeoffset": WrenEngineColumnType.TIMESTAMPTZ,
202+
"date": RustWrenEngineColumnType.DATE,
203+
"datetime": RustWrenEngineColumnType.TIMESTAMP,
204+
"datetime2": RustWrenEngineColumnType.TIMESTAMP,
205+
"smalldatetime": RustWrenEngineColumnType.TIMESTAMP,
206+
"time": RustWrenEngineColumnType.INTERVAL,
207+
"datetimeoffset": RustWrenEngineColumnType.TIMESTAMPTZ,
208208
# JSON Type (Note: MSSQL supports JSON natively as a string type)
209-
"json": WrenEngineColumnType.JSON,
209+
"json": RustWrenEngineColumnType.JSON,
210210
}
211211

212-
return switcher.get(data_type.lower(), WrenEngineColumnType.UNKNOWN)
212+
return switcher.get(data_type.lower(), RustWrenEngineColumnType.UNKNOWN)

0 commit comments

Comments
 (0)