Skip to content

Commit fb33bf9

Browse files
krzysztof-kwitt authored and cpcloud committed
refactor(bigquery): explicitly disallow INT64 in JS UDF
1 parent ccf80fd commit fb33bf9

File tree

3 files changed

+67
-80
lines changed

3 files changed

+67
-80
lines changed

ibis/backends/bigquery/datatypes.py

Lines changed: 40 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,6 @@
44

55
import ibis.expr.datatypes as dt
66

7-
8-
class TypeTranslationContext:
9-
"""A tag class to alter the way a type is translated.
10-
11-
This is used to raise an exception when INT64 types are encountered to
12-
avoid surprising results due to BigQuery's handling of INT64 types in
13-
JavaScript UDFs.
14-
"""
15-
16-
__slots__ = ()
17-
18-
19-
class UDFContext(TypeTranslationContext):
20-
__slots__ = ()
21-
22-
23-
UDF_CONTEXT = UDFContext()
24-
257
ibis_type_to_bigquery_type = Dispatcher("ibis_type_to_bigquery_type")
268

279

@@ -30,81 +12,60 @@ def trans_string_default(datatype):
3012
return ibis_type_to_bigquery_type(dt.dtype(datatype))
3113

3214

33-
@ibis_type_to_bigquery_type.register(dt.DataType)
34-
def trans_default(t):
35-
return ibis_type_to_bigquery_type(t, TypeTranslationContext())
36-
37-
38-
@ibis_type_to_bigquery_type.register(str, TypeTranslationContext)
39-
def trans_string_context(datatype, context):
40-
return ibis_type_to_bigquery_type(dt.dtype(datatype), context)
41-
42-
43-
@ibis_type_to_bigquery_type.register(dt.Floating, TypeTranslationContext)
44-
def trans_float64(t, context):
15+
@ibis_type_to_bigquery_type.register(dt.Floating)
16+
def trans_float64(t):
4517
return "FLOAT64"
4618

4719

48-
@ibis_type_to_bigquery_type.register(dt.Integer, TypeTranslationContext)
49-
def trans_integer(t, context):
20+
@ibis_type_to_bigquery_type.register(dt.Integer)
21+
def trans_integer(t):
5022
return "INT64"
5123

5224

53-
@ibis_type_to_bigquery_type.register(dt.Binary, TypeTranslationContext)
54-
def trans_binary(t, context):
25+
@ibis_type_to_bigquery_type.register(dt.Binary)
26+
def trans_binary(t):
5527
return "BYTES"
5628

5729

58-
@ibis_type_to_bigquery_type.register(dt.UInt64, (TypeTranslationContext, UDFContext))
59-
def trans_lossy_integer(t, context):
30+
@ibis_type_to_bigquery_type.register(dt.UInt64)
31+
def trans_lossy_integer(t):
6032
raise TypeError("Conversion from uint64 to BigQuery integer type (int64) is lossy")
6133

6234

63-
@ibis_type_to_bigquery_type.register(dt.Array, TypeTranslationContext)
64-
def trans_array(t, context):
65-
return f"ARRAY<{ibis_type_to_bigquery_type(t.value_type, context)}>"
35+
@ibis_type_to_bigquery_type.register(dt.Array)
36+
def trans_array(t):
37+
return f"ARRAY<{ibis_type_to_bigquery_type(t.value_type)}>"
6638

6739

68-
@ibis_type_to_bigquery_type.register(dt.Struct, TypeTranslationContext)
69-
def trans_struct(t, context):
40+
@ibis_type_to_bigquery_type.register(dt.Struct)
41+
def trans_struct(t):
7042
return "STRUCT<{}>".format(
7143
", ".join(
72-
f"{name} {ibis_type_to_bigquery_type(dt.dtype(type), context)}"
73-
for name, type in zip(t.names, t.types)
44+
f"{name} {ibis_type_to_bigquery_type(dt.dtype(type_))}"
45+
for name, type_ in zip(t.names, t.types)
7446
)
7547
)
7648

7749

78-
@ibis_type_to_bigquery_type.register(dt.Date, TypeTranslationContext)
79-
def trans_date(t, context):
50+
@ibis_type_to_bigquery_type.register(dt.Date)
51+
def trans_date(t):
8052
return "DATE"
8153

8254

83-
@ibis_type_to_bigquery_type.register(dt.Timestamp, TypeTranslationContext)
84-
def trans_timestamp(t, context):
55+
@ibis_type_to_bigquery_type.register(dt.Timestamp)
56+
def trans_timestamp(t):
8557
if t.timezone is not None:
8658
raise TypeError("BigQuery does not support timestamps with timezones")
8759
return "TIMESTAMP"
8860

8961

90-
@ibis_type_to_bigquery_type.register(dt.DataType, TypeTranslationContext)
91-
def trans_type(t, context):
62+
@ibis_type_to_bigquery_type.register(dt.DataType)
63+
def trans_type(t):
9264
return str(t).upper()
9365

9466

95-
@ibis_type_to_bigquery_type.register(dt.Integer, UDFContext)
96-
def trans_integer_udf(t, context):
97-
# JavaScript does not have integers, only a Number class. BigQuery doesn't
98-
# behave as expected with INT64 inputs or outputs
99-
raise TypeError(
100-
"BigQuery does not support INT64 as an argument type or a return type "
101-
"for UDFs. Replace INT64 with FLOAT64 in your UDF signature and "
102-
"cast all INT64 inputs to FLOAT64."
103-
)
104-
105-
106-
@ibis_type_to_bigquery_type.register(dt.Decimal, TypeTranslationContext)
107-
def trans_numeric(t, context):
67+
@ibis_type_to_bigquery_type.register(dt.Decimal)
68+
def trans_numeric(t):
10869
if (t.precision, t.scale) != (38, 9):
10970
raise TypeError(
11071
"BigQuery only supports decimal types with precision of 38 and "
@@ -113,11 +74,22 @@ def trans_numeric(t, context):
11374
return "NUMERIC"
11475

11576

116-
@ibis_type_to_bigquery_type.register(dt.Decimal, UDFContext)
117-
def trans_numeric_udf(t, context):
118-
raise TypeError("Decimal types are not supported in BigQuery UDFs")
77+
@ibis_type_to_bigquery_type.register(dt.JSON)
78+
def trans_json(t):
79+
return "JSON"
11980

12081

121-
@ibis_type_to_bigquery_type.register(dt.JSON, TypeTranslationContext)
122-
def trans_json(t, context):
123-
return "JSON"
82+
def spread_type(dt: dt.DataType):
83+
"""Returns a generator that contains all the types in the given type.
84+
85+
For nested types like array, struct, and map, it yields the types of the elements before the type itself.
86+
"""
87+
if dt.is_array():
88+
yield from spread_type(dt.value_type)
89+
elif dt.is_struct():
90+
for type_ in dt.types:
91+
yield from spread_type(type_)
92+
elif dt.is_map():
93+
yield from spread_type(dt.key_type)
94+
yield from spread_type(dt.value_type)
95+
yield dt

ibis/backends/bigquery/tests/unit/test_datatypes.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44

55
import ibis.expr.datatypes as dt
66
from ibis.backends.bigquery.datatypes import (
7-
UDF_CONTEXT,
87
ibis_type_to_bigquery_type,
8+
spread_type,
99
)
1010

1111

@@ -63,20 +63,25 @@ def test_simple_failure_mode(datatype):
6363

6464

6565
@pytest.mark.parametrize(
66-
("type", "expected"),
66+
("type_", "expected"),
6767
[
68-
param(dt.int64, "INT64", marks=pytest.mark.xfail(raises=TypeError)),
68+
param(
69+
dt.int64,
70+
[dt.int64],
71+
),
6972
param(
7073
dt.Array(dt.int64),
71-
"ARRAY<INT64>",
72-
marks=pytest.mark.xfail(raises=TypeError),
74+
[dt.int64, dt.Array(value_type=dt.int64)],
7375
),
7476
param(
7577
dt.Struct.from_tuples([("a", dt.Array(dt.int64))]),
76-
"STRUCT<a ARRAY<INT64>>",
77-
marks=pytest.mark.xfail(raises=TypeError),
78+
[
79+
dt.int64,
80+
dt.Array(value_type=dt.int64),
81+
dt.Struct.from_tuples([('a', dt.Array(value_type=dt.int64))]),
82+
],
7883
),
7984
],
8085
)
81-
def test_ibis_type_to_bigquery_type_udf(type, expected):
82-
assert ibis_type_to_bigquery_type(type, UDF_CONTEXT) == expected
86+
def test_spread_type(type_, expected):
87+
assert list(spread_type(type_)) == expected

ibis/backends/bigquery/udf/__init__.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import ibis.expr.datatypes as dt
99
import ibis.expr.rules as rlz
10-
from ibis.backends.bigquery.datatypes import UDF_CONTEXT, ibis_type_to_bigquery_type
10+
from ibis.backends.bigquery.datatypes import ibis_type_to_bigquery_type, spread_type
1111
from ibis.backends.bigquery.operations import BigQueryUDFNode
1212
from ibis.backends.bigquery.udf.core import PythonToJavaScriptTranslator
1313
from ibis.udf.validate import validate_output_type
@@ -254,6 +254,16 @@ def js(
254254
""";
255255
'''
256256
validate_output_type(output_type)
257+
if any(
258+
type_ == dt.int64
259+
for param_type in params.values()
260+
for type_ in spread_type(param_type)
261+
) or any(type_ == dt.int64 for type_ in spread_type(output_type)):
262+
raise TypeError(
263+
"BigQuery does not support INT64 as an argument type or a return type "
264+
"for UDFs. Replace INT64 with FLOAT64 in your UDF signature and "
265+
"cast all INT64 inputs to FLOAT64."
266+
)
257267

258268
if libraries is None:
259269
libraries = []
@@ -276,11 +286,11 @@ def compiles_udf_node(t, op):
276286
bigquery_signature = ", ".join(
277287
"{name} {type}".format(
278288
name=name,
279-
type=ibis_type_to_bigquery_type(dt.dtype(type_), UDF_CONTEXT),
289+
type=ibis_type_to_bigquery_type(dt.dtype(type_)),
280290
)
281291
for name, type_ in params.items()
282292
)
283-
return_type = ibis_type_to_bigquery_type(dt.dtype(output_type), UDF_CONTEXT)
293+
return_type = ibis_type_to_bigquery_type(dt.dtype(output_type))
284294
libraries_opts = (
285295
f"\nOPTIONS (\n library={repr(list(libraries))}\n)" if libraries else ""
286296
)

0 commit comments

Comments
 (0)