refactor(ir): Don't bypass argument coercion and validation for user-defined functions (#3001)

kszucs · web-flow · commit 52fa8a97faf5 · 2021-11-03T15:42:37.000-04:00
* Inherit slots instead of redefining them

* Don't bypass argument coercion and validation for user-defined functions

* Update return type's attribute name

* Use VectorizedUDF base

* Add tests

* Remove/rewrite scalar udf tests

* Remove test from dask backend
diff --git a/ibis/backends/dask/tests/test_udf.py b/ibis/backends/dask/tests/test_udf.py
@@ -175,12 +175,6 @@ def test_udf(t, df):
     tm.assert_series_equal(result, expected, check_names=False)
 
 
-def test_elementwise_udf_with_non_vectors(con):
-    expr = my_add(1.0, 2.0)
-    result = con.execute(expr)
-    assert result == 3.0
-
-
 def test_multiple_argument_udf(con, t, df):
     expr = my_add(t.b, t.c)
 
diff --git a/ibis/backends/dask/udf.py b/ibis/backends/dask/udf.py
@@ -26,8 +26,8 @@ def make_struct_op_meta(op: ir.Expr) -> List[Tuple[str, np.dtype]]:
     """Unpacks a dt.Struct into a DataFrame meta"""
     return list(
         zip(
-            op._output_type.names,
-            [x.to_dask() for x in op._output_type.types],
+            op.return_type.names,
+            [x.to_dask() for x in op.return_type.types],
         )
     )
 
@@ -72,16 +72,14 @@ def execute_udf_node(op, *args, **kwargs):
         # kwargs here. This is true for all udf execution in this
         # file.
         # See ibis.udf.vectorized.UserDefinedFunction
-        if isinstance(op._output_type, dt.Struct):
+        if isinstance(op.return_type, dt.Struct):
             meta = make_struct_op_meta(op)
 
             df = dd.map_partitions(op.func, *args, meta=meta)
             return df
         else:
             name = args[0].name if len(args) == 1 else None
-            meta = pandas.Series(
-                [], name=name, dtype=op._output_type.to_dask()
-            )
+            meta = pandas.Series([], name=name, dtype=op.return_type.to_dask())
             df = dd.map_partitions(op.func, *args, meta=meta)
 
             return df
@@ -124,11 +122,11 @@ def lazy_agg(*series: pandas.Series):
         # Depending on the type of operation, lazy_result is a Delayed that
         # could become a dd.Series or a dd.core.Scalar
         if isinstance(op, ops.AnalyticVectorizedUDF):
-            if isinstance(op._output_type, dt.Struct):
+            if isinstance(op.return_type, dt.Struct):
                 meta = make_struct_op_meta(op)
             else:
                 meta = make_meta_series(
-                    dtype=op._output_type.to_dask(),
+                    dtype=op.return_type.to_dask(),
                     name=args[0].name,
                 )
             result = dd.from_delayed(lazy_result, meta=meta)
@@ -151,13 +149,13 @@ def lazy_agg(*series: pandas.Series):
                 result = result.repartition(divisions=original_divisions)
         else:
             # lazy_result is a dd.core.Scalar from an ungrouped reduction
-            if isinstance(op._output_type, (dt.Array, dt.Struct)):
+            if isinstance(op.return_type, (dt.Array, dt.Struct)):
                 # we're outputing a dt.Struct that will need to be destructured
                 # or an array of an unknown size.
                 # we compute so we can work with items inside downstream.
                 result = lazy_result.compute()
             else:
-                output_meta = safe_scalar_type(op._output_type.to_dask())
+                output_meta = safe_scalar_type(op.return_type.to_dask())
                 result = dd.from_delayed(
                     lazy_result, meta=output_meta, verify_meta=False
                 )
@@ -181,7 +179,7 @@ def execute_reduction_node_groupby(op, *args, aggcontext, **kwargs):
         func = op.func
         groupings = args[0].index
         parent_df = args[0].obj
-        out_type = op._output_type.to_dask()
+        out_type = op.return_type.to_dask()
 
         grouped_df = parent_df.groupby(groupings)
         col_names = [col._meta._selected_obj.name for col in args]
@@ -223,7 +221,7 @@ def execute_analytic_node_groupby(op, *args, aggcontext, **kwargs):
         func = op.func
         groupings = args[0].index
         parent_df = args[0].obj
-        out_type = op._output_type.to_dask()
+        out_type = op.return_type.to_dask()
 
         grouped_df = parent_df.groupby(groupings)
         col_names = [col._meta._selected_obj.name for col in args]
@@ -232,7 +230,7 @@ def apply_wrapper(df, apply_func, col_names):
             cols = (df[col] for col in col_names)
             return apply_func(*cols)
 
-        if isinstance(op._output_type, dt.Struct):
+        if isinstance(op.return_type, dt.Struct):
             # with struct output we destruct to a dataframe directly
             meta = dd.utils.make_meta(make_struct_op_meta(op))
             meta.index.name = parent_df.index.name
diff --git a/ibis/backends/pandas/tests/execution/test_functions.py b/ibis/backends/pandas/tests/execution/test_functions.py
@@ -216,10 +216,16 @@ def test_execute_with_same_hash_value_in_scope(
     def my_func(x, y):
         return x
 
-    expr = my_func(left, right)
+    df = pd.DataFrame({"left": [left], "right": [right]})
+    table = ibis.pandas.from_dataframe(df)
+
+    expr = my_func(table.left, table.right)
     result = execute(expr)
-    assert type(result) is expected_type
-    assert result == expected_value
+    assert isinstance(result, pd.Series)
+
+    result = result.tolist()
+    assert result == [expected_value]
+    assert type(result[0]) is expected_type
 
 
 def test_ifelse_returning_bool():
@@ -248,7 +254,12 @@ def test_signature_does_not_match_input_type(dtype, value):
     def func(x):
         return x
 
-    expr = func(value)
-    result = execute(expr)
-    assert type(result) == type(value)
-    assert result == value
+    df = pd.DataFrame({"col": [value]})
+    table = ibis.pandas.from_dataframe(df)
+
+    result = execute(table.col)
+    assert isinstance(result, pd.Series)
+
+    result = result.tolist()
+    assert result == [value]
+    assert type(result[0]) is type(value)
diff --git a/ibis/backends/pandas/tests/test_udf.py b/ibis/backends/pandas/tests/test_udf.py
@@ -111,12 +111,6 @@ def test_udf(t, df):
     tm.assert_series_equal(result, expected)
 
 
-def test_elementwise_udf_with_non_vectors(con):
-    expr = my_add(1.0, 2.0)
-    result = con.execute(expr)
-    assert result == 3.0
-
-
 def test_multiple_argument_udf(con, t, df):
     expr = my_add(t.b, t.c)
 
diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py
@@ -1798,7 +1798,7 @@ def compile_fillna_table(t, expr, scope, timecontext, **kwargs):
 @compiles(ops.ElementWiseVectorizedUDF)
 def compile_elementwise_udf(t, expr, scope, timecontext, **kwargs):
     op = expr.op()
-    spark_output_type = spark_dtype(op._output_type)
+    spark_output_type = spark_dtype(op.return_type)
     func = op.func
     spark_udf = pandas_udf(func, spark_output_type, PandasUDFType.SCALAR)
     func_args = (t.translate(arg, scope, timecontext) for arg in op.func_args)
@@ -1809,7 +1809,7 @@ def compile_elementwise_udf(t, expr, scope, timecontext, **kwargs):
 def compile_reduction_udf(t, expr, scope, timecontext, context=None, **kwargs):
     op = expr.op()
 
-    spark_output_type = spark_dtype(op._output_type)
+    spark_output_type = spark_dtype(op.return_type)
     spark_udf = pandas_udf(
         op.func, spark_output_type, PandasUDFType.GROUPED_AGG
     )
diff --git a/ibis/expr/operations/vectorized.py b/ibis/expr/operations/vectorized.py
@@ -1,3 +1,5 @@
+from types import FunctionType, LambdaType
+
 from public import public
 
 from .. import rules as rlz
@@ -7,79 +9,39 @@
 from .reductions import Reduction
 
 
-@public
-class ElementWiseVectorizedUDF(ValueOp):
-    """Node for element wise UDF."""
-
-    func = Arg(callable)
-    func_args = Arg(tuple)
-    input_type = Arg(rlz.shape_like('func_args'))
-    _output_type = Arg(rlz.noop)
-
-    def __init__(self, func, args, input_type, output_type):
-        self.func = func
-        self.func_args = args
-        self.input_type = input_type
-        self._output_type = output_type
+class VectorizedUDF(ValueOp):
+    func = Arg(rlz.instance_of((FunctionType, LambdaType)))
+    func_args = Arg(rlz.list_of(rlz.column(rlz.any)))
+    input_type = Arg(rlz.list_of(rlz.datatype))
+    return_type = Arg(rlz.datatype)
 
     @property
     def inputs(self):
         return self.func_args
 
-    def output_type(self):
-        return self._output_type.column_type()
-
     def root_tables(self):
         return distinct_roots(*self.func_args)
 
 
 @public
-class ReductionVectorizedUDF(Reduction):
-    """Node for reduction UDF."""
+class ElementWiseVectorizedUDF(VectorizedUDF):
+    """Node for element wise UDF."""
 
-    func = Arg(callable)
-    func_args = Arg(tuple)
-    input_type = Arg(rlz.shape_like('func_args'))
-    _output_type = Arg(rlz.noop)
+    def output_type(self):
+        return self.return_type.column_type()
 
-    def __init__(self, func, args, input_type, output_type):
-        self.func = func
-        self.func_args = args
-        self.input_type = input_type
-        self._output_type = output_type
 
-    @property
-    def inputs(self):
-        return self.func_args
+@public
+class ReductionVectorizedUDF(VectorizedUDF, Reduction):
+    """Node for reduction UDF."""
 
     def output_type(self):
-        return self._output_type.scalar_type()
-
-    def root_tables(self):
-        return distinct_roots(*self.func_args)
+        return self.return_type.scalar_type()
 
 
 @public
-class AnalyticVectorizedUDF(AnalyticOp):
+class AnalyticVectorizedUDF(VectorizedUDF, AnalyticOp):
     """Node for analytics UDF."""
 
-    func = Arg(callable)
-    func_args = Arg(tuple)
-    input_type = Arg(rlz.shape_like('func_args'))
-    _output_type = Arg(rlz.noop)
-
-    def __init__(self, func, args, input_type, output_type):
-        self.func = func
-        self.func_args = args
-        self.input_type = input_type
-        self._output_type = output_type
-
-    @property
-    def inputs(self):
-        return self.func_args
-
     def output_type(self):
-        return self._output_type.column_type()
-
-    def root_tables(self):
-        return distinct_roots(*self.func_args)
+        return self.return_type.column_type()
diff --git a/ibis/tests/expr/test_udf.py b/ibis/tests/expr/test_udf.py
@@ -0,0 +1,81 @@
+import pytest
+
+import ibis
+import ibis.common.exceptions as com
+import ibis.expr.datatypes as dt
+import ibis.expr.operations as ops
+import ibis.expr.types as ir
+
+
+@pytest.fixture
+def table():
+    """Return a table expression ``test`` with int8, string, bool columns."""
+    columns = [
+        ("a", "int8"),
+        ("b", "string"),
+        ("c", "bool"),
+    ]
+
+    return ibis.table(columns, name="test")
+
+
+@pytest.mark.parametrize(
+    ("klass", "output_type"),
+    [
+        (ops.ElementWiseVectorizedUDF, ir.IntegerColumn),
+        (ops.ReductionVectorizedUDF, ir.IntegerScalar),
+        (ops.AnalyticVectorizedUDF, ir.IntegerColumn),
+    ],
+)
+def test_vectorized_udf_operations(table, klass, output_type):
+    """Check argument validation and output type of each vectorized UDF op."""
+    udf = klass(
+        func=lambda a, b, c: a,
+        func_args=[table.a, table.b, table.c],
+        input_type=[dt.int8(), dt.string(), dt.boolean()],
+        return_type=dt.int8(),
+    )
+    assert udf.func_args[0].equals(table.a)
+    assert udf.func_args[1].equals(table.b)
+    assert udf.func_args[2].equals(table.c)
+    assert udf.input_type == [dt.int8(), dt.string(), dt.boolean()]
+    assert udf.return_type == dt.int8()
+
+    expr = udf.output_type()(udf)
+    assert isinstance(expr, output_type)
+
+    with pytest.raises(com.IbisTypeError):
+        # only func is invalid: it must be a function or lambda, not an int
+        klass(
+            func=1,
+            func_args=[table.a, table.b, table.c],
+            input_type=[dt.int8(), dt.string(), dt.boolean()],
+            return_type=dt.int8(),
+        )
+
+    with pytest.raises(com.IbisTypeError):
+        # only func_args is invalid: a scalar literal where a column is needed
+        klass(
+            func=lambda a, b, c: a,
+            func_args=[ibis.literal(1), table.b, table.c],
+            input_type=[dt.int8(), dt.string(), dt.boolean()],
+            return_type=dt.int8(),
+        )
+
+    with pytest.raises(com.IbisTypeError):
+        # only input_type is invalid: a bare string, not a list of datatypes
+        klass(
+            func=lambda a, b, c: a,
+            func_args=[table.a, table.b, table.c],
+            input_type="int8",
+            return_type=dt.int8(),
+        )
+
+    with pytest.raises(com.IbisTypeError):
+        # only return_type is invalid: a table expression is not a datatype
+        klass(
+            func=lambda a, b, c: a,
+            func_args=[table.a, table.b, table.c],
+            input_type=[dt.int8(), dt.string(), dt.boolean()],
+            return_type=table,
+        )
diff --git a/ibis/udf/vectorized.py b/ibis/udf/vectorized.py
@@ -85,9 +85,9 @@ def func(*args):
 
         op = self.func_type(
             func=func,
-            args=args,
+            func_args=args,
             input_type=self.input_type,
-            output_type=self.output_type,
+            return_type=self.output_type,
         )
 
         return op.to_expr()