Skip to content

Commit 518d315

Browse files
feat: Series binary ops compatible with more types (#618)
1 parent 7227a6a commit 518d315

File tree

7 files changed

+200
-99
lines changed

7 files changed

+200
-99
lines changed

bigframes/core/__init__.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,6 @@ def _compiled_schema(self) -> schemata.ArraySchema:
117117
)
118118
return schemata.ArraySchema(items)
119119

120-
def validate_schema(self):
121-
tree_derived = self.node.schema
122-
ibis_derived = self._compiled_schema
123-
if tree_derived.names != ibis_derived.names:
124-
raise ValueError(
125-
f"Unexpected names internal {tree_derived.names} vs compiled {ibis_derived.names}"
126-
)
127-
if tree_derived.dtypes != ibis_derived.dtypes:
128-
raise ValueError(
129-
f"Unexpected types internal {tree_derived.dtypes} vs compiled {ibis_derived.dtypes}"
130-
)
131-
132120
def _try_evaluate_local(self):
133121
"""Use only for unit testing paths - not fully featured. Will throw exception if fails."""
134122
import ibis

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1366,7 +1366,7 @@ def clip_op(
13661366

13671367

13681368
@scalar_op_compiler.register_nary_op(ops.case_when_op)
1369-
def switch_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value:
1369+
def case_when_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value:
13701370
# ibis can handle most type coercions, but we need to force bool -> int
13711371
# TODO: dispatch coercion depending on bigframes dtype schema
13721372
result_values = cases_and_outputs[1::2]

bigframes/core/convert.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,22 @@
2121
import bigframes.series as series
2222

2323

24-
def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series:
24+
def is_series_convertible(obj) -> bool:
25+
if isinstance(obj, series.Series):
26+
return True
27+
if isinstance(obj, pd.Series):
28+
return True
29+
if isinstance(obj, index.Index):
30+
return True
31+
if isinstance(obj, pd.Index):
32+
return True
33+
if pd.api.types.is_list_like(obj):
34+
return True
35+
else:
36+
return False
37+
38+
39+
def to_bf_series(obj, default_index: Optional[index.Index], session) -> series.Series:
2540
"""
2641
Convert a an object to a bigframes series
2742
@@ -37,13 +52,15 @@ def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series:
3752
if isinstance(obj, series.Series):
3853
return obj
3954
if isinstance(obj, pd.Series):
40-
return series.Series(obj)
55+
return series.Series(obj, session=session)
4156
if isinstance(obj, index.Index):
42-
return series.Series(obj, default_index)
57+
return series.Series(obj, default_index, session=session)
4358
if isinstance(obj, pd.Index):
44-
return series.Series(obj, default_index)
59+
return series.Series(obj, default_index, session=session)
60+
if pd.api.types.is_dict_like(obj):
61+
return series.Series(obj, session=session)
4562
if pd.api.types.is_list_like(obj):
46-
return series.Series(obj, default_index)
63+
return series.Series(obj, default_index, session=session)
4764
else:
4865
raise TypeError(f"Cannot interpret {obj} as series.")
4966

@@ -69,6 +86,8 @@ def to_pd_series(obj, default_index: pd.Index) -> pd.Series:
6986
return pd.Series(obj.to_pandas(), default_index)
7087
if isinstance(obj, pd.Index):
7188
return pd.Series(obj, default_index)
89+
if pd.api.types.is_dict_like(obj):
90+
return pd.Series(obj)
7291
if pd.api.types.is_list_like(obj):
7392
return pd.Series(obj, default_index)
7493
else:

bigframes/dataframe.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ def __init__(
105105
raise ValueError(
106106
f"DataFrame constructor only supports copy=True. {constants.FEEDBACK_LINK}"
107107
)
108+
# just ignore object dtype if provided
109+
if dtype in {numpy.dtypes.ObjectDType, "object"}:
110+
dtype = None
108111

109112
# Check to see if constructing from BigQuery-backed objects before
110113
# falling back to pandas constructor
@@ -668,7 +671,9 @@ def _apply_binop(
668671
DataFrame(other), op, how=how, reverse=reverse
669672
)
670673
elif utils.get_axis_number(axis) == 0:
671-
bf_series = bigframes.core.convert.to_bf_series(other, self.index)
674+
bf_series = bigframes.core.convert.to_bf_series(
675+
other, self.index, self._session
676+
)
672677
return self._apply_series_binop_axis_0(bf_series, op, how, reverse)
673678
elif utils.get_axis_number(axis) == 1:
674679
pd_series = bigframes.core.convert.to_pd_series(other, self.columns)

bigframes/operations/base.py

Lines changed: 79 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
import typing
1818

1919
import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing
20+
import numpy
2021
import pandas as pd
2122

2223
import bigframes.constants as constants
2324
import bigframes.core.blocks as blocks
25+
import bigframes.core.convert
2426
import bigframes.core.expression as ex
2527
import bigframes.core.indexes as indexes
2628
import bigframes.core.scalar as scalars
@@ -44,7 +46,19 @@ def __init__(
4446
*,
4547
session: typing.Optional[bigframes.session.Session] = None,
4648
):
47-
block = None
49+
import bigframes.pandas
50+
51+
# just ignore object dtype if provided
52+
if dtype in {numpy.dtypes.ObjectDType, "object"}:
53+
dtype = None
54+
55+
read_pandas_func = (
56+
session.read_pandas
57+
if (session is not None)
58+
else (lambda x: bigframes.pandas.read_pandas(x))
59+
)
60+
61+
block: typing.Optional[blocks.Block] = None
4862
if copy is not None and not copy:
4963
raise ValueError(
5064
f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}"
@@ -55,58 +69,75 @@ def __init__(
5569
assert index is None
5670
block = data
5771

58-
elif isinstance(data, SeriesMethods):
59-
block = data._block
72+
# interpret these cases as both index and data
73+
elif (
74+
isinstance(data, SeriesMethods)
75+
or isinstance(data, pd.Series)
76+
or pd.api.types.is_dict_like(data)
77+
):
78+
if isinstance(data, pd.Series):
79+
data = read_pandas_func(data)
80+
elif pd.api.types.is_dict_like(data):
81+
data = read_pandas_func(pd.Series(data, dtype=dtype)) # type: ignore
82+
dtype = None
83+
data_block = data._block
6084
if index is not None:
6185
# reindex
62-
bf_index = indexes.Index(index)
86+
bf_index = indexes.Index(index, session=session)
6387
idx_block = bf_index._block
6488
idx_cols = idx_block.value_columns
65-
block_idx, _ = idx_block.join(block, how="left")
66-
block = block_idx.with_index_labels(bf_index.names)
67-
68-
elif isinstance(data, indexes.Index):
89+
block_idx, _ = idx_block.join(data_block, how="left")
90+
data_block = block_idx.with_index_labels(bf_index.names)
91+
block = data_block
92+
93+
# list-like data that will get default index
94+
elif isinstance(data, indexes.Index) or pd.api.types.is_list_like(data):
95+
data = indexes.Index(data, dtype=dtype, session=session)
96+
dtype = (
97+
None # set to none as it has already been applied, avoid re-cast later
98+
)
6999
if data.nlevels != 1:
70100
raise NotImplementedError("Cannot interpret multi-index as Series.")
71101
# Reset index to promote index columns to value columns, set default index
72-
block = data._block.reset_index(drop=False)
102+
data_block = data._block.reset_index(drop=False).with_column_labels(
103+
data.names
104+
)
73105
if index is not None:
74106
# Align by offset
75-
bf_index = indexes.Index(index)
76-
idx_block = bf_index._block.reset_index(drop=False)
107+
bf_index = indexes.Index(index, session=session)
108+
idx_block = bf_index._block.reset_index(
109+
drop=False
110+
) # reset to align by offsets, and then reset back
77111
idx_cols = idx_block.value_columns
78-
block, (l_mapping, _) = idx_block.join(block, how="left")
79-
block = block.set_index([l_mapping[col] for col in idx_cols])
80-
block = block.with_index_labels(bf_index.names)
81-
82-
if block:
83-
if name:
84-
if not isinstance(name, typing.Hashable):
85-
raise ValueError(
86-
f"BigQuery DataFrames only supports hashable series names. {constants.FEEDBACK_LINK}"
87-
)
88-
block = block.with_column_labels([name])
89-
if dtype:
90-
block = block.multi_apply_unary_op(
91-
block.value_columns, ops.AsTypeOp(to_type=dtype)
92-
)
93-
else:
94-
import bigframes.pandas
112+
data_block, (l_mapping, _) = idx_block.join(data_block, how="left")
113+
data_block = data_block.set_index([l_mapping[col] for col in idx_cols])
114+
data_block = data_block.with_index_labels(bf_index.names)
115+
block = data_block
95116

96-
pd_series = pd.Series(
97-
data=data, index=index, dtype=dtype, name=name # type:ignore
98-
)
99-
pd_dataframe = pd_series.to_frame()
100-
if pd_series.name is None:
101-
# to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename
102-
pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1)
103-
if session:
104-
block = session.read_pandas(pd_dataframe)._get_block()
117+
else: # Scalar case
118+
if index is not None:
119+
bf_index = indexes.Index(index, session=session)
105120
else:
106-
# Uses default global session
107-
block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
108-
if pd_series.name is None:
109-
block = block.with_column_labels([None])
121+
bf_index = indexes.Index(
122+
[] if (data is None) else [0],
123+
session=session,
124+
dtype=bigframes.dtypes.INT_DTYPE,
125+
)
126+
block, _ = bf_index._block.create_constant(data, dtype)
127+
dtype = None
128+
block = block.with_column_labels([name])
129+
130+
assert block is not None
131+
if name:
132+
if not isinstance(name, typing.Hashable):
133+
raise ValueError(
134+
f"BigQuery DataFrames only supports hashable series names. {constants.FEEDBACK_LINK}"
135+
)
136+
block = block.with_column_labels([name])
137+
if dtype:
138+
block = block.multi_apply_unary_op(
139+
block.value_columns, ops.AsTypeOp(to_type=dtype)
140+
)
110141
self._block: blocks.Block = block
111142

112143
@property
@@ -145,17 +176,16 @@ def _apply_binary_op(
145176
reverse: bool = False,
146177
) -> series.Series:
147178
"""Applies a binary operator to the series and other."""
148-
if isinstance(other, pd.Series):
149-
# TODO: Convert to BigQuery DataFrames series
150-
raise NotImplementedError(
151-
f"Pandas series not supported as operand. {constants.FEEDBACK_LINK}"
179+
if bigframes.core.convert.is_series_convertible(other):
180+
self_index = indexes.Index(self._block)
181+
other_series = bigframes.core.convert.to_bf_series(
182+
other, self_index, self._block.session
152183
)
153-
if isinstance(other, series.Series):
154-
(self_col, other_col, block) = self._align(other, how=alignment)
184+
(self_col, other_col, block) = self._align(other_series, how=alignment)
155185

156186
name = self._name
157187
if (
158-
isinstance(other, series.Series)
188+
hasattr(other, "name")
159189
and other.name != self._name
160190
and alignment == "outer"
161191
):
@@ -166,7 +196,7 @@ def _apply_binary_op(
166196
block, result_id = block.project_expr(expr, name)
167197
return series.Series(block.select_column(result_id))
168198

169-
else:
199+
else: # Scalar binop
170200
name = self._name
171201
expr = op.as_expr(
172202
ex.const(other) if reverse else self._value_column,

bigframes/series.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import inspect
2121
import itertools
2222
import numbers
23-
import os
2423
import textwrap
2524
import typing
2625
from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union
@@ -73,11 +72,6 @@ def __init__(self, *args, **kwargs):
7372
self._query_job: Optional[bigquery.QueryJob] = None
7473
super().__init__(*args, **kwargs)
7574

76-
# Runs strict validations to ensure internal type predictions and ibis are completely in sync
77-
# Do not execute these validations outside of testing suite.
78-
if "PYTEST_CURRENT_TEST" in os.environ:
79-
self._block.expr.validate_schema()
80-
8175
@property
8276
def dt(self) -> dt.DatetimeMethods:
8377
return dt.DatetimeMethods(self._block)
@@ -812,9 +806,6 @@ def combine_first(self, other: Series) -> Series:
812806
return result
813807

814808
def update(self, other: Union[Series, Sequence, Mapping]) -> None:
815-
import bigframes.core.convert
816-
817-
other = bigframes.core.convert.to_bf_series(other, default_index=None)
818809
result = self._apply_binary_op(
819810
other, ops.coalesce_op, reverse=True, alignment="left"
820811
)

0 commit comments

Comments
 (0)