Skip to content

Commit 39b7b47

Browse files
feat: add interpolate() to series and dataframe
1 parent 79a638e commit 39b7b47

File tree

7 files changed

+156
-0
lines changed

7 files changed

+156
-0
lines changed

bigframes/core/block_transforms.py

+79
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,85 @@ def indicate_duplicates(
105105
)
106106

107107

108+
def interpolate_linear(block: blocks.Block) -> blocks.Block:
    """Fill nulls in the Float64/Int64 value columns by linear interpolation.

    Rows are treated as equally spaced: the x-coordinate of every row is the
    offsets column returned by ``block.promote_offsets()``. Value columns of
    any other dtype are passed through untouched.

    Args:
        block: Block whose value columns should be interpolated.

    Returns:
        A block with one output column per original value column, selected
        in the original column order and relabelled with the original
        column labels.
    """
    # "trailing" is used to look back for the last non-null value and
    # "leading" to look ahead for the first non-null value.
    # NOTE(review): presumably both specs include the current row - confirm
    # against windows.WindowSpec.
    trailing = windows.WindowSpec(following=0)
    leading = windows.WindowSpec(preceding=0)
    interpolable_dtypes = (pd.Float64Dtype(), pd.Int64Dtype())

    source_ids = block.value_columns
    labels = block.column_labels
    block, row_offsets = block.promote_offsets()

    result_ids = []
    for source_id in source_ids:
        if block._column_type(source_id) not in interpolable_dtypes:
            # Not an interpolable dtype: keep the column as it is.
            result_ids.append(source_id)
            continue

        block, is_present = block.apply_unary_op(source_id, ops.notnull_op)
        # Row offsets that are null in the same places the source column is
        # null, so the window ops below only find offsets of known values.
        block, known_offsets = block.apply_binary_op(
            row_offsets, is_present, ops.partial_arg3(ops.where_op, None)
        )

        # Nearest known value on each side of the row, and where it sits.
        block, y_before = block.apply_window_op(
            source_id, agg_ops.LastNonNullOp(), trailing
        )
        block, y_after = block.apply_window_op(
            source_id, agg_ops.FirstNonNullOp(), leading
        )
        block, x_before = block.apply_window_op(
            known_offsets, agg_ops.LastNonNullOp(), trailing
        )
        block, x_after = block.apply_window_op(
            known_offsets, agg_ops.FirstNonNullOp(), leading
        )

        block, predicted = _interpolate(
            block,
            x0_id=x_before,
            y0_id=y_before,
            x1_id=x_after,
            y1_id=y_after,
            xpredict_id=row_offsets,
        )

        # Only rows that were null take the predicted value.
        block, filled = block.apply_binary_op(source_id, predicted, ops.fillna_op)
        # pandas linear interpolation also extrapolates forward like 'ffill'
        block, filled = block.apply_window_op(
            filled, agg_ops.LastNonNullOp(), trailing
        )

        result_ids.append(filled)

    selected = block.select_columns(result_ids)
    return selected.with_column_labels(labels)
164+
165+
166+
def _interpolate(
    block: blocks.Block,
    x0_id: str,
    y0_id: str,
    x1_id: str,
    y1_id: str,
    xpredict_id: str,
) -> typing.Tuple[blocks.Block, str]:
    """Add a column with the linear prediction of y at ``xpredict``.

    Evaluates, row by row, the line through ``(x0, y0)`` and ``(x1, y1)``::

        y0 + (xpredict - x0) * ((y1 - y0) / (x1 - x0))

    Args:
        block: Block holding all of the referenced columns.
        x0_id: Column id of the left x-coordinate.
        y0_id: Column id of the left y-value.
        x1_id: Column id of the right x-coordinate.
        y1_id: Column id of the right y-value.
        xpredict_id: Column id of the x-coordinate to predict at.

    Returns:
        The updated block, with the intermediate columns dropped, and the
        id of the column that holds the prediction.
    """
    # Differences that make up the line equation.
    block, run = block.apply_binary_op(x1_id, x0_id, ops.sub_op)
    block, rise = block.apply_binary_op(y1_id, y0_id, ops.sub_op)
    block, step = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)

    # slope = rise / run; offset from y0 = step * slope.
    block, slope = block.apply_binary_op(rise, run, ops.div_op)
    block, y_offset = block.apply_binary_op(step, slope, ops.mul_op)

    block, predicted = block.apply_binary_op(y0_id, y_offset, ops.add_op)

    # Only the prediction is kept; every scratch column is removed.
    scratch_columns = [run, rise, step, slope, y_offset]
    return block.drop_columns(scratch_columns), predicted
185+
186+
108187
def drop_duplicates(
109188
block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
110189
) -> blocks.Block:

bigframes/dataframe.py

+4
Original file line numberDiff line numberDiff line change
@@ -1434,6 +1434,10 @@ def _reindex_columns(self, columns):
14341434
def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None):
14351435
return self.reindex(index=other.index, columns=other.columns, validate=validate)
14361436

1437+
def interpolate(self) -> DataFrame:
    """Return a new DataFrame built from ``block_ops.interpolate_linear``.

    The interpolation is applied to this frame's underlying block and the
    resulting block is wrapped in a new ``DataFrame``.
    """
    return DataFrame(block_ops.interpolate_linear(self._block))
1440+
14371441
def fillna(self, value=None) -> DataFrame:
14381442
return self._apply_binop(value, ops.fillna_op, how="left")
14391443

bigframes/series.py

+4
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,10 @@ def replace(
468468
)
469469
return Series(block.select_column(result_col))
470470

471+
def interpolate(self) -> Series:
    """Return a new Series built from ``block_ops.interpolate_linear``.

    The interpolation is applied to this series' underlying block and the
    resulting block is wrapped in a new ``Series``.
    """
    return Series(block_ops.interpolate_linear(self._block))
474+
471475
def dropna(
472476
self,
473477
*,

tests/system/small/test_dataframe.py

+21
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,27 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index):
711711
pandas.testing.assert_frame_equal(bf_result, pd_result)
712712

713713

714+
def test_df_interpolate(scalars_dfs):
    """DataFrame.interpolate() matches pandas on integer and float columns."""
    scalars_df, scalars_pandas_df = scalars_dfs
    columns = ["int64_col", "int64_too", "float64_col"]
    bf_result = scalars_df[columns].interpolate().to_pandas()
    # Pandas can only interpolate on "float64" columns
    # https://github.com/pandas-dev/pandas/issues/40252
    pd_result = scalars_pandas_df[columns].astype("float64").interpolate()

    # The pandas side was cast to float64 above, so dtypes (and the index
    # type) are deliberately excluded from the comparison; only the values
    # are checked.
    pandas.testing.assert_frame_equal(
        bf_result,
        pd_result,
        check_index_type=False,
        check_dtype=False,
    )
733+
734+
714735
def test_df_fillna(scalars_dfs):
715736
scalars_df, scalars_pandas_df = scalars_dfs
716737
df = scalars_df[["int64_col", "float64_col"]].fillna(3)

tests/system/small/test_series.py

+26
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,32 @@ def test_series_replace_list_scalar(scalars_dfs):
272272
)
273273

274274

275+
@pytest.mark.parametrize(
    "values",
    [
        [None, 1, 2, None, None, 16, None],
        [None, None, 3.6, None],
        [403.2, None, 352.1, None, None, 111.9],
    ],
)
def test_series_interpolate(values):
    """Series.interpolate() matches pandas for leading, inner and trailing nulls."""
    pd_series = pd.Series(values)
    bf_series = series.Series(pd_series)

    # Pandas can only interpolate on "float64" columns
    # https://github.com/pandas-dev/pandas/issues/40252
    expected = pd_series.astype("float64").interpolate()
    actual = bf_series.interpolate().to_pandas()

    # pd uses non-null types, while bf uses nullable types, so dtype and
    # index-type differences are not part of the comparison.
    pd.testing.assert_series_equal(
        left=expected,
        right=actual,
        check_index_type=False,
        check_dtype=False,
    )
299+
300+
275301
@pytest.mark.parametrize(
276302
("ignore_index",),
277303
(

third_party/bigframes_vendored/pandas/core/frame.py

+11
Original file line numberDiff line numberDiff line change
@@ -2756,6 +2756,17 @@ def value_counts(
27562756
"""
27572757
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
27582758

2759+
def interpolate(self):
    """
    Fill NaN values using an interpolation method.

    This is an abstract stub: it documents the API and always raises.

    Returns:
        DataFrame:
            An object of the same type as the caller, with some or all
            ``NaN`` values replaced by interpolated values.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
2769+
27592770
def fillna(self, value):
27602771
"""
27612772
Fill NA/NaN values using the specified method.

third_party/bigframes_vendored/pandas/core/series.py

+11
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,17 @@ def droplevel(self, level, axis):
916916
"""
917917
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
918918

919+
def interpolate(self):
    """
    Fill NaN values using an interpolation method.

    This is an abstract stub: it documents the API and always raises.

    Returns:
        Series:
            An object of the same type as the caller, with some or all
            ``NaN`` values replaced by interpolated values.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
929+
919930
def fillna(
920931
self,
921932
value=None,

0 commit comments

Comments
 (0)