Skip to content

Commit 12ad9ed

Browse files
feat: add interpolate() to series and dataframe
1 parent 79a638e commit 12ad9ed

File tree

7 files changed

+210
-0
lines changed

7 files changed

+210
-0
lines changed

bigframes/core/block_transforms.py

+91
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,97 @@ def indicate_duplicates(
105105
)
106106

107107

108+
def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
    """Fill nulls in the numeric value columns of ``block`` by linear interpolation.

    Row offsets serve as the x-axis, so rows are treated as equally spaced.
    Columns whose type is neither ``Float64`` nor ``Int64`` are passed through
    untouched. Nulls that the linear prediction cannot fill are then filled
    with the previous non-null value (ffill-like, to match pandas).

    Args:
        block: Block whose value columns should be interpolated.
        method: Interpolation technique. Only ``"linear"`` is accepted.

    Returns:
        A block holding one output column per original value column, carrying
        the original column labels.

    Raises:
        NotImplementedError: If ``method`` is anything other than ``"linear"``.
    """
    if method != "linear":
        raise NotImplementedError(
            f"Only 'linear' interpolate method supported. {constants.FEEDBACK_LINK}"
        )

    # NOTE(review): presumably `following=0` spans the rows up to and including
    # the current one, and `preceding=0` the current row onwards -- confirm
    # against windows.WindowSpec.
    lookback = windows.WindowSpec(following=0)
    lookahead = windows.WindowSpec(preceding=0)
    interpolable_dtypes = (pd.Float64Dtype(), pd.Int64Dtype())

    result_ids = []

    source_ids = block.value_columns
    source_labels = block.column_labels
    block, offsets = block.promote_offsets()
    for column_id in source_ids:
        if block._column_type(column_id) not in interpolable_dtypes:
            # Non-numeric columns are emitted as-is.
            result_ids.append(column_id)
            continue

        # Offsets that are null in the same places the column is null.
        block, has_value = block.apply_unary_op(column_id, ops.notnull_op)
        block, known_offsets = block.apply_binary_op(
            offsets, has_value, ops.partial_arg3(ops.where_op, None)
        )

        # Nearest known values on either side of each row ...
        block, value_before = block.apply_window_op(
            column_id, agg_ops.LastNonNullOp(), lookback
        )
        block, value_after = block.apply_window_op(
            column_id, agg_ops.FirstNonNullOp(), lookahead
        )
        # ... and the offsets at which those known values sit.
        block, offset_before = block.apply_window_op(
            known_offsets,
            agg_ops.LastNonNullOp(),
            lookback,
            skip_reproject_unsafe=True,
        )
        block, offset_after = block.apply_window_op(
            known_offsets,
            agg_ops.FirstNonNullOp(),
            lookahead,
            skip_reproject_unsafe=True,
        )

        block, predicted = _interpolate(
            block,
            offset_before,
            value_before,
            offset_after,
            value_after,
            offsets,
        )

        # Keep original values where present; use the prediction elsewhere.
        block, filled = block.apply_binary_op(column_id, predicted, ops.fillna_op)
        # Pandas performs ffill-like behavior to extrapolate forwards.
        block, filled_and_extended = block.apply_binary_op(
            filled, value_before, ops.fillna_op
        )

        result_ids.append(filled_and_extended)

    # Force a reproject because `skip_reproject_unsafe` was used above.
    block = block.select_columns(result_ids)._force_reproject()
    return block.with_column_labels(source_labels)
176+
177+
178+
def _interpolate(
    block: blocks.Block,
    x0_id: str,
    y0_id: str,
    x1_id: str,
    y1_id: str,
    xpredict_id: str,
) -> typing.Tuple[blocks.Block, str]:
    """Evaluate the line through (x0, y0) and (x1, y1) at ``xpredict``.

    Builds the column ``y0 + (xpredict - x0) * ((y1 - y0) / (x1 - x0))`` out of
    binary ops and removes the intermediate columns afterwards.

    Args:
        block: Block containing every referenced column.
        x0_id: Column id of the left anchor's x value.
        y0_id: Column id of the left anchor's y value.
        x1_id: Column id of the right anchor's x value.
        y1_id: Column id of the right anchor's y value.
        xpredict_id: Column id of the x values to predict at.

    Returns:
        The updated block and the id of the column holding the predictions.
    """
    block, run = block.apply_binary_op(x1_id, x0_id, ops.sub_op)
    block, rise = block.apply_binary_op(y1_id, y0_id, ops.sub_op)
    block, distance = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)

    # slope = rise / run; increment = distance travelled along x times slope
    block, slope = block.apply_binary_op(rise, run, ops.div_op)
    block, increment = block.apply_binary_op(distance, slope, ops.mul_op)

    block, prediction_id = block.apply_binary_op(y0_id, increment, ops.add_op)

    # Only the prediction column is needed by the caller.
    scratch_ids = [run, rise, distance, slope, increment]
    block = block.drop_columns(scratch_ids)
    return block, prediction_id
197+
198+
108199
def drop_duplicates(
109200
block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
110201
) -> blocks.Block:

bigframes/dataframe.py

+4
Original file line numberDiff line numberDiff line change
@@ -1434,6 +1434,10 @@ def _reindex_columns(self, columns):
14341434
def reindex_like(self, other: DataFrame, *, validate: typing.Optional[bool] = None):
    """Reindex this object using the index and columns of ``other``.

    Args:
        other: Object supplying the target ``index`` and ``columns``.
        validate: Forwarded unchanged to ``reindex``.

    Returns:
        Whatever ``self.reindex`` returns for those targets.
    """
    target_index = other.index
    target_columns = other.columns
    return self.reindex(
        index=target_index, columns=target_columns, validate=validate
    )
14361436

1437+
def interpolate(self, method: str = "linear") -> DataFrame:
    """Return a new DataFrame with nulls filled by interpolation.

    Delegates to ``block_ops.interpolate`` on the underlying block and wraps
    the resulting block in a new ``DataFrame``.

    Args:
        method: Interpolation technique, passed through to
            ``block_ops.interpolate``.

    Returns:
        DataFrame built from the interpolated block.
    """
    return DataFrame(block_ops.interpolate(self._block, method))
1440+
14371441
def fillna(self, value=None) -> DataFrame:
    """Fill missing values using ``value``.

    Applies ``ops.fillna_op`` between this frame and ``value`` through
    ``_apply_binop`` with ``how="left"``.
    NOTE(review): ``how="left"`` presumably keeps this frame's rows when
    ``value`` is an alignable object -- confirm against ``_apply_binop``.

    Args:
        value: The fill value handed to the binary op.

    Returns:
        The result of the binary op.
    """
    filled = self._apply_binop(value, ops.fillna_op, how="left")
    return filled
14391443

bigframes/series.py

+4
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,10 @@ def replace(
468468
)
469469
return Series(block.select_column(result_col))
470470

471+
def interpolate(self, method: str = "linear") -> Series:
    """Return a new Series with nulls filled by interpolation.

    Delegates to ``block_ops.interpolate`` on the underlying block and wraps
    the resulting block in a new ``Series``.

    Args:
        method: Interpolation technique, passed through to
            ``block_ops.interpolate``.

    Returns:
        Series built from the interpolated block.
    """
    return Series(block_ops.interpolate(self._block, method))
474+
471475
def dropna(
472476
self,
473477
*,

tests/system/small/test_dataframe.py

+16
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,22 @@ def test_df_dropna(scalars_dfs, axis, how, ignore_index):
711711
pandas.testing.assert_frame_equal(bf_result, pd_result)
712712

713713

714+
def test_df_interpolate(scalars_dfs):
    """DataFrame.interpolate() on numeric columns matches pandas on float64 data."""
    bf_df, pd_df = scalars_dfs
    numeric_columns = ["int64_col", "int64_too", "float64_col"]

    actual = bf_df[numeric_columns].interpolate().to_pandas()
    # Pandas can only interpolate on "float64" columns, so cast the expected
    # side first: https://github.com/pandas-dev/pandas/issues/40252
    expected = pd_df[numeric_columns].astype("float64").interpolate()

    # Index type and dtypes are deliberately excluded from the comparison.
    pandas.testing.assert_frame_equal(
        actual, expected, check_index_type=False, check_dtype=False
    )
728+
729+
714730
def test_df_fillna(scalars_dfs):
715731
scalars_df, scalars_pandas_df = scalars_dfs
716732
df = scalars_df[["int64_col", "float64_col"]].fillna(3)

tests/system/small/test_series.py

+26
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,32 @@ def test_series_replace_list_scalar(scalars_dfs):
272272
)
273273

274274

275+
@pytest.mark.parametrize(
    ("values",),
    (
        ([None, 1, 2, None, None, 16, None],),
        ([None, None, 3.6, None],),
        ([403.2, None, 352.1, None, None, 111.9],),
    ),
)
def test_series_interpolate(values):
    """Series.interpolate() matches pandas for leading, interior and trailing nulls."""
    local_series = pd.Series(values)
    remote_series = series.Series(local_series)

    # Pandas can only interpolate on "float64" columns, so cast the expected
    # side first: https://github.com/pandas-dev/pandas/issues/40252
    expected = local_series.astype("float64").interpolate()
    actual = remote_series.interpolate().to_pandas()

    # pd uses non-null types, while bf uses nullable types, hence no dtype check
    pd.testing.assert_series_equal(
        expected, actual, check_index_type=False, check_dtype=False
    )
299+
300+
275301
@pytest.mark.parametrize(
276302
("ignore_index",),
277303
(

third_party/bigframes_vendored/pandas/core/frame.py

+37
Original file line numberDiff line numberDiff line change
@@ -2756,6 +2756,43 @@ def value_counts(
27562756
"""
27572757
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
27582758

2759+
def interpolate(self, method: str = "linear"):
    """
    Fill NaN values using an interpolation method.

    Args:
        method (str, default 'linear'):
            Interpolation technique to use. Only 'linear' supported.
            'linear': Ignore the index and treat the values as equally spaced.
            This is the only method supported on MultiIndexes.

    Returns:
        DataFrame:
            Returns the same object type as the caller, interpolated at
            some or all ``NaN`` values.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> df = bpd.DataFrame({
        ...     'A': [1, 2, 3, None, None, 6],
        ...     'B': [None, 6, None, 2, None, 3],
        ...     })
        >>> df.interpolate()
             A     B
        0  1.0  <NA>
        1  2.0   6.0
        2  3.0   4.0
        3  4.0   2.0
        4  5.0   2.5
        5  6.0   3.0
        <BLANKLINE>
        [6 rows x 2 columns]
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
2795+
27592796
def fillna(self, value):
27602797
"""
27612798
Fill NA/NaN values using the specified method.

third_party/bigframes_vendored/pandas/core/series.py

+32
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,38 @@ def droplevel(self, level, axis):
916916
"""
917917
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
918918

919+
def interpolate(self, method: str = "linear"):
    """
    Fill NaN values using an interpolation method.

    Args:
        method (str, default 'linear'):
            Interpolation technique to use. Only 'linear' supported.
            'linear': Ignore the index and treat the values as equally spaced.
            This is the only method supported on MultiIndexes.

    Returns:
        Series:
            Returns the same object type as the caller, interpolated at
            some or all ``NaN`` values.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> series = bpd.Series([1, 2, 3, None, None, 6])
        >>> series.interpolate()
        0    1.0
        1    2.0
        2    3.0
        3    4.0
        4    5.0
        5    6.0
        dtype: Float64
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
950+
919951
def fillna(
920952
self,
921953
value=None,

0 commit comments

Comments
 (0)