Skip to content

Commit 4efcd9c

Browse files
authored
Merge branch 'main' into ashleyxu-add-kbins-discretizer
2 parents f9bfa6e + c506200 commit 4efcd9c

File tree

11 files changed

+260
-179
lines changed

11 files changed

+260
-179
lines changed

.github/.OwlBot.lock.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,5 @@
1313
# limitations under the License.
1414
docker:
1515
image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
16-
digest: sha256:3e3800bb100af5d7f9e810d48212b37812c1856d20ffeafb99ebe66461b61fc7
17-
# created: 2023-08-02T10:53:29.114535628Z
16+
digest: sha256:fac304457974bb530cc5396abd4ab25d26a469cd3bc97cbfb18c8d4324c584eb
17+
# created: 2023-10-02T21:31:03.517640371Z

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ docs.metadata
5050

5151
# Virtual environment
5252
env/
53+
venv/
5354

5455
# Test logs
5556
coverage.xml

.kokoro/requirements.txt

+25-24
Original file line numberDiff line numberDiff line change
@@ -113,30 +113,30 @@ commonmark==0.9.1 \
113113
--hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \
114114
--hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9
115115
# via rich
116-
cryptography==41.0.3 \
117-
--hash=sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306 \
118-
--hash=sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84 \
119-
--hash=sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47 \
120-
--hash=sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d \
121-
--hash=sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116 \
122-
--hash=sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207 \
123-
--hash=sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81 \
124-
--hash=sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087 \
125-
--hash=sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd \
126-
--hash=sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507 \
127-
--hash=sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858 \
128-
--hash=sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae \
129-
--hash=sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34 \
130-
--hash=sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906 \
131-
--hash=sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd \
132-
--hash=sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922 \
133-
--hash=sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7 \
134-
--hash=sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4 \
135-
--hash=sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574 \
136-
--hash=sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1 \
137-
--hash=sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c \
138-
--hash=sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e \
139-
--hash=sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de
116+
cryptography==41.0.4 \
117+
--hash=sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67 \
118+
--hash=sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311 \
119+
--hash=sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8 \
120+
--hash=sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13 \
121+
--hash=sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143 \
122+
--hash=sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f \
123+
--hash=sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829 \
124+
--hash=sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd \
125+
--hash=sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397 \
126+
--hash=sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac \
127+
--hash=sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d \
128+
--hash=sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a \
129+
--hash=sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839 \
130+
--hash=sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e \
131+
--hash=sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6 \
132+
--hash=sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9 \
133+
--hash=sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860 \
134+
--hash=sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca \
135+
--hash=sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91 \
136+
--hash=sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d \
137+
--hash=sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714 \
138+
--hash=sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb \
139+
--hash=sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f
140140
# via
141141
# gcp-releasetool
142142
# secretstorage
@@ -382,6 +382,7 @@ protobuf==3.20.3 \
382382
# gcp-docuploader
383383
# gcp-releasetool
384384
# google-api-core
385+
# googleapis-common-protos
385386
pyasn1==0.4.8 \
386387
--hash=sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d \
387388
--hash=sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba

bigframes/core/blocks.py

+73
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import bigframes.core as core
3939
import bigframes.core.guid as guid
4040
import bigframes.core.indexes as indexes
41+
import bigframes.core.joins as joins
4142
import bigframes.core.ordering as ordering
4243
import bigframes.core.utils
4344
import bigframes.core.utils as utils
@@ -1403,6 +1404,78 @@ def concat(
14031404
result_block = result_block.reset_index()
14041405
return result_block
14051406

1407+
def merge(
    self,
    other: Block,
    how: typing.Literal[
        "inner",
        "left",
        "outer",
        "right",
    ],
    left_col_ids: typing.Sequence[str],
    right_col_ids: typing.Sequence[str],
    sort: bool,
    suffixes: tuple[str, str] = ("_x", "_y"),
) -> Block:
    """Join this block with ``other`` on pairs of key columns.

    Args:
        other: The right-hand block of the join.
        how: The type of join to perform.
        left_col_ids: Column IDs (not labels) of the join keys in this block.
        right_col_ids: Column IDs (not labels) of the join keys in ``other``,
            paired positionally with ``left_col_ids``.
        sort: Forwarded to ``joins.join_by_column``.
        suffixes: Suffixes handed to ``utils.merge_column_labels`` for labels
            that occur on both sides and are not coalesced.

    Returns:
        A new block containing the value columns of both inputs (a key pair
        whose labels match is emitted once, as the coalesced column), indexed
        by a freshly promoted offsets column.
    """
    (
        joined_expr,
        coalesced_join_cols,
        (get_column_left, get_column_right),
    ) = joins.join_by_column(
        self.expr,
        left_col_ids,
        other.expr,
        right_col_ids,
        how=how,
        sort=sort,
    )

    # A key pair is collapsed into a single output column only when both
    # sides use the same label (labels are compared by their string form).
    merge_join_key_mask = [
        str(self.col_id_to_label[left_id]) == str(other.col_id_to_label[right_id])
        for left_id, right_id in zip(left_col_ids, right_col_ids)
    ]
    labels_to_coalesce = [
        self.col_id_to_label[col_id]
        for i, col_id in enumerate(left_col_ids)
        if merge_join_key_mask[i]
    ]

    def left_col_mapping(col_id: str) -> str:
        # Coalesced keys are read from the combined key column; every other
        # left column keeps its own (re-mapped) ID.
        if col_id in left_col_ids:
            join_key_part = left_col_ids.index(col_id)
            if merge_join_key_mask[join_key_part]:
                return coalesced_join_cols[join_key_part]
        return get_column_left(col_id)

    def right_col_mapping(col_id: str) -> typing.Optional[str]:
        # Coalesced keys were already emitted from the left side, so the
        # right-hand copy is dropped (signalled by None).
        if col_id in right_col_ids:
            join_key_part = right_col_ids.index(col_id)
            if merge_join_key_mask[join_key_part]:
                return None
        return get_column_right(col_id)

    left_columns = [left_col_mapping(col_id) for col_id in self.value_columns]

    # Map each right column exactly once and drop only the explicit None
    # markers, instead of re-evaluating the mapping and filtering on truthiness.
    right_columns = [
        mapped_id
        for mapped_id in map(right_col_mapping, other.value_columns)
        if mapped_id is not None
    ]

    expr = joined_expr.select_columns([*left_columns, *right_columns])
    labels = utils.merge_column_labels(
        self.column_labels,
        other.column_labels,
        coalesce_labels=labels_to_coalesce,
        suffixes=suffixes,
    )

    # Constructs default index
    expr, offset_index_id = expr.promote_offsets()
    return Block(expr, index_columns=[offset_index_id], column_labels=labels)
1478+
14061479
def _force_reproject(self) -> Block:
14071480
"""Forces a reprojection of the underlying tables expression. Used to force predicate/order application before subsequent operations."""
14081481
return Block(

bigframes/core/indexers.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -332,8 +332,6 @@ def _iloc_getitem_series_or_dataframe(
332332
elif isinstance(key, slice):
333333
return series_or_dataframe._slice(key.start, key.stop, key.step)
334334
elif pd.api.types.is_list_like(key):
335-
# TODO(henryjsolberg): support MultiIndex
336-
337335
if len(key) == 0:
338336
return typing.cast(
339337
typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
@@ -346,15 +344,18 @@ def _iloc_getitem_series_or_dataframe(
346344
original_series_name if original_series_name is not None else "0"
347345
)
348346
df = series_or_dataframe.to_frame()
349-
original_index_name = df.index.name
350-
temporary_index_name = guid.generate_guid(prefix="temp_iloc_index_")
351-
df = df.rename_axis(temporary_index_name)
347+
original_index_names = df.index.names
348+
temporary_index_names = [
349+
guid.generate_guid(prefix="temp_iloc_index_")
350+
for _ in range(len(df.index.names))
351+
]
352+
df = df.rename_axis(temporary_index_names)
352353

353354
# set to offset index and use regular loc, then restore index
354355
df = df.reset_index(drop=False)
355356
result = df.loc[key]
356-
result = result.set_index(temporary_index_name)
357-
result = result.rename_axis(original_index_name)
357+
result = result.set_index(temporary_index_names)
358+
result = result.rename_axis(original_index_names)
358359

359360
if isinstance(series_or_dataframe, bigframes.series.Series):
360361
result = result[series_name]

bigframes/core/joins/single_column.py

+20-40
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ def join_by_column(
4444
"right",
4545
],
4646
sort: bool = False,
47-
coalesce_join_keys: bool = True,
4847
allow_row_identity_join: bool = True,
4948
) -> Tuple[
5049
core.ArrayValue,
@@ -59,8 +58,6 @@ def join_by_column(
5958
right: Expression for right table to join.
6059
right_column_ids: Column IDs (not label) to join by.
6160
how: The type of join to perform.
62-
coalesce_join_keys: if set to False, returned column ids will contain
63-
both left and right join key columns.
6461
allow_row_identity_join (bool):
6562
If True, allow matching by row identity. Set to False to always
6663
perform a true JOIN in generated SQL.
@@ -71,8 +68,6 @@ def join_by_column(
7168
* Sequence[str]: Column IDs of the coalesced join columns. Sometimes either the
7269
left/right table will have missing rows. This column pulls the
7370
non-NULL value from either left/right.
74-
If coalesce_join_keys is False, will return uncombined left and
75-
right key columns.
7671
* Tuple[Callable, Callable]: For a given column ID from left or right,
7772
respectively, return the new column id from the combined expression.
7873
"""
@@ -100,9 +95,7 @@ def join_by_column(
10095
right_join_keys = [
10196
combined_expr.get_column(get_column_right(col)) for col in right_column_ids
10297
]
103-
join_key_cols = get_join_cols(
104-
left_join_keys, right_join_keys, how, coalesce_join_keys
105-
)
98+
join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how)
10699
join_key_ids = [col.get_name() for col in join_key_cols]
107100
combined_expr = combined_expr.projection(
108101
[*join_key_cols, *combined_expr.columns]
@@ -182,9 +175,7 @@ def get_column_right(col_id):
182175
right_join_keys = [
183176
combined_table[get_column_right(col)] for col in right_column_ids
184177
]
185-
join_key_cols = get_join_cols(
186-
left_join_keys, right_join_keys, how, coalesce_join_keys
187-
)
178+
join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how)
188179
# We could filter out the original join columns, but predicates/ordering
189180
# might still reference them in implicit joins.
190181
columns = (
@@ -226,46 +217,35 @@ def get_column_right(col_id):
226217
)
227218

228219

229-
def get_join_cols(
220+
def get_coalesced_join_cols(
    left_join_cols: typing.Iterable[ibis_types.Value],
    right_join_cols: typing.Iterable[ibis_types.Value],
    how: str,
) -> typing.List[ibis_types.Value]:
    """Build one uniquely named join-key expression per (left, right) key pair.

    Args:
        left_join_cols: Join key expressions taken from the left side.
        right_join_cols: Join key expressions taken from the right side,
            paired positionally with ``left_join_cols``.
        how: The join type: "left", "inner", "right" or "outer".

    Returns:
        A list with one expression per key pair, each renamed to a freshly
        generated "index_" identifier.

    Raises:
        ValueError: If ``how`` is not a recognised join type. The check runs
            per key pair, so it is not reached when there are no key pairs.
    """
    coalesced: list[ibis_types.Value] = []
    for lhs, rhs in zip(left_join_cols, right_join_cols):
        if how in ("left", "inner"):
            key_col = lhs.name(guid.generate_guid(prefix="index_"))
        elif how == "right":
            key_col = rhs.name(guid.generate_guid(prefix="index_"))
        elif how == "outer":
            # Either side may hold NULLs after an outer join (for example
            # when the inputs have different numbers of rows), so take the
            # value from whichever side has one. A random name is used in
            # case both keys share a name, since _x and _y suffixes will
            # already be in use. When both sides are exactly the same
            # column there is nothing to coalesce.
            if lhs.name("index").equals(rhs.name("index")):
                key_col = lhs.name(guid.generate_guid(prefix="index_"))
            else:
                key_col = ibis.coalesce(
                    lhs,
                    rhs,
                ).name(guid.generate_guid(prefix="index_"))
        else:
            raise ValueError(f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}")
        coalesced.append(key_col)
    return coalesced
270250

271251

bigframes/core/utils.py

+33
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,36 @@ def get_standardized_ids(
104104
idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
105105

106106
return col_ids, idx_ids
107+
108+
109+
def merge_column_labels(
    left_labels: pd.Index,
    right_labels: pd.Index,
    coalesce_labels: typing.Sequence,
    suffixes: tuple[str, str] = ("_x", "_y"),
) -> pd.Index:
    """Compute the column labels of a merge result.

    Args:
        left_labels: Column labels of the left input.
        right_labels: Column labels of the right input.
        coalesce_labels: Labels of join keys that are merged into a single
            output column.
        suffixes: Suffixes appended (after converting the label to ``str``)
            to left and right labels, respectively, that occur on both sides
            and are not coalesced.

    Returns:
        The left labels followed by the right labels. Labels unique to one
        side are kept as-is, coalesced labels appear once (in the left
        position), and remaining duplicates get the matching suffix.
    """
    merged = []

    # Left side: a label is kept untouched unless it clashes with a right
    # label that is not being coalesced.
    for label in left_labels:
        if label not in right_labels or label in coalesce_labels:
            merged.append(label)
        else:
            merged.append(str(label) + suffixes[0])

    # Right side: coalesced keys were already emitted from the left side,
    # so they are skipped here.
    for label in right_labels:
        if label not in left_labels:
            merged.append(label)
        elif label not in coalesce_labels:
            merged.append(str(label) + suffixes[1])

    return pd.Index(merged)

0 commit comments

Comments
 (0)