Skip to content

Commit 99264a8

Browse files
authored
Merge branch 'main' into garrettwu-gemini
2 parents 4ba996a + d632cd0 commit 99264a8

File tree

10 files changed

+165
-23
lines changed

10 files changed

+165
-23
lines changed

.github/workflows/unittest.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ jobs:
3030
with:
3131
name: coverage-artifact-${{ matrix.python }}
3232
path: .coverage-${{ matrix.python }}
33+
include-hidden-files: true
3334

3435
cover:
3536
runs-on: ubuntu-latest

bigframes/bigquery/__init__.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,40 @@ def json_extract_array(
271271
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
272272

273273

274+
def struct(value: dataframe.DataFrame) -> series.Series:
    """Takes a DataFrame and converts it into a Series of structs with each
    struct entry corresponding to a DataFrame row and each struct field
    corresponding to a DataFrame column.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import bigframes.series as series
        >>> bpd.options.display.progress_bar = None

        >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},])
        >>> df = srs.struct.explode()
        >>> bbq.struct(df)
        0    {'project': 'pandas', 'version': 1}
        1     {'project': 'numpy', 'version': 2}
        dtype: struct<project: string, version: int64>[pyarrow]

    Args:
        value (bigframes.dataframe.DataFrame):
            The DataFrame to be converted to a Series of structs.

    Returns:
        bigframes.series.Series: A new Series with struct entries representing
            rows of the original DataFrame.
    """
    block = value._block
    # Pack every value column into a single STRUCT column; the struct field
    # names are the DataFrame's column labels, in column order.
    block, result_id = block.apply_nary_op(
        block.value_columns, ops.StructOp(column_names=tuple(block.column_labels))
    )
    # Keep only the struct result column, dropping the original value columns.
    block = block.select_column(result_id)
    return bigframes.series.Series(block)
274308
# Search functions defined from
275309
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
276310

bigframes/core/blocks.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,9 @@ def to_pandas_batches(
577577
see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result"""
578578
dtypes = dict(zip(self.index_columns, self.index.dtypes))
579579
dtypes.update(zip(self.value_columns, self.dtypes))
580-
_, query_job = self.session._execute(self.expr, ordered=True)
580+
_, query_job = self.session._executor.execute(
581+
self.expr, ordered=True, use_explicit_destination=True
582+
)
581583
results_iterator = query_job.result(
582584
page_size=page_size, max_results=max_results
583585
)

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1539,6 +1539,17 @@ def nary_remote_function_op_impl(
15391539
return result
15401540

15411541

1542+
@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True)
def struct_op_impl(
    *values: ibis_types.Value, op: ops.StructOp
) -> ibis_types.StructValue:
    """Compile a StructOp: pack the input expressions into one ibis struct,
    keyed by the field names carried on the op (in input order)."""
    # Index into op.column_names (rather than zip) so a name/value length
    # mismatch still raises, matching the op's output_type contract.
    fields = {op.column_names[i]: value for i, value in enumerate(values)}
    return ibis.struct(fields)
15421553
# Helpers
15431554
def is_null(value) -> bool:
15441555
# float NaN/inf should be treated as distinct from 'true' null values

bigframes/operations/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
867867
case_when_op = CaseWhenOp()
868868

869869

870+
@dataclasses.dataclass(frozen=True)
class StructOp(NaryOp):
    """N-ary operation that packs its inputs into a single STRUCT value.

    Attributes:
        column_names: Struct field names, one per input expression, in the
            same order as the inputs.
    """

    name: typing.ClassVar[str] = "struct"
    # Fix: ``tuple[str]`` annotates a 1-tuple; this field holds a
    # variable-length tuple of names, which is spelled ``Tuple[str, ...]``.
    column_names: typing.Tuple[str, ...]

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        # value1, value2, ... — exactly one field name per input is required.
        assert len(input_types) == len(self.column_names)
        # Struct fields preserve input order:
        # [(name1, type1), (name2, type2), ...]
        fields = [
            (name, dtypes.bigframes_dtype_to_arrow_dtype(input_type))
            for name, input_type in zip(self.column_names, input_types)
        ]
        return pd.ArrowDtype(pa.struct(fields))
870893
# Just parameterless unary ops for now
871894
# TODO: Parameter mappings
872895
NUMPY_TO_OP: typing.Final = {

bigframes/session/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,6 +1324,7 @@ def _execute(
13241324
*,
13251325
ordered: bool = True,
13261326
col_id_overrides: Mapping[str, str] = {},
1327+
use_explicit_destination: bool = False,
13271328
) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
13281329
return self._executor.execute(
13291330
array_value,

bigframes/session/executor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def execute(
102102
*,
103103
ordered: bool = True,
104104
col_id_overrides: Mapping[str, str] = {},
105+
use_explicit_destination: bool = False,
105106
):
106107
"""
107108
Execute the ArrayValue, storing the result to a temporary session-owned table.
@@ -113,6 +114,13 @@ def execute(
113114
array_value, ordered=ordered, col_id_overrides=col_id_overrides
114115
)
115116
job_config = bigquery.QueryJobConfig()
117+
# Use explicit destination to avoid 10GB limit of temporary table
118+
if use_explicit_destination:
119+
schema = array_value.schema.to_bigquery()
120+
destination_table = self.storage_manager.create_temp_table(
121+
schema, cluster_cols=[]
122+
)
123+
job_config.destination = destination_table
116124
# TODO(swast): plumb through the api_name of the user-facing api that
117125
# caused this query.
118126
return self._run_execute_query(

owlbot.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
"README.rst",
5050
"CONTRIBUTING.rst",
5151
".github/release-trigger.yml",
52+
".github/workflows/unittest.yml",
5253
# BigQuery DataFrames manages its own Kokoro cluster for presubmit & continuous tests.
5354
".kokoro/build.sh",
5455
".kokoro/continuous/common.cfg",
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pytest
17+
18+
import bigframes.bigquery as bbq
19+
import bigframes.series as series
20+
21+
22+
@pytest.mark.parametrize(
    "columns_arg",
    [
        # Plain scalar fields.
        [
            {"version": 1, "project": "pandas"},
            {"version": 2, "project": "pandas"},
            {"version": 1, "project": "numpy"},
        ],
        # Null scalar value inside a field.
        [
            {"version": 1, "project": "pandas"},
            {"version": None, "project": "pandas"},
            {"version": 1, "project": "numpy"},
        ],
        # Array-typed field with ragged lengths.
        # Fix: this case previously appeared twice verbatim; the duplicate
        # parametrize entry is removed.
        [
            {"array": [6, 4, 6], "project": "pandas"},
            {"array": [6, 4, 7, 6], "project": "pandas"},
            {"array": [7, 2, 3], "project": "numpy"},
        ],
        # Nested struct-typed field.
        [
            {"struct": [{"x": 2, "y": 4}], "project": "pandas"},
            {"struct": [{"x": 9, "y": 3}], "project": "pandas"},
            {"struct": [{"x": 1, "y": 2}], "project": "numpy"},
        ],
    ],
)
def test_struct_from_dataframe(columns_arg):
    """Round-trip check: exploding a struct Series into a DataFrame of its
    fields and re-packing it with ``bbq.struct`` reproduces the original data.
    """
    srs = series.Series(columns_arg)
    pd.testing.assert_series_equal(
        srs.to_pandas(),
        bbq.struct(srs.struct.explode()).to_pandas(),
        # Index/dtype representations may differ between the local pandas
        # construction and the BigQuery round trip; compare values only.
        check_index_type=False,
        check_dtype=False,
    )

third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@ def field(self, name_or_index: str | int):
2222
>>> bpd.options.display.progress_bar = None
2323
>>> s = bpd.Series(
2424
... [
25-
... {"version": 1, "project": "pandas"},
26-
... {"version": 2, "project": "pandas"},
27-
... {"version": 1, "project": "numpy"},
25+
... {"project": "pandas", "version": 1},
26+
... {"project": "pandas", "version": 2},
27+
... {"project": "numpy", "version": 1},
2828
... ],
2929
... dtype=bpd.ArrowDtype(pa.struct(
30-
... [("version", pa.int64()), ("project", pa.string())]
30+
... [("project", pa.string()), ("version", pa.int64())]
3131
... ))
3232
... )
3333
@@ -41,7 +41,7 @@ def field(self, name_or_index: str | int):
4141
4242
Extract by field index.
4343
44-
>>> s.struct.field(0)
44+
>>> s.struct.field(1)
4545
0 1
4646
1 2
4747
2 1
@@ -68,22 +68,22 @@ def explode(self):
6868
>>> bpd.options.display.progress_bar = None
6969
>>> s = bpd.Series(
7070
... [
71-
... {"version": 1, "project": "pandas"},
72-
... {"version": 2, "project": "pandas"},
73-
... {"version": 1, "project": "numpy"},
71+
... {"project": "pandas", "version": 1},
72+
... {"project": "pandas", "version": 2},
73+
... {"project": "numpy", "version": 1},
7474
... ],
7575
... dtype=bpd.ArrowDtype(pa.struct(
76-
... [("version", pa.int64()), ("project", pa.string())]
76+
... [("project", pa.string()), ("version", pa.int64())]
7777
... ))
7878
... )
7979
8080
Extract all child fields.
8181
8282
>>> s.struct.explode()
83-
version project
84-
0 1 pandas
85-
1 2 pandas
86-
2 1 numpy
83+
project version
84+
0 pandas 1
85+
1 pandas 2
86+
2 numpy 1
8787
<BLANKLINE>
8888
[3 rows x 2 columns]
8989
@@ -113,8 +113,8 @@ def dtypes(self):
113113
... ))
114114
... )
115115
>>> s.struct.dtypes()
116-
version Int64
117116
project string[pyarrow]
117+
version Int64
118118
dtype: object
119119
120120
Returns:
@@ -140,21 +140,21 @@ def explode(self, column, *, separator: str = "."):
140140
>>> countries = bpd.Series(["cn", "es", "us"])
141141
>>> files = bpd.Series(
142142
... [
143-
... {"version": 1, "project": "pandas"},
144-
... {"version": 2, "project": "pandas"},
145-
... {"version": 1, "project": "numpy"},
143+
... {"project": "pandas", "version": 1},
144+
... {"project": "pandas", "version": 2},
145+
... {"project": "numpy", "version": 1},
146146
... ],
147147
... dtype=bpd.ArrowDtype(pa.struct(
148-
... [("version", pa.int64()), ("project", pa.string())]
148+
... [("project", pa.string()), ("version", pa.int64())]
149149
... ))
150150
... )
151151
>>> downloads = bpd.Series([100, 200, 300])
152152
>>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads})
153153
>>> df.struct.explode("file")
154-
country file.version file.project download_count
155-
0 cn 1 pandas 100
156-
1 es 2 pandas 200
157-
2 us 1 numpy 300
154+
country file.project file.version download_count
155+
0 cn pandas 1 100
156+
1 es pandas 2 200
157+
2 us numpy 1 300
158158
<BLANKLINE>
159159
[3 rows x 4 columns]
160160

0 commit comments

Comments
 (0)