Skip to content

Commit 99264a8

Browse files
authored
Merge branch 'main' into garrettwu-gemini
2 parents 4ba996a + d632cd0 commit 99264a8

File tree

10 files changed

+165
-23
lines changed

10 files changed

+165
-23
lines changed

.github/workflows/unittest.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ jobs:
3030
with:
3131
name: coverage-artifact-${{ matrix.python }}
3232
path: .coverage-${{ matrix.python }}
33+
include-hidden-files: true
3334

3435
cover:
3536
runs-on: ubuntu-latest

bigframes/bigquery/__init__.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,40 @@ def json_extract_array(
271271
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
272272

273273

274+
def struct(value: dataframe.DataFrame) -> series.Series:
    """Takes a DataFrame and converts it into a Series of structs with each
    struct entry corresponding to a DataFrame row and each struct field
    corresponding to a DataFrame column.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> import bigframes.series as series
        >>> bpd.options.display.progress_bar = None

        >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},])
        >>> df = srs.struct.explode()
        >>> bbq.struct(df)
        0    {'project': 'pandas', 'version': 1}
        1     {'project': 'numpy', 'version': 2}
        dtype: struct<project: string, version: int64>[pyarrow]

    Args:
        value (bigframes.dataframe.DataFrame):
            The DataFrame to be converted to a Series of structs.

    Returns:
        bigframes.series.Series: A new Series with struct entries representing
            rows of the original DataFrame.
    """
    block = value._block
    # Pack every value column into a single STRUCT column; the struct field
    # names are the DataFrame's column labels, in column order.
    block, result_id = block.apply_nary_op(
        block.value_columns, ops.StructOp(column_names=tuple(block.column_labels))
    )
    # Keep only the struct result column, dropping the original value columns.
    block = block.select_column(result_id)
    return bigframes.series.Series(block)
274308
# Search functions defined from
275309
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions
276310

bigframes/core/blocks.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,9 @@ def to_pandas_batches(
577577
see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result"""
578578
dtypes = dict(zip(self.index_columns, self.index.dtypes))
579579
dtypes.update(zip(self.value_columns, self.dtypes))
580-
_, query_job = self.session._execute(self.expr, ordered=True)
580+
_, query_job = self.session._executor.execute(
581+
self.expr, ordered=True, use_explicit_destination=True
582+
)
581583
results_iterator = query_job.result(
582584
page_size=page_size, max_results=max_results
583585
)

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1539,6 +1539,17 @@ def nary_remote_function_op_impl(
15391539
return result
15401540

15411541

1542+
@scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True)
def struct_op_impl(
    *values: ibis_types.Value, op: ops.StructOp
) -> ibis_types.StructValue:
    """Compile a StructOp: pack the input expressions into one ibis struct,
    keyed by the field names carried on the op (in input order)."""
    # Index into op.column_names (rather than zip) so a name/value length
    # mismatch still raises, matching the op's output_type contract.
    fields = {op.column_names[i]: value for i, value in enumerate(values)}
    return ibis.struct(fields)
15421553
# Helpers
15431554
def is_null(value) -> bool:
15441555
# float NaN/inf should be treated as distinct from 'true' null values

bigframes/operations/__init__.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
867867
case_when_op = CaseWhenOp()
868868

869869

870+
@dataclasses.dataclass(frozen=True)
class StructOp(NaryOp):
    """N-ary operation that packs its inputs into a single STRUCT value.

    Attributes:
        column_names: Struct field names, one per input expression, in the
            same order as the inputs.
    """

    name: typing.ClassVar[str] = "struct"
    # Fix: ``tuple[str]`` annotates a 1-tuple; this field holds a
    # variable-length tuple of names, which is spelled ``Tuple[str, ...]``.
    column_names: typing.Tuple[str, ...]

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        # value1, value2, ... — exactly one field name per input is required.
        assert len(input_types) == len(self.column_names)
        # Struct fields preserve input order:
        # [(name1, type1), (name2, type2), ...]
        fields = [
            (name, dtypes.bigframes_dtype_to_arrow_dtype(input_type))
            for name, input_type in zip(self.column_names, input_types)
        ]
        return pd.ArrowDtype(pa.struct(fields))
870893
# Just parameterless unary ops for now
871894
# TODO: Parameter mappings
872895
NUMPY_TO_OP: typing.Final = {

bigframes/session/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,6 +1324,7 @@ def _execute(
13241324
*,
13251325
ordered: bool = True,
13261326
col_id_overrides: Mapping[str, str] = {},
1327+
use_explicit_destination: bool = False,
13271328
) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
13281329
return self._executor.execute(
13291330
array_value,

bigframes/session/executor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def execute(
102102
*,
103103
ordered: bool = True,
104104
col_id_overrides: Mapping[str, str] = {},
105+
use_explicit_destination: bool = False,
105106
):
106107
"""
107108
Execute the ArrayValue, storing the result to a temporary session-owned table.
@@ -113,6 +114,13 @@ def execute(
113114
array_value, ordered=ordered, col_id_overrides=col_id_overrides
114115
)
115116
job_config = bigquery.QueryJobConfig()
117+
# Use explicit destination to avoid 10GB limit of temporary table
118+
if use_explicit_destination:
119+
schema = array_value.schema.to_bigquery()
120+
destination_table = self.storage_manager.create_temp_table(
121+
schema, cluster_cols=[]
122+
)
123+
job_config.destination = destination_table
116124
# TODO(swast): plumb through the api_name of the user-facing api that
117125
# caused this query.
118126
return self._run_execute_query(

owlbot.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
"README.rst",
5050
"CONTRIBUTING.rst",
5151
".github/release-trigger.yml",
52+
".github/workflows/unittest.yml",
5253
# BigQuery DataFrames manages its own Kokoro cluster for presubmit & continuous tests.
5354
".kokoro/build.sh",
5455
".kokoro/continuous/common.cfg",
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pandas as pd
16+
import pytest
17+
18+
import bigframes.bigquery as bbq
19+
import bigframes.series as series
20+
21+
22+
@pytest.mark.parametrize(
    "columns_arg",
    [
        # Plain scalar fields.
        [
            {"version": 1, "project": "pandas"},
            {"version": 2, "project": "pandas"},
            {"version": 1, "project": "numpy"},
        ],
        # Null scalar value inside a field.
        [
            {"version": 1, "project": "pandas"},
            {"version": None, "project": "pandas"},
            {"version": 1, "project": "numpy"},
        ],
        # Array-typed field with ragged lengths.
        # Fix: this case previously appeared twice verbatim; the duplicate
        # parametrize entry is removed.
        [
            {"array": [6, 4, 6], "project": "pandas"},
            {"array": [6, 4, 7, 6], "project": "pandas"},
            {"array": [7, 2, 3], "project": "numpy"},
        ],
        # Nested struct-typed field.
        [
            {"struct": [{"x": 2, "y": 4}], "project": "pandas"},
            {"struct": [{"x": 9, "y": 3}], "project": "pandas"},
            {"struct": [{"x": 1, "y": 2}], "project": "numpy"},
        ],
    ],
)
def test_struct_from_dataframe(columns_arg):
    """Round-trip check: exploding a struct Series into a DataFrame of its
    fields and re-packing it with ``bbq.struct`` reproduces the original data.
    """
    srs = series.Series(columns_arg)
    pd.testing.assert_series_equal(
        srs.to_pandas(),
        bbq.struct(srs.struct.explode()).to_pandas(),
        # Index/dtype representations may differ between the local pandas
        # construction and the BigQuery round trip; compare values only.
        check_index_type=False,
        check_dtype=False,
    )

third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@ def field(self, name_or_index: str | int):
2222
>>> bpd.options.display.progress_bar = None
2323
>>> s = bpd.Series(
2424
... [
25-
... {"version": 1, "project": "pandas"},
26-
... {"version": 2, "project": "pandas"},
27-
... {"version": 1, "project": "numpy"},
25+
... {"project": "pandas", "version": 1},
26+
... {"project": "pandas", "version": 2},
27+
... {"project": "numpy", "version": 1},
2828
... ],
2929
... dtype=bpd.ArrowDtype(pa.struct(
30-
... [("version", pa.int64()), ("project", pa.string())]
30+
... [("project", pa.string()), ("version", pa.int64())]
3131
... ))
3232
... )
3333
@@ -41,7 +41,7 @@ def field(self, name_or_index: str | int):
4141
4242
Extract by field index.
4343
44-
>>> s.struct.field(0)
44+
>>> s.struct.field(1)
4545
0 1
4646
1 2
4747
2 1
@@ -68,22 +68,22 @@ def explode(self):
6868
>>> bpd.options.display.progress_bar = None
6969
>>> s = bpd.Series(
7070
... [
71-
... {"version": 1, "project": "pandas"},
72-
... {"version": 2, "project": "pandas"},
73-
... {"version": 1, "project": "numpy"},
71+
... {"project": "pandas", "version": 1},
72+
... {"project": "pandas", "version": 2},
73+
... {"project": "numpy", "version": 1},
7474
... ],
7575
... dtype=bpd.ArrowDtype(pa.struct(
76-
... [("version", pa.int64()), ("project", pa.string())]
76+
... [("project", pa.string()), ("version", pa.int64())]
7777
... ))
7878
... )
7979
8080
Extract all child fields.
8181
8282
>>> s.struct.explode()
83-
version project
84-
0 1 pandas
85-
1 2 pandas
86-
2 1 numpy
83+
project version
84+
0 pandas 1
85+
1 pandas 2
86+
2 numpy 1
8787
<BLANKLINE>
8888
[3 rows x 2 columns]
8989
@@ -113,8 +113,8 @@ def dtypes(self):
113113
... ))
114114
... )
115115
>>> s.struct.dtypes()
116-
version Int64
117116
project string[pyarrow]
117+
version Int64
118118
dtype: object
119119
120120
Returns:
@@ -140,21 +140,21 @@ def explode(self, column, *, separator: str = "."):
140140
>>> countries = bpd.Series(["cn", "es", "us"])
141141
>>> files = bpd.Series(
142142
... [
143-
... {"version": 1, "project": "pandas"},
144-
... {"version": 2, "project": "pandas"},
145-
... {"version": 1, "project": "numpy"},
143+
... {"project": "pandas", "version": 1},
144+
... {"project": "pandas", "version": 2},
145+
... {"project": "numpy", "version": 1},
146146
... ],
147147
... dtype=bpd.ArrowDtype(pa.struct(
148-
... [("version", pa.int64()), ("project", pa.string())]
148+
... [("project", pa.string()), ("version", pa.int64())]
149149
... ))
150150
... )
151151
>>> downloads = bpd.Series([100, 200, 300])
152152
>>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads})
153153
>>> df.struct.explode("file")
154-
country file.version file.project download_count
155-
0 cn 1 pandas 100
156-
1 es 2 pandas 200
157-
2 us 1 numpy 300
154+
country file.project file.version download_count
155+
0 cn pandas 1 100
156+
1 es pandas 2 200
157+
2 us numpy 1 300
158158
<BLANKLINE>
159159
[3 rows x 4 columns]
160160

0 commit comments

Comments
 (0)