Commit 570cff3

fix: ensure page_size works correctly in to_pandas_batches when max_results is not set (#1588)
* fix: page_size without max_results does not work with to_pandas_batches
* add series.to_pandas code example
* fix test_to_pandas_batches_override_global_option: with the workaround, one warning message is no longer emitted
1 parent 9711b83 commit 570cff3
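For context, a minimal sketch of the behavior this commit fixes (an assumption-laden illustration, not part of the diff: it assumes a bigframes session with a configured BigQuery project, and the printed batch sizes are illustrative). Before the fix, `page_size` was effectively ignored by `to_pandas_batches` unless `max_results` was also set; with the fix, batches respect `page_size` on its own:

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None
    df = bpd.DataFrame({"col": [4, 3, 2, 2, 3]})

    # With the fix, page_size alone is honored: roughly 2-row batches are yielded.
    for batch in df.to_pandas_batches(page_size=2):
        print(len(batch))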

File tree

4 files changed, +105 -7 lines changed


bigframes/dataframe.py

+64-5
@@ -1634,6 +1634,39 @@ def to_pandas(
     ) -> pandas.DataFrame | pandas.Series:
         """Write DataFrame to pandas DataFrame.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col': [4, 2, 2]})
+
+        Download the data from BigQuery and convert it into an in-memory pandas DataFrame.
+
+            >>> df.to_pandas()
+               col
+            0    4
+            1    2
+            2    2
+
+        Estimate job statistics without processing or downloading data by using `dry_run=True`.
+
+            >>> df.to_pandas(dry_run=True) # doctest: +SKIP
+            columnCount                                                 1
+            columnDtypes                                   {'col': Int64}
+            indexLevel                                                  1
+            indexDtypes                                           [Int64]
+            projectId                                       bigframes-dev
+            location                                                   US
+            jobType                                                 QUERY
+            destinationTable    {'projectId': 'bigframes-dev', 'datasetId': '_...
+            useLegacySql                                            False
+            referencedTables                                         None
+            totalBytesProcessed                                         0
+            cacheHit                                                False
+            statementType                                          SELECT
+            creationTime                 2025-04-02 20:17:12.038000+00:00
+            dtype: object
+
         Args:
             max_download_size (int, default None):
                 Download size threshold in MB. If max_download_size is exceeded when downloading data
@@ -1666,9 +1699,6 @@ def to_pandas(
             downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
             Series containing dry run statistics will be returned.
         """
-
-        # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job
-
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
                 max_download_size=max_download_size,
@@ -1702,11 +1732,40 @@ def to_pandas_batches(
         page_size and max_results determine the size and number of batches,
         see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]})
+
+        Iterate through the results in batches, limiting the total rows yielded
+        across all batches via `max_results`:
+
+            >>> for df_batch in df.to_pandas_batches(max_results=3):
+            ...     print(df_batch)
+               col
+            0    4
+            1    3
+            2    2
+
+        Alternatively, control the approximate size of each batch using `page_size`
+        and fetch batches manually using `next()`:
+
+            >>> it = df.to_pandas_batches(page_size=2)
+            >>> next(it)
+               col
+            0    4
+            1    3
+            >>> next(it)
+               col
+            2    2
+            3    2
+
         Args:
             page_size (int, default None):
-                The size of each batch.
+                The maximum number of rows of each batch. Non-positive values are ignored.
             max_results (int, default None):
-                If given, only download this many rows at maximum.
+                The maximum total number of rows of all batches.
             allow_large_results (bool, default None):
                 If not None, overrides the global setting to allow or disallow large query results
                 over the default size limit of 10 GB.
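Not part of the diff, but a short hedged sketch of how the two updated parameters interact (assuming a working bigframes session; batch sizes shown are illustrative): `page_size` caps the rows in each batch, while `max_results` caps the total rows across all batches.

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None
    df = bpd.DataFrame({"col": [4, 3, 2, 2, 3]})

    # Expect roughly a batch of 2 rows, then a final batch of 1 row:
    # max_results=3 stops iteration after three rows in total.
    for batch in df.to_pandas_batches(page_size=2, max_results=3):
        print(len(batch))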

bigframes/series.py

+33
@@ -385,6 +385,39 @@ def to_pandas(
     ) -> pandas.Series:
         """Writes Series to pandas Series.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> s = bpd.Series([4, 3, 2])
+
+        Download the data from BigQuery and convert it into an in-memory pandas Series.
+
+            >>> s.to_pandas()
+            0    4
+            1    3
+            2    2
+            dtype: Int64
+
+        Estimate job statistics without processing or downloading data by using `dry_run=True`.
+
+            >>> s.to_pandas(dry_run=True) # doctest: +SKIP
+            columnCount                                                 1
+            columnDtypes                                    {None: Int64}
+            indexLevel                                                  1
+            indexDtypes                                           [Int64]
+            projectId                                       bigframes-dev
+            location                                                   US
+            jobType                                                 QUERY
+            destinationTable    {'projectId': 'bigframes-dev', 'datasetId': '_...
+            useLegacySql                                            False
+            referencedTables                                         None
+            totalBytesProcessed                                         0
+            cacheHit                                                False
+            statementType                                          SELECT
+            creationTime                 2025-04-03 18:54:59.219000+00:00
+            dtype: object
+
         Args:
             max_download_size (int, default None):
                 Download size threshold in MB. If max_download_size is exceeded when downloading data

bigframes/session/executor.py

+7-1
@@ -264,7 +264,13 @@ def execute(
 
         # Though we provide the read client, iterator may or may not use it based on what is efficient for the result
         def iterator_supplier():
-            return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient)
+            # Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
+            if iterator._page_size is not None or iterator.max_results is not None:
+                return iterator.to_arrow_iterable(bqstorage_client=None)
+            else:
+                return iterator.to_arrow_iterable(
+                    bqstorage_client=self.bqstoragereadclient
+                )
 
         if query_job:
            size_bytes = self.bqclient.get_table(query_job.destination).num_bytes
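A brief, hedged illustration of why the workaround helps, based on the upstream issue the comment references rather than on this repository's code: when a BigQuery Storage read client is supplied, `RowIterator.to_arrow_iterable` streams results through the Storage Read API, a path that did not honor the iterator's `page_size`/`max_results` before the upstream fix; passing `bqstorage_client=None` falls back to the REST iterator, which pages results as requested. The query text and client setup below are assumptions for illustration only.

    from google.cloud import bigquery

    client = bigquery.Client()  # assumes default credentials and project
    row_iterator = client.query(
        "SELECT x FROM UNNEST([1, 2, 3, 4, 5]) AS x"
    ).result(page_size=2)

    # REST-based path: Arrow record batches roughly follow the requested page size.
    for record_batch in row_iterator.to_arrow_iterable(bqstorage_client=None):
        print(record_batch.num_rows)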

tests/system/large/test_dataframe_io.py

+1-1
@@ -44,7 +44,7 @@ def test_to_pandas_batches_override_global_option(
                 page_size=500, max_results=1500, allow_large_results=True
             )
         )
-        assert len(w) == 2
+        assert len(w) == 1
         assert issubclass(w[0].category, FutureWarning)
         assert "The query result size has exceeded 10 GB." in str(w[0].message)
