Commit 570cff3

fix: ensure page_size works correctly in to_pandas_batches when max_results is not set (#1588)
* fix: page_size without max_results does not work with to_pandas_batches
* add series.to_pandas code example
* fix test_to_pandas_batches_override_global_option: with the workaround, one warning message is no longer emitted
1 parent 9711b83 commit 570cff3
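For context, a minimal sketch of the behavior this commit fixes (an assumption-laden illustration, not part of the diff: it assumes a bigframes session with a configured BigQuery project, and the printed batch sizes are illustrative). Before the fix, `page_size` was effectively ignored by `to_pandas_batches` unless `max_results` was also set; with the fix, batches respect `page_size` on its own:

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None
    df = bpd.DataFrame({"col": [4, 3, 2, 2, 3]})

    # With the fix, page_size alone is honored: roughly 2-row batches are yielded.
    for batch in df.to_pandas_batches(page_size=2):
        print(len(batch))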

File tree

4 files changed, +105 -7 lines changed


bigframes/dataframe.py

+64-5
@@ -1634,6 +1634,39 @@ def to_pandas(
     ) -> pandas.DataFrame | pandas.Series:
         """Write DataFrame to pandas DataFrame.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col': [4, 2, 2]})
+
+        Download the data from BigQuery and convert it into an in-memory pandas DataFrame.
+
+            >>> df.to_pandas()
+               col
+            0    4
+            1    2
+            2    2
+
+        Estimate job statistics without processing or downloading data by using `dry_run=True`.
+
+            >>> df.to_pandas(dry_run=True) # doctest: +SKIP
+            columnCount                                                 1
+            columnDtypes                                   {'col': Int64}
+            indexLevel                                                  1
+            indexDtypes                                           [Int64]
+            projectId                                       bigframes-dev
+            location                                                   US
+            jobType                                                 QUERY
+            destinationTable    {'projectId': 'bigframes-dev', 'datasetId': '_...
+            useLegacySql                                            False
+            referencedTables                                         None
+            totalBytesProcessed                                         0
+            cacheHit                                                False
+            statementType                                          SELECT
+            creationTime                 2025-04-02 20:17:12.038000+00:00
+            dtype: object
+
         Args:
             max_download_size (int, default None):
                 Download size threshold in MB. If max_download_size is exceeded when downloading data
@@ -1666,9 +1699,6 @@ def to_pandas(
             downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
             Series containing dry run statistics will be returned.
         """
-
-        # TODO(orrbradford): Optimize this in future. Potentially some cases where we can return the stored query job
-
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
                 max_download_size=max_download_size,
@@ -1702,11 +1732,40 @@ def to_pandas_batches(
         page_size and max_results determine the size and number of batches,
         see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> df = bpd.DataFrame({'col': [4, 3, 2, 2, 3]})
+
+        Iterate through the results in batches, limiting the total rows yielded
+        across all batches via `max_results`:
+
+            >>> for df_batch in df.to_pandas_batches(max_results=3):
+            ...     print(df_batch)
+               col
+            0    4
+            1    3
+            2    2
+
+        Alternatively, control the approximate size of each batch using `page_size`
+        and fetch batches manually using `next()`:
+
+            >>> it = df.to_pandas_batches(page_size=2)
+            >>> next(it)
+               col
+            0    4
+            1    3
+            >>> next(it)
+               col
+            2    2
+            3    2
+
         Args:
             page_size (int, default None):
-                The size of each batch.
+                The maximum number of rows of each batch. Non-positive values are ignored.
             max_results (int, default None):
-                If given, only download this many rows at maximum.
+                The maximum total number of rows of all batches.
             allow_large_results (bool, default None):
                 If not None, overrides the global setting to allow or disallow large query results
                 over the default size limit of 10 GB.
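Not part of the diff, but a short hedged sketch of how the two updated parameters interact (assuming a working bigframes session; batch sizes shown are illustrative): `page_size` caps the rows in each batch, while `max_results` caps the total rows across all batches.

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None
    df = bpd.DataFrame({"col": [4, 3, 2, 2, 3]})

    # Expect roughly a batch of 2 rows, then a final batch of 1 row:
    # max_results=3 stops iteration after three rows in total.
    for batch in df.to_pandas_batches(page_size=2, max_results=3):
        print(len(batch))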

bigframes/series.py

+33
@@ -385,6 +385,39 @@ def to_pandas(
     ) -> pandas.Series:
         """Writes Series to pandas Series.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> s = bpd.Series([4, 3, 2])
+
+        Download the data from BigQuery and convert it into an in-memory pandas Series.
+
+            >>> s.to_pandas()
+            0    4
+            1    3
+            2    2
+            dtype: Int64
+
+        Estimate job statistics without processing or downloading data by using `dry_run=True`.
+
+            >>> s.to_pandas(dry_run=True) # doctest: +SKIP
+            columnCount                                                 1
+            columnDtypes                                    {None: Int64}
+            indexLevel                                                  1
+            indexDtypes                                           [Int64]
+            projectId                                       bigframes-dev
+            location                                                   US
+            jobType                                                 QUERY
+            destinationTable    {'projectId': 'bigframes-dev', 'datasetId': '_...
+            useLegacySql                                            False
+            referencedTables                                         None
+            totalBytesProcessed                                         0
+            cacheHit                                                False
+            statementType                                          SELECT
+            creationTime                 2025-04-03 18:54:59.219000+00:00
+            dtype: object
+
         Args:
             max_download_size (int, default None):
                 Download size threshold in MB. If max_download_size is exceeded when downloading data

bigframes/session/executor.py

+7-1
@@ -264,7 +264,13 @@ def execute(
 
         # Though we provide the read client, iterator may or may not use it based on what is efficient for the result
         def iterator_supplier():
-            return iterator.to_arrow_iterable(bqstorage_client=self.bqstoragereadclient)
+            # Workaround issue fixed by: https://github.com/googleapis/python-bigquery/pull/2154
+            if iterator._page_size is not None or iterator.max_results is not None:
+                return iterator.to_arrow_iterable(bqstorage_client=None)
+            else:
+                return iterator.to_arrow_iterable(
+                    bqstorage_client=self.bqstoragereadclient
+                )
 
         if query_job:
            size_bytes = self.bqclient.get_table(query_job.destination).num_bytes
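A brief, hedged illustration of why the workaround helps, based on the upstream issue the comment references rather than on this repository's code: when a BigQuery Storage read client is supplied, `RowIterator.to_arrow_iterable` streams results through the Storage Read API, a path that did not honor the iterator's `page_size`/`max_results` before the upstream fix; passing `bqstorage_client=None` falls back to the REST iterator, which pages results as requested. The query text and client setup below are assumptions for illustration only.

    from google.cloud import bigquery

    client = bigquery.Client()  # assumes default credentials and project
    row_iterator = client.query(
        "SELECT x FROM UNNEST([1, 2, 3, 4, 5]) AS x"
    ).result(page_size=2)

    # REST-based path: Arrow record batches roughly follow the requested page size.
    for record_batch in row_iterator.to_arrow_iterable(bqstorage_client=None):
        print(record_batch.num_rows)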

tests/system/large/test_dataframe_io.py

+1-1
@@ -44,7 +44,7 @@ def test_to_pandas_batches_override_global_option(
                 page_size=500, max_results=1500, allow_large_results=True
             )
         )
-        assert len(w) == 2
+        assert len(w) == 1
         assert issubclass(w[0].category, FutureWarning)
         assert "The query result size has exceeded 10 GB." in str(w[0].message)
