23 | 23 | import numbers
24 | 24 | import textwrap
25 | 25 | import typing
26 |    | -from typing import Any, cast, List, Literal, Mapping, Optional, Sequence, Tuple, Union
| 26 | +from typing import (
| 27 | +    Any,
| 28 | +    cast,
| 29 | +    Iterable,
| 30 | +    List,
| 31 | +    Literal,
| 32 | +    Mapping,
| 33 | +    Optional,
| 34 | +    Sequence,
| 35 | +    Tuple,
| 36 | +    Union,
| 37 | +)
27 | 38 |
28 | 39 | import bigframes_vendored.constants as constants
29 | 40 | import bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -478,6 +489,70 @@ def to_pandas(
478 | 489 |         series.name = self._name
479 | 490 |         return series
480 | 491 |
| 492 | +    def to_pandas_batches(
| 493 | +        self,
| 494 | +        page_size: Optional[int] = None,
| 495 | +        max_results: Optional[int] = None,
| 496 | +        *,
| 497 | +        allow_large_results: Optional[bool] = None,
| 498 | +    ) -> Iterable[pandas.Series]:
| 499 | +        """Stream Series results to an iterable of pandas Series.
| 500 | +
| 501 | +        page_size and max_results determine the size and number of batches,
| 502 | +        see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result
| 503 | +
| 504 | +        **Examples:**
| 505 | +
| 506 | +            >>> import bigframes.pandas as bpd
| 507 | +            >>> bpd.options.display.progress_bar = None
| 508 | +            >>> s = bpd.Series([4, 3, 2, 2, 3])
| 509 | +
| 510 | +        Iterate through the results in batches, limiting the total rows yielded
| 511 | +        across all batches via `max_results`:
| 512 | +
| 513 | +            >>> for s_batch in s.to_pandas_batches(max_results=3):
| 514 | +            ...     print(s_batch)
| 515 | +            0    4
| 516 | +            1    3
| 517 | +            2    2
| 518 | +            dtype: Int64
| 519 | +
| 520 | +        Alternatively, control the approximate size of each batch using `page_size`
| 521 | +        and fetch batches manually using `next()`:
| 522 | +
| 523 | +            >>> it = s.to_pandas_batches(page_size=2)
| 524 | +            >>> next(it)
| 525 | +            0    4
| 526 | +            1    3
| 527 | +            dtype: Int64
| 528 | +            >>> next(it)
| 529 | +            2    2
| 530 | +            3    2
| 531 | +            dtype: Int64
| 532 | +
| 533 | +        Args:
| 534 | +            page_size (int, default None):
| 535 | +                The maximum number of rows of each batch. Non-positive values are ignored.
| 536 | +            max_results (int, default None):
| 537 | +                The maximum total number of rows of all batches.
| 538 | +            allow_large_results (bool, default None):
| 539 | +                If not None, overrides the global setting to allow or disallow large query results
| 540 | +                over the default size limit of 10 GB.
| 541 | +
| 542 | +        Returns:
| 543 | +            Iterable[pandas.Series]:
| 544 | +                An iterable of smaller Series which combine to
| 545 | +                form the original Series. Results stream from bigquery,
| 546 | +                see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable
| 547 | +        """
| 548 | +        df = self._block.to_pandas_batches(
| 549 | +            page_size=page_size,
| 550 | +            max_results=max_results,
| 551 | +            allow_large_results=allow_large_results,
| 552 | +            squeeze=True,
| 553 | +        )
| 554 | +        return df
| 555 | +
481 | 556 |     def _compute_dry_run(self) -> bigquery.QueryJob:
482 | 557 |         _, query_job = self._block._compute_dry_run((self._value_column,))
483 | 558 |         return query_job
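
As a quick, informal sketch (not part of the commit above), here is one way the new `Series.to_pandas_batches` method could be consumed to aggregate a result batch by batch, reusing the toy Series from the docstring examples; the running-total logic is purely illustrative:

```python
import bigframes.pandas as bpd

bpd.options.display.progress_bar = None

# Same toy Series used in the docstring examples of the diff above.
s = bpd.Series([4, 3, 2, 2, 3])

# Stream the Series in pandas batches of roughly two rows each and aggregate
# incrementally, so only one small pandas Series is held in client memory at a time.
running_total = 0
for batch in s.to_pandas_batches(page_size=2):
    running_total += int(batch.sum())

print(running_total)  # 4 + 3 + 2 + 2 + 3 == 14
```

Each yielded `batch` is an ordinary pandas Series, so any pandas operation (here `sum()`) can be applied per batch without materializing the full result client-side.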