
Commit 27c5905

fix: to_pandas_batches() respects page_size and max_results again (#1572)

* fix: `to_pandas_batches()` respects `page_size` and `max_results` again
* fix lint
* help with session close flakiness

1 parent d1e9ec2 · commit 27c5905
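The user-facing behavior this commit restores, as a minimal usage sketch (the table name is a placeholder, not from this commit):

import bigframes.pandas as bpd

df = bpd.read_gbq("my_project.my_dataset.my_table")  # placeholder table

# With the fix, each yielded DataFrame holds at most `page_size` rows and
# iteration stops once roughly `max_results` rows have been fetched in total.
for batch in df.to_pandas_batches(page_size=500, max_results=1500):
    print(batch.shape)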

File tree

6 files changed: +44 -8 lines changed

* .pre-commit-config.yaml
* bigframes/session/_io/bigquery/__init__.py
* noxfile.py
* tests/system/load/test_large_tables.py
* tests/system/small/test_bq_sessions.py
* tests/system/small/test_dataframe_io.py

.pre-commit-config.yaml (+2 -2)

@@ -31,11 +31,11 @@ repos:
     hooks:
       - id: black
   - repo: https://github.com/pycqa/flake8
-    rev: 6.1.0
+    rev: 7.1.2
     hooks:
       - id: flake8
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
+    rev: v1.15.0
     hooks:
       - id: mypy
         additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126]

bigframes/session/_io/bigquery/__init__.py (+2)

@@ -245,6 +245,8 @@ def start_query_with_client(
         location=location,
         project=project,
         api_timeout=timeout,
+        page_size=page_size,
+        max_results=max_results,
     )
     if metrics is not None:
         metrics.count_job_stats(query=sql)
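The two forwarded kwargs land on google-cloud-bigquery's QueryJob.result(), which accepts both. A simplified sketch of the forwarding pattern (not the real bigframes helper signature):

from google.cloud import bigquery

def run_query(client: bigquery.Client, sql: str, page_size=None, max_results=None):
    job = client.query(sql)
    # QueryJob.result() accepts page_size and max_results directly; the bug
    # fixed here was that bigframes dropped them instead of passing them on.
    return job.result(page_size=page_size, max_results=max_results)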

noxfile.py (+4 -2)

@@ -29,7 +29,9 @@
 import nox.sessions
 
 BLACK_VERSION = "black==22.3.0"
+FLAKE8_VERSION = "flake8==7.1.2"
 ISORT_VERSION = "isort==5.12.0"
+MYPY_VERSION = "mypy==1.15.0"
 
 # TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751)
 LATEST_FULLY_SUPPORTED_PYTHON = "3.12"

@@ -135,7 +137,7 @@ def lint(session):
     Returns a failure if the linters find linting errors or sufficiently
     serious code quality issues.
     """
-    session.install("flake8", BLACK_VERSION, ISORT_VERSION)
+    session.install(FLAKE8_VERSION, BLACK_VERSION, ISORT_VERSION)
     session.run(
         "isort",
         "--check",

@@ -264,7 +266,7 @@ def mypy(session):
     deps = (
         set(
             [
-                "mypy",
+                MYPY_VERSION,
                 # TODO: update to latest pandas-stubs once we resolve bigframes issues.
                 "pandas-stubs<=2.2.3.241126",
                 "types-protobuf",

tests/system/load/test_large_tables.py (+5 -3)

@@ -75,17 +75,19 @@ def test_index_repr_large_table():
 
 
 def test_to_pandas_batches_large_table():
-    df = bpd.read_gbq("load_testing.scalars_1tb")
+    df = bpd.read_gbq("load_testing.scalars_100gb")
     _, expected_column_count = df.shape
 
     # download only a few batches, since 1tb would be too much
-    iterable = df.to_pandas_batches(page_size=500, max_results=1500)
+    iterable = df.to_pandas_batches(
+        page_size=500, max_results=1500, allow_large_results=True
+    )
     # use page size since client library doesn't support
     # streaming only part of the dataframe via bqstorage
     for pdf in iterable:
         batch_row_count, batch_column_count = pdf.shape
         assert batch_column_count == expected_column_count
-        assert batch_row_count > 0
+        assert 0 < batch_row_count <= 500
 
 
 @pytest.mark.skip(reason="See if it caused kokoro build aborted.")
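The tightened assertion reflects the paging arithmetic: with page_size=500 and max_results=1500, the iterator should yield at most ceil(1500 / 500) = 3 batches of no more than 500 rows each. A quick sanity check of that arithmetic:

page_size, max_results = 500, 1500

# Ceiling division: the number of pages needed to cover max_results rows.
expected_batches = -(-max_results // page_size)
assert expected_batches == 3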

tests/system/small/test_bq_sessions.py (+6 -1)

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from concurrent.futures import ThreadPoolExecutor
+import time
 
 import google
 import google.api_core.exceptions

@@ -58,7 +59,11 @@ def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client
 
     session_resource_manager.close()
     with pytest.raises(google.api_core.exceptions.NotFound):
-        bigquery_client.get_table(session_table_ref)
+        # It may take time for the underlying tables to get cleaned up after
+        # closing the session, so wait at least 1 minute to check.
+        for _ in range(6):
+            bigquery_client.get_table(session_table_ref)
+            time.sleep(10)
 
 
 def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client):
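The retry loop leans on pytest.raises semantics: the block passes as soon as any get_table() probe raises NotFound, and fails if all six probes succeed. The same poll-until-gone idea as a standalone helper (hypothetical, not part of the test suite):

import time

import google.api_core.exceptions

def wait_until_deleted(client, table_ref, attempts=6, delay_seconds=10):
    """Return once the table 404s; raise TimeoutError if it never disappears."""
    for _ in range(attempts):
        try:
            client.get_table(table_ref)  # still visible, keep polling
        except google.api_core.exceptions.NotFound:
            return
        time.sleep(delay_seconds)
    raise TimeoutError(f"{table_ref} still exists after {attempts * delay_seconds}s")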

tests/system/small/test_dataframe_io.py (+25)

@@ -35,6 +35,7 @@
 
 import bigframes
 import bigframes.dataframe
+import bigframes.enums
 import bigframes.features
 import bigframes.pandas as bpd
 

@@ -288,6 +289,30 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
         pd.testing.assert_series_equal(actual, expected)
 
 
+@pytest.mark.parametrize("allow_large_results", (True, False))
+def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
+    """Verify to_pandas_batches() returns the expected page size.
+
+    Regression test for b/407521010.
+    """
+    bf_df = session.read_gbq(
+        "bigquery-public-data.usa_names.usa_1910_2013",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    expected_column_count = len(bf_df.columns)
+
+    batch_count = 0
+    for pd_df in bf_df.to_pandas_batches(
+        page_size=42, allow_large_results=allow_large_results, max_results=42 * 3
+    ):
+        batch_row_count, batch_column_count = pd_df.shape
+        batch_count += 1
+        assert batch_column_count == expected_column_count
+        assert batch_row_count == 42
+
+    assert batch_count == 3
+
+
 @pytest.mark.parametrize(
     ("index",),
     [(True,), (False,)],
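The regression test reads with index_col=bigframes.enums.DefaultIndexKind.NULL, which skips materializing bigframes' default sequential index and keeps the public-table query cheap. The same read pattern in user code (a sketch, assuming positional lookups aren't needed downstream):

import bigframes.enums
import bigframes.pandas as bpd

# NULL skips creating the default sequential index entirely.
df = bpd.read_gbq(
    "bigquery-public-data.usa_names.usa_1910_2013",
    index_col=bigframes.enums.DefaultIndexKind.NULL,
)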
