
Commit 27c5905

fix: to_pandas_batches() respects page_size and max_results again (#1572)

* fix: `to_pandas_batches()` respects `page_size` and `max_results` again
* fix lint
* help with session close flakiness

1 parent d1e9ec2 · commit 27c5905
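The user-facing behavior this commit restores, as a minimal usage sketch (the table name is a placeholder, not from this commit):

import bigframes.pandas as bpd

df = bpd.read_gbq("my_project.my_dataset.my_table")  # placeholder table

# With the fix, each yielded DataFrame holds at most `page_size` rows and
# iteration stops once roughly `max_results` rows have been fetched in total.
for batch in df.to_pandas_batches(page_size=500, max_results=1500):
    print(batch.shape)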

File tree

6 files changed: +44 -8 lines changed

* .pre-commit-config.yaml
* bigframes/session/_io/bigquery/__init__.py
* noxfile.py
* tests/system/load/test_large_tables.py
* tests/system/small/test_bq_sessions.py
* tests/system/small/test_dataframe_io.py

.pre-commit-config.yaml (+2 -2)

@@ -31,11 +31,11 @@ repos:
     hooks:
       - id: black
   - repo: https://github.com/pycqa/flake8
-    rev: 6.1.0
+    rev: 7.1.2
     hooks:
       - id: flake8
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
+    rev: v1.15.0
     hooks:
       - id: mypy
         additional_dependencies: [types-requests, types-tabulate, pandas-stubs<=2.2.3.241126]

bigframes/session/_io/bigquery/__init__.py (+2)

@@ -245,6 +245,8 @@ def start_query_with_client(
         location=location,
         project=project,
         api_timeout=timeout,
+        page_size=page_size,
+        max_results=max_results,
     )
     if metrics is not None:
         metrics.count_job_stats(query=sql)
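The two forwarded kwargs land on google-cloud-bigquery's QueryJob.result(), which accepts both. A simplified sketch of the forwarding pattern (not the real bigframes helper signature):

from google.cloud import bigquery

def run_query(client: bigquery.Client, sql: str, page_size=None, max_results=None):
    job = client.query(sql)
    # QueryJob.result() accepts page_size and max_results directly; the bug
    # fixed here was that bigframes dropped them instead of passing them on.
    return job.result(page_size=page_size, max_results=max_results)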

noxfile.py (+4 -2)

@@ -29,7 +29,9 @@
 import nox.sessions
 
 BLACK_VERSION = "black==22.3.0"
+FLAKE8_VERSION = "flake8==7.1.2"
 ISORT_VERSION = "isort==5.12.0"
+MYPY_VERSION = "mypy==1.15.0"
 
 # TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751)
 LATEST_FULLY_SUPPORTED_PYTHON = "3.12"

@@ -135,7 +137,7 @@ def lint(session):
     Returns a failure if the linters find linting errors or sufficiently
     serious code quality issues.
     """
-    session.install("flake8", BLACK_VERSION, ISORT_VERSION)
+    session.install(FLAKE8_VERSION, BLACK_VERSION, ISORT_VERSION)
     session.run(
         "isort",
         "--check",

@@ -264,7 +266,7 @@ def mypy(session):
     deps = (
         set(
             [
-                "mypy",
+                MYPY_VERSION,
                 # TODO: update to latest pandas-stubs once we resolve bigframes issues.
                 "pandas-stubs<=2.2.3.241126",
                 "types-protobuf",

tests/system/load/test_large_tables.py (+5 -3)

@@ -75,17 +75,19 @@ def test_index_repr_large_table():
 
 
 def test_to_pandas_batches_large_table():
-    df = bpd.read_gbq("load_testing.scalars_1tb")
+    df = bpd.read_gbq("load_testing.scalars_100gb")
     _, expected_column_count = df.shape
 
     # download only a few batches, since 1tb would be too much
-    iterable = df.to_pandas_batches(page_size=500, max_results=1500)
+    iterable = df.to_pandas_batches(
+        page_size=500, max_results=1500, allow_large_results=True
+    )
     # use page size since client library doesn't support
     # streaming only part of the dataframe via bqstorage
     for pdf in iterable:
         batch_row_count, batch_column_count = pdf.shape
         assert batch_column_count == expected_column_count
-        assert batch_row_count > 0
+        assert 0 < batch_row_count <= 500
 
 
 @pytest.mark.skip(reason="See if it caused kokoro build aborted.")
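The tightened assertion reflects the paging arithmetic: with page_size=500 and max_results=1500, the iterator should yield at most ceil(1500 / 500) = 3 batches of no more than 500 rows each. A quick sanity check of that arithmetic:

page_size, max_results = 500, 1500

# Ceiling division: the number of pages needed to cover max_results rows.
expected_batches = -(-max_results // page_size)
assert expected_batches == 3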

tests/system/small/test_bq_sessions.py (+6 -1)

@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from concurrent.futures import ThreadPoolExecutor
+import time
 
 import google
 import google.api_core.exceptions

@@ -58,7 +59,11 @@ def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client
 
     session_resource_manager.close()
     with pytest.raises(google.api_core.exceptions.NotFound):
-        bigquery_client.get_table(session_table_ref)
+        # It may take time for the underlying tables to get cleaned up after
+        # closing the session, so wait at least 1 minute to check.
+        for _ in range(6):
+            bigquery_client.get_table(session_table_ref)
+            time.sleep(10)
 
 
 def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client):
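The retry loop leans on pytest.raises semantics: the block passes as soon as any get_table() probe raises NotFound, and fails if all six probes succeed. The same poll-until-gone idea as a standalone helper (hypothetical, not part of the test suite):

import time

import google.api_core.exceptions

def wait_until_deleted(client, table_ref, attempts=6, delay_seconds=10):
    """Return once the table 404s; raise TimeoutError if it never disappears."""
    for _ in range(attempts):
        try:
            client.get_table(table_ref)  # still visible, keep polling
        except google.api_core.exceptions.NotFound:
            return
        time.sleep(delay_seconds)
    raise TimeoutError(f"{table_ref} still exists after {attempts * delay_seconds}s")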

tests/system/small/test_dataframe_io.py (+25)

@@ -35,6 +35,7 @@
 
 import bigframes
 import bigframes.dataframe
+import bigframes.enums
 import bigframes.features
 import bigframes.pandas as bpd
 

@@ -288,6 +289,30 @@ def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
         pd.testing.assert_series_equal(actual, expected)
 
 
+@pytest.mark.parametrize("allow_large_results", (True, False))
+def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
+    """Verify to_pandas_batches() returns the expected page size.
+
+    Regression test for b/407521010.
+    """
+    bf_df = session.read_gbq(
+        "bigquery-public-data.usa_names.usa_1910_2013",
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    expected_column_count = len(bf_df.columns)
+
+    batch_count = 0
+    for pd_df in bf_df.to_pandas_batches(
+        page_size=42, allow_large_results=allow_large_results, max_results=42 * 3
+    ):
+        batch_row_count, batch_column_count = pd_df.shape
+        batch_count += 1
+        assert batch_column_count == expected_column_count
+        assert batch_row_count == 42
+
+    assert batch_count == 3
+
+
 @pytest.mark.parametrize(
     ("index",),
     [(True,), (False,)],
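The regression test reads with index_col=bigframes.enums.DefaultIndexKind.NULL, which skips materializing bigframes' default sequential index and keeps the public-table query cheap. The same read pattern in user code (a sketch, assuming positional lookups aren't needed downstream):

import bigframes.enums
import bigframes.pandas as bpd

# NULL skips creating the default sequential index entirely.
df = bpd.read_gbq(
    "bigquery-public-data.usa_names.usa_1910_2013",
    index_col=bigframes.enums.DefaultIndexKind.NULL,
)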
