
Commit 256b0f7

Merge branch 'main' into fix-bq-storage-client-deadlock
2 parents: a3486d7 + 7372ad6

8 files changed: +143, -32 lines changed

docs/bigquery/legacy_proto_types.rst (+1, -1):

@@ -3,7 +3,7 @@ Legacy proto-based Types for Google Cloud Bigquery v2 API
 
 .. warning::
     These types are provided for backward compatibility only, and are not maintained
-    anymore. They might also differ from the types uspported on the backend. It is
+    anymore. They might also differ from the types supported on the backend. It is
     therefore strongly advised to migrate to the types found in :doc:`standard_sql`.
 
     Also see the :doc:`3.0.0 Migration Guide<../UPGRADING>` for more information.

google/cloud/bigquery/_pandas_helpers.py (+99, -19):
@@ -21,13 +21,14 @@
 import logging
 import queue
 import warnings
-from typing import Any, Union
+from typing import Any, Union, Optional, Callable, Generator, List
 
 
 from google.cloud.bigquery import _pyarrow_helpers
 from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery import schema
 
+
 try:
     import pandas  # type: ignore
 
@@ -75,7 +76,7 @@ def _to_wkb(v):
 _to_wkb = _to_wkb()
 
 try:
-    from google.cloud.bigquery_storage import ArrowSerializationOptions
+    from google.cloud.bigquery_storage_v1.types import ArrowSerializationOptions
 except ImportError:
     _ARROW_COMPRESSION_SUPPORT = False
 else:
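The try/except guard above is how optional Arrow compression support is detected. The hunk is truncated at `else:`; presumably the module enables the flag there, given the later `if _ARROW_COMPRESSION_SUPPORT:` check. A minimal self-contained sketch of the pattern (the `else:` body is an assumption, not shown in the diff):

```python
# Sketch of the optional-dependency feature-flag pattern used above.
# The else: body is assumed; the diff truncates before it.
try:
    from google.cloud.bigquery_storage_v1.types import ArrowSerializationOptions
except ImportError:
    _ARROW_COMPRESSION_SUPPORT = False  # compression unavailable without the library
else:
    _ARROW_COMPRESSION_SUPPORT = True  # safe to request compressed Arrow pages

print(_ARROW_COMPRESSION_SUPPORT)
```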
@@ -821,18 +822,54 @@ def _nowait(futures):
 
 
 def _download_table_bqstorage(
-    project_id,
-    table,
-    bqstorage_client,
-    preserve_order=False,
-    selected_fields=None,
-    page_to_item=None,
-    max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
-):
-    """Use (faster, but billable) BQ Storage API to construct DataFrame."""
+    project_id: str,
+    table: Any,
+    bqstorage_client: Any,
+    preserve_order: bool = False,
+    selected_fields: Optional[List[Any]] = None,
+    page_to_item: Optional[Callable] = None,
+    max_queue_size: Any = _MAX_QUEUE_SIZE_DEFAULT,
+    max_stream_count: Optional[int] = None,
+) -> Generator[Any, None, None]:
+    """Downloads a BigQuery table using the BigQuery Storage API.
+
+    This method uses the faster, but potentially more expensive, BigQuery
+    Storage API to download a table as a Pandas DataFrame. It supports
+    parallel downloads and optional data transformations.
+
+    Args:
+        project_id (str): The ID of the Google Cloud project containing
+            the table.
+        table (Any): The BigQuery table to download.
+        bqstorage_client (Any): An
+            authenticated BigQuery Storage API client.
+        preserve_order (bool, optional): Whether to preserve the order
+            of the rows as they are read from BigQuery. If True this limits
+            the number of streams to one and overrides `max_stream_count`.
+            Defaults to False.
+        selected_fields (Optional[List[SchemaField]]):
+            A list of BigQuery schema fields to select for download. If None,
+            all fields are downloaded. Defaults to None.
+        page_to_item (Optional[Callable]): An optional callable
+            function that takes a page of data from the BigQuery Storage API.
+        max_stream_count (Optional[int]): The maximum number of
+            concurrent streams to use for downloading data. If `preserve_order`
+            is True, the requested streams are limited to 1 regardless of the
+            `max_stream_count` value. If 0 or None, then the number of
+            requested streams will be unbounded. Defaults to None.
+
+    Yields:
+        pandas.DataFrame: Pandas DataFrames, one for each chunk of data
+        downloaded from BigQuery.
+
+    Raises:
+        ValueError: If attempting to read from a specific partition or snapshot.
+
+    Note:
+        This method requires the `google-cloud-bigquery-storage` library
+        to be installed.
+    """
 
-    # Passing a BQ Storage client in implies that the BigQuery Storage library
-    # is available and can be imported.
     from google.cloud import bigquery_storage
 
     if "$" in table.table_id:
@@ -842,18 +879,20 @@ def _download_table_bqstorage(
     if "@" in table.table_id:
         raise ValueError("Reading from a specific snapshot is not currently supported.")
 
-    requested_streams = 1 if preserve_order else 0
+    requested_streams = determine_requested_streams(preserve_order, max_stream_count)
 
-    requested_session = bigquery_storage.types.ReadSession(
-        table=table.to_bqstorage(), data_format=bigquery_storage.types.DataFormat.ARROW
+    requested_session = bigquery_storage.types.stream.ReadSession(
+        table=table.to_bqstorage(),
+        data_format=bigquery_storage.types.stream.DataFormat.ARROW,
     )
     if selected_fields is not None:
         for field in selected_fields:
             requested_session.read_options.selected_fields.append(field.name)
 
     if _ARROW_COMPRESSION_SUPPORT:
         requested_session.read_options.arrow_serialization_options.buffer_compression = (
-            ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
+            # CompressionCodec(1) -> LZ4_FRAME
+            ArrowSerializationOptions.CompressionCodec(1)
         )
 
     session = bqstorage_client.create_read_session(
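The compression change swaps the named enum member for its integer value; per the diff's own comment, `CompressionCodec(1)` maps to `LZ4_FRAME`. A quick illustration (a sketch, assuming google-cloud-bigquery-storage is installed), since proto enum members constructed from their integer value resolve to the same member object:

```python
# Sketch: CompressionCodec(1) resolves to the LZ4_FRAME member.
from google.cloud.bigquery_storage_v1.types import ArrowSerializationOptions

codec = ArrowSerializationOptions.CompressionCodec(1)
assert codec is ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
print(codec.name)  # LZ4_FRAME
```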
@@ -889,7 +928,7 @@ def _download_table_bqstorage(
     elif max_queue_size is None:
         max_queue_size = 0  # unbounded
 
-    worker_queue = queue.Queue(maxsize=max_queue_size)
+    worker_queue: queue.Queue[int] = queue.Queue(maxsize=max_queue_size)
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=total_streams) as pool:
         try:
@@ -915,7 +954,7 @@ def _download_table_bqstorage(
                 # we want to block on the queue's get method, instead. This
                 # prevents the queue from filling up, because the main thread
                 # has smaller gaps in time between calls to the queue's get
-                # method. For a detailed explaination, see:
+                # method. For a detailed explanation, see:
                 # https://friendliness.dev/2019/06/18/python-nowait/
                 done, not_done = _nowait(not_done)
                 for future in done:
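The comment above captures the heart of the pattern: the main thread polls the worker futures without blocking and blocks on the queue's get instead, so producers never stall against a full queue while the consumer waits on them. A minimal standalone sketch of that pattern (the producer/page names are hypothetical, and `concurrent.futures.wait(..., timeout=0)` stands in for the repo's `_nowait` helper; the real code wires BQ Storage read streams into the queue):

```python
# Minimal sketch of the poll-futures / block-on-queue pattern.
import concurrent.futures
import queue

def producer(worker_queue: queue.Queue, pages) -> None:
    for page in pages:
        worker_queue.put(page)  # blocks when the queue is full -> backpressure

def consume(page_lists, max_queue_size: int = 10):
    worker_queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
    with concurrent.futures.ThreadPoolExecutor() as pool:
        not_done = [pool.submit(producer, worker_queue, p) for p in page_lists]
        while not_done:
            # Poll the workers without waiting on them ...
            done, not_done = concurrent.futures.wait(not_done, timeout=0)
            try:
                # ... and block on the queue instead, draining items as they
                # arrive so the queue cannot stay full.
                yield worker_queue.get(timeout=1.0)
            except queue.Empty:
                continue
        while not worker_queue.empty():  # drain whatever is left
            yield worker_queue.get()
```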
@@ -954,6 +993,7 @@ def download_arrow_bqstorage(
     preserve_order=False,
     selected_fields=None,
     max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
+    max_stream_count=None,
 ):
     return _download_table_bqstorage(
         project_id,
@@ -963,6 +1003,7 @@ def download_arrow_bqstorage(
         selected_fields=selected_fields,
         page_to_item=_bqstorage_page_to_arrow,
         max_queue_size=max_queue_size,
+        max_stream_count=max_stream_count,
     )
 
 
@@ -975,6 +1016,7 @@ def download_dataframe_bqstorage(
     preserve_order=False,
     selected_fields=None,
     max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
+    max_stream_count=None,
 ):
     page_to_item = functools.partial(_bqstorage_page_to_dataframe, column_names, dtypes)
     return _download_table_bqstorage(
@@ -985,6 +1027,7 @@ def download_dataframe_bqstorage(
         selected_fields=selected_fields,
         page_to_item=page_to_item,
         max_queue_size=max_queue_size,
+        max_stream_count=max_stream_count,
     )

@@ -1029,3 +1072,40 @@ def verify_pandas_imports():
         raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception
     if db_dtypes is None:
         raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception
+
+
+def determine_requested_streams(
+    preserve_order: bool,
+    max_stream_count: Union[int, None],
+) -> int:
+    """Determines the value of requested_streams based on the values of
+    `preserve_order` and `max_stream_count`.
+
+    Args:
+        preserve_order (bool): Whether to preserve the order of streams. If True,
+            this limits the number of streams to one. `preserve_order` takes
+            precedence over `max_stream_count`.
+        max_stream_count (Union[int, None]): The maximum number of streams
+            allowed. Must be a non-negative number or None, where None indicates
+            the value is unset. NOTE: if `preserve_order` is also set, it takes
+            precedence over `max_stream_count`, thus to ensure that `max_stream_count`
+            is used, ensure that `preserve_order` is False.
+
+    Returns:
+        (int) The appropriate value for requested_streams.
+    """
+
+    if preserve_order:
+        # If preserve_order is set, it takes precedence.
+        # Limit the requested streams to 1, to ensure that order
+        # is preserved.
+        return 1
+
+    elif max_stream_count is not None:
+        # If preserve_order is not set, only then do we consider max_stream_count
+        if max_stream_count <= -1:
+            raise ValueError("max_stream_count must be non-negative OR None")
+        return max_stream_count
+
+    # Default to zero requested streams (unbounded).
+    return 0
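For reference, the new helper's behavior under each input combination; these values follow directly from the docstring above and the parametrized test below:

```python
# Sketch: expected return values of determine_requested_streams.
from google.cloud.bigquery._pandas_helpers import determine_requested_streams

assert determine_requested_streams(preserve_order=True, max_stream_count=10) == 1
assert determine_requested_streams(preserve_order=False, max_stream_count=10) == 10
assert determine_requested_streams(preserve_order=False, max_stream_count=None) == 0  # unbounded
```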

samples/desktopapp/requirements.txt (+1, -1):

@@ -1,2 +1,2 @@
-google-cloud-bigquery==3.25.0
+google-cloud-bigquery==3.26.0
 google-auth-oauthlib==1.2.1

samples/geography/requirements.txt (+6, -6):

@@ -9,14 +9,14 @@ cligj==0.7.2
 dataclasses==0.8; python_version < '3.7'
 db-dtypes==1.3.0
 Fiona===1.9.6; python_version == '3.7'
-Fiona==1.10.0; python_version >= '3.8'
+Fiona==1.10.1; python_version >= '3.8'
 geojson==3.1.0
 geopandas===0.10.2; python_version == '3.7'
 geopandas===0.13.2; python_version == '3.8'
 geopandas==1.0.1; python_version >= '3.9'
-google-api-core==2.19.2
-google-auth==2.34.0
-google-cloud-bigquery==3.25.0
+google-api-core==2.20.0
+google-auth==2.35.0
+google-cloud-bigquery==3.26.0
 google-cloud-bigquery-storage==2.26.0
 google-cloud-core==2.4.1
 google-crc32c===1.5.0; python_version < '3.9'
@@ -32,7 +32,7 @@ packaging===24.0; python_version == '3.7'
 packaging==24.1; python_version >= '3.8'
 pandas===1.3.5; python_version == '3.7'
 pandas===2.0.3; python_version == '3.8'
-pandas==2.2.2; python_version >= '3.9'
+pandas==2.2.3; python_version >= '3.9'
 proto-plus==1.24.0
 pyarrow==12.0.1; python_version == '3.7'
 pyarrow==17.0.0; python_version >= '3.8'
@@ -56,4 +56,4 @@ typing-extensions===4.7.1; python_version == '3.7'
 typing-extensions==4.12.2; python_version >= '3.8'
 typing-inspect==0.9.0
 urllib3===1.26.18; python_version == '3.7'
-urllib3==2.2.2; python_version >= '3.8'
+urllib3==2.2.3; python_version >= '3.8'

samples/magics/requirements.txt (+2, -2):

@@ -1,6 +1,6 @@
-bigquery_magics==0.3.0
+bigquery_magics==0.4.0
 db-dtypes==1.3.0
-google.cloud.bigquery==3.25.0
+google.cloud.bigquery==3.26.0
 google-cloud-bigquery-storage==2.26.0
 ipython===7.31.1; python_version == '3.7'
 ipython===8.0.1; python_version == '3.8'

samples/notebooks/requirements.txt (+2, -2):

@@ -1,6 +1,6 @@
-bigquery-magics==0.3.0
+bigquery-magics==0.4.0
 db-dtypes==1.3.0
-google-cloud-bigquery==3.25.0
+google-cloud-bigquery==3.26.0
 google-cloud-bigquery-storage==2.26.0
 ipython===7.31.1; python_version == '3.7'
 ipython===8.0.1; python_version == '3.8'

samples/snippets/requirements.txt (+1, -1):

@@ -1,2 +1,2 @@
 # samples/snippets should be runnable with no "extras"
-google-cloud-bigquery==3.25.0
+google-cloud-bigquery==3.26.0

tests/unit/test__pandas_helpers.py (+31, -0):

@@ -18,6 +18,7 @@
 import functools
 import operator
 import queue
+from typing import Union
 from unittest import mock
 import warnings
 
@@ -46,6 +47,7 @@
 from google.cloud.bigquery import _pyarrow_helpers
 from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery import schema
+from google.cloud.bigquery._pandas_helpers import determine_requested_streams
 
 pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
 
@@ -2053,3 +2055,32 @@ def test_verify_pandas_imports_no_db_dtypes(module_under_test, monkeypatch):
     monkeypatch.setattr(module_under_test, "db_dtypes", None)
     with pytest.raises(ValueError, match="Please install the 'db-dtypes' package"):
         module_under_test.verify_pandas_imports()
+
+
+@pytest.mark.parametrize(
+    "preserve_order, max_stream_count, expected_requested_streams",
+    [
+        # If preserve_order is set/True, it takes precedence:
+        (True, 10, 1),  # use 1
+        (True, None, 1),  # use 1
+        # If preserve_order is not set, check max_stream_count:
+        (False, 10, 10),  # max_stream_count (10) takes precedence
+        (False, None, 0),  # Unbounded (0) when both are unset
+    ],
+)
+def test_determine_requested_streams(
+    preserve_order: bool,
+    max_stream_count: Union[int, None],
+    expected_requested_streams: int,
+):
+    """Tests various combinations of preserve_order and max_stream_count."""
+    actual_requested_streams = determine_requested_streams(
+        preserve_order, max_stream_count
+    )
+    assert actual_requested_streams == expected_requested_streams
+
+
+def test_determine_requested_streams_invalid_max_stream_count():
+    """Tests that a ValueError is raised if max_stream_count is negative."""
+    with pytest.raises(ValueError):
+        determine_requested_streams(preserve_order=False, max_stream_count=-1)
