 import logging
 import queue
 import warnings
-from typing import Any, Union
+from typing import Any, Union, Optional, Callable, Generator, List


 from google.cloud.bigquery import _pyarrow_helpers
 from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery import schema

+
 try:
     import pandas  # type: ignore

@@ -75,7 +76,7 @@ def _to_wkb(v):
 _to_wkb = _to_wkb()

 try:
-    from google.cloud.bigquery_storage import ArrowSerializationOptions
+    from google.cloud.bigquery_storage_v1.types import ArrowSerializationOptions
 except ImportError:
     _ARROW_COMPRESSION_SUPPORT = False
 else:
@@ -816,18 +817,54 @@ def _nowait(futures):


 def _download_table_bqstorage(
-    project_id,
-    table,
-    bqstorage_client,
-    preserve_order=False,
-    selected_fields=None,
-    page_to_item=None,
-    max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
-):
-    """Use (faster, but billable) BQ Storage API to construct DataFrame."""
+    project_id: str,
+    table: Any,
+    bqstorage_client: Any,
+    preserve_order: bool = False,
+    selected_fields: Optional[List[Any]] = None,
+    page_to_item: Optional[Callable] = None,
+    max_queue_size: Any = _MAX_QUEUE_SIZE_DEFAULT,
+    max_stream_count: Optional[int] = None,
+) -> Generator[Any, None, None]:
+    """Downloads a BigQuery table using the BigQuery Storage API.
+
+    This method uses the faster, but potentially more expensive, BigQuery
+    Storage API to download a table as a Pandas DataFrame. It supports
+    parallel downloads and optional data transformations.
+
+    Args:
+        project_id (str): The ID of the Google Cloud project containing
+            the table.
+        table (Any): The BigQuery table to download.
+        bqstorage_client (Any): An authenticated BigQuery Storage API client.
+        preserve_order (bool, optional): Whether to preserve the order
+            of the rows as they are read from BigQuery. If True, this limits
+            the number of streams to one and overrides `max_stream_count`.
+            Defaults to False.
+        selected_fields (Optional[List[SchemaField]]):
+            A list of BigQuery schema fields to select for download. If None,
+            all fields are downloaded. Defaults to None.
+        page_to_item (Optional[Callable]): An optional callable that
+            converts each page of data from the BigQuery Storage API into
+            the items yielded by this generator.
+        max_queue_size (Any, optional): The maximum size of the worker queue
+            used to buffer downloaded pages. Defaults to
+            _MAX_QUEUE_SIZE_DEFAULT.
+        max_stream_count (Optional[int]): The maximum number of
+            concurrent streams to use for downloading data. If `preserve_order`
+            is True, the requested streams are limited to 1 regardless of the
+            `max_stream_count` value. If 0 or None, then the number of
+            requested streams will be unbounded. Defaults to None.
+
+    Yields:
+        pandas.DataFrame: Pandas DataFrames, one for each chunk of data
+            downloaded from BigQuery.
+
+    Raises:
+        ValueError: If attempting to read from a specific partition or snapshot.
+
+    Note:
+        This method requires the `google-cloud-bigquery-storage` library
+        to be installed.
+    """

-    # Passing a BQ Storage client in implies that the BigQuery Storage library
-    # is available and can be imported.
     from google.cloud import bigquery_storage

     if "$" in table.table_id:
@@ -837,18 +874,20 @@ def _download_table_bqstorage(
     if "@" in table.table_id:
         raise ValueError("Reading from a specific snapshot is not currently supported.")

-    requested_streams = 1 if preserve_order else 0
+    requested_streams = determine_requested_streams(preserve_order, max_stream_count)

-    requested_session = bigquery_storage.types.ReadSession(
-        table=table.to_bqstorage(), data_format=bigquery_storage.types.DataFormat.ARROW
+    requested_session = bigquery_storage.types.stream.ReadSession(
+        table=table.to_bqstorage(),
+        data_format=bigquery_storage.types.stream.DataFormat.ARROW,
     )
     if selected_fields is not None:
         for field in selected_fields:
             requested_session.read_options.selected_fields.append(field.name)

     if _ARROW_COMPRESSION_SUPPORT:
         requested_session.read_options.arrow_serialization_options.buffer_compression = (
-            ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
+            # CompressionCodec(1) -> LZ4_FRAME
+            ArrowSerializationOptions.CompressionCodec(1)
         )

     session = bqstorage_client.create_read_session(
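
As a cross-check on the `CompressionCodec(1)` change above: in the published `bigquery_storage_v1` protos, enum value 1 is `LZ4_FRAME`, so the integer and named forms should be interchangeable. A standalone sketch (the table path is hypothetical):

    from google.cloud.bigquery_storage_v1 import types

    # Build a ReadSession requesting Arrow output with LZ4-frame-compressed
    # buffers; CompressionCodec(1) and CompressionCodec.LZ4_FRAME name the
    # same proto enum value.
    requested_session = types.ReadSession(
        table="projects/my-project/datasets/my_dataset/tables/my_table",
        data_format=types.DataFormat.ARROW,
    )
    requested_session.read_options.arrow_serialization_options.buffer_compression = (
        types.ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
    )
    assert (
        types.ArrowSerializationOptions.CompressionCodec(1)
        == types.ArrowSerializationOptions.CompressionCodec.LZ4_FRAME
    )
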
@@ -884,7 +923,7 @@ def _download_table_bqstorage(
     elif max_queue_size is None:
         max_queue_size = 0  # unbounded

-    worker_queue = queue.Queue(maxsize=max_queue_size)
+    worker_queue: queue.Queue[int] = queue.Queue(maxsize=max_queue_size)

     with concurrent.futures.ThreadPoolExecutor(max_workers=total_streams) as pool:
         try:
@@ -910,7 +949,7 @@ def _download_table_bqstorage(
             # we want to block on the queue's get method, instead. This
             # prevents the queue from filling up, because the main thread
             # has smaller gaps in time between calls to the queue's get
-            # method. For a detailed explaination, see:
+            # method. For a detailed explanation, see:
             # https://friendliness.dev/2019/06/18/python-nowait/
             done, not_done = _nowait(not_done)
             for future in done:
@@ -949,6 +988,7 @@ def download_arrow_bqstorage(
     preserve_order=False,
     selected_fields=None,
     max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
+    max_stream_count=None,
 ):
     return _download_table_bqstorage(
         project_id,
@@ -958,6 +998,7 @@ def download_arrow_bqstorage(
         selected_fields=selected_fields,
         page_to_item=_bqstorage_page_to_arrow,
         max_queue_size=max_queue_size,
+        max_stream_count=max_stream_count,
     )

@@ -970,6 +1011,7 @@ def download_dataframe_bqstorage(
     preserve_order=False,
     selected_fields=None,
     max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
+    max_stream_count=None,
 ):
     page_to_item = functools.partial(_bqstorage_page_to_dataframe, column_names, dtypes)
     return _download_table_bqstorage(
@@ -980,6 +1022,7 @@ def download_dataframe_bqstorage(
         selected_fields=selected_fields,
         page_to_item=page_to_item,
         max_queue_size=max_queue_size,
+        max_stream_count=max_stream_count,
     )

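Both wrappers only forward the new keyword, so the observable difference is in how many streams the read session requests. A hedged sketch of a direct call (these helpers are private; every argument value here is an assumption):

    # Sketch: forwarding max_stream_count through the dataframe wrapper.
    frames = download_dataframe_bqstorage(
        project_id="my-project",            # hypothetical project
        table=table_ref,                    # hypothetical table reference
        bqstorage_client=bqstorage_client,  # assumed authenticated client
        column_names=["name", "age"],       # hypothetical column subset
        dtypes={},
        max_stream_count=8,                 # request at most eight streams
    )
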
@@ -1024,3 +1067,40 @@ def verify_pandas_imports():
         raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception
     if db_dtypes is None:
         raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception
+
+
+def determine_requested_streams(
+    preserve_order: bool,
+    max_stream_count: Union[int, None],
+) -> int:
+    """Determines the value of requested_streams based on the values of
+    `preserve_order` and `max_stream_count`.
+
+    Args:
+        preserve_order (bool): Whether to preserve the order of streams. If True,
+            this limits the number of streams to one. `preserve_order` takes
+            precedence over `max_stream_count`.
+        max_stream_count (Union[int, None]): The maximum number of streams
+            allowed. Must be a non-negative number or None, where None indicates
+            the value is unset. NOTE: if `preserve_order` is True, it takes
+            precedence over `max_stream_count`; to ensure that `max_stream_count`
+            is used, ensure that `preserve_order` is False.
+
+    Returns:
+        (int) The appropriate value for requested_streams.
+    """
+
+    if preserve_order:
+        # If preserve_order is set, it takes precedence.
+        # Limit the requested streams to 1 to ensure that order
+        # is preserved.
+        return 1
+
+    elif max_stream_count is not None:
+        # Only consider max_stream_count when preserve_order is not set.
+        if max_stream_count <= -1:
+            raise ValueError("max_stream_count must be non-negative OR None")
+        return max_stream_count
+
+    # Default to zero requested streams (unbounded).
+    return 0
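
A quick illustration of the precedence rules, read directly off the branches above:

    # preserve_order wins over any max_stream_count value.
    determine_requested_streams(preserve_order=True, max_stream_count=10)     # -> 1
    # Otherwise a non-None max_stream_count is used as-is.
    determine_requested_streams(preserve_order=False, max_stream_count=10)    # -> 10
    determine_requested_streams(preserve_order=False, max_stream_count=0)     # -> 0 (unbounded)
    # None falls through to the unbounded default.
    determine_requested_streams(preserve_order=False, max_stream_count=None)  # -> 0 (unbounded)
    # Negative values are rejected.
    determine_requested_streams(preserve_order=False, max_stream_count=-1)    # raises ValueError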