
Commit 5e4465d

feat: add bool, int, float, string dtype to to_dataframe (#1529)
1 parent a2520ca commit 5e4465d

File tree

5 files changed: +294 −12 lines changed


google/cloud/bigquery/_pandas_helpers.py

+19 −6

@@ -21,6 +21,7 @@
 import logging
 import queue
 import warnings
+from typing import Any, Union

 from packaging import version

@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
     return pyarrow.schema(arrow_fields)


-def default_types_mapper(date_as_object: bool = False):
+def default_types_mapper(
+    date_as_object: bool = False,
+    bool_dtype: Union[Any, None] = None,
+    int_dtype: Union[Any, None] = None,
+    float_dtype: Union[Any, None] = None,
+    string_dtype: Union[Any, None] = None,
+):
     """Create a mapping from pyarrow types to pandas types.

     This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
     """

     def types_mapper(arrow_data_type):
-        if pyarrow.types.is_boolean(arrow_data_type):
-            return pandas.BooleanDtype()
+        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
+            return bool_dtype
+
+        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
+            return int_dtype
+
+        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
+            return float_dtype
+
+        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
+            return string_dtype

         elif (
             # If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@ def types_mapper(arrow_data_type):
         ):
             return db_dtypes.DateDtype()

-        elif pyarrow.types.is_integer(arrow_data_type):
-            return pandas.Int64Dtype()
-
         elif pyarrow.types.is_time(arrow_data_type):
             return db_dtypes.TimeDtype()
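The new default_types_mapper only overrides pyarrow's defaults for the Arrow types whose corresponding dtype argument is not None; everything else falls through to the later branches or to pyarrow itself. Below is a rough standalone sketch of that behaviour, assuming only that pyarrow and pandas are installed (make_types_mapper and the sample table are illustrative, not part of the library):

# Rough sketch: only override pyarrow's defaults for types whose dtype argument is set.
import pandas
import pyarrow

def make_types_mapper(bool_dtype=None, int_dtype=None, float_dtype=None, string_dtype=None):
    def types_mapper(arrow_data_type):
        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
            return bool_dtype
        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
            return int_dtype
        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
            return float_dtype
        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
            return string_dtype
        return None  # fall back to pyarrow's default conversion

    return types_mapper

table = pyarrow.table({"flag": [True, None], "n": [1, None], "s": ["a", None]})
df = table.to_pandas(
    types_mapper=make_types_mapper(
        bool_dtype=pandas.BooleanDtype(),
        int_dtype=pandas.Int64Dtype(),
        string_dtype=pandas.StringDtype(),
    )
)
print(df.dtypes)  # flag -> boolean, n -> Int64, s -> string

Returning None from the mapper is what lets unmatched types keep pyarrow's stock pandas conversion.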

google/cloud/bigquery/enums.py

+14

@@ -77,6 +77,20 @@ class CreateDisposition(object):
     returned in the job result."""


+class DefaultPandasDTypes(enum.Enum):
+    """Default Pandas DataFrem DTypes to convert BigQuery data. These
+    Sentinel values are used instead of None to maintain backward compatibility,
+    and allow Pandas package is not available. For more information:
+    https://stackoverflow.com/a/60605919/101923
+    """
+
+    BOOL_DTYPE = object()
+    """Specifies default bool dtype"""
+
+    INT_DTYPE = object()
+    """Specifies default integer dtype"""
+
+
 class DestinationFormat(object):
     """The exported file format. The default value is :attr:`CSV`.
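BOOL_DTYPE and INT_DTYPE are sentinel objects rather than None so that to_dataframe() can tell "use the library default" apart from an explicit None, and so that the default can be declared without importing pandas at module import time. A minimal sketch of the pattern under those assumptions (DefaultDTypes and resolve_bool_dtype are illustrative names, not the library's API):

# Sentinel pattern sketch: enum members are unique objects standing in for
# "use the default"; the real pandas dtype is only built when the call happens.
import enum

class DefaultDTypes(enum.Enum):  # hypothetical stand-in for DefaultPandasDTypes
    BOOL_DTYPE = object()
    INT_DTYPE = object()

def resolve_bool_dtype(bool_dtype=DefaultDTypes.BOOL_DTYPE):
    if bool_dtype is DefaultDTypes.BOOL_DTYPE:
        import pandas  # deferred import: pandas may not be installed at import time
        return pandas.BooleanDtype()
    return bool_dtype  # caller passed an explicit dtype, or None for numpy bool

print(resolve_bool_dtype())      # -> pandas.BooleanDtype()
print(resolve_bool_dtype(None))  # -> None, i.e. numpy.dtype("bool") downstream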

google/cloud/bigquery/job/query.py

+54 −1

@@ -28,7 +28,7 @@
 from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
-from google.cloud.bigquery.enums import KeyResultStatementKind
+from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery.query import (
@@ -53,6 +53,11 @@
 from google.cloud.bigquery.job.base import _JobConfig
 from google.cloud.bigquery.job.base import _JobReference

+try:
+    import pandas  # type: ignore
+except ImportError:  # pragma: NO COVER
+    pandas = None
+
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     # Assumption: type checks are only used by library developers and CI environments
     # that have all optional dependencies installed, thus no conditional imports.
@@ -1620,6 +1625,10 @@ def to_dataframe(
         create_bqstorage_client: bool = True,
         max_results: Optional[int] = None,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Return a pandas DataFrame from a QueryJob

@@ -1672,6 +1681,46 @@ def to_dataframe(

                 .. versionadded:: 2.24.0

+            bool_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to convert BigQuery Boolean type, instead of relying on the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to convert BigQuery Integer types, instead of relying on the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to convert BigQuery Float type, instead of relying on the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("float64")``. BigQuery Float
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
+                convert BigQuery String type, instead of relying on the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("object")``. BigQuery String
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data
@@ -1694,6 +1743,10 @@ def to_dataframe(
             progress_bar_type=progress_bar_type,
             create_bqstorage_client=create_bqstorage_client,
             geography_as_object=geography_as_object,
+            bool_dtype=bool_dtype,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            string_dtype=string_dtype,
         )

         # If changing the signature of this method, make sure to apply the same
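With these parameters plumbed through QueryJob.to_dataframe, a caller can choose the pandas dtypes used for BOOL, INT64, FLOAT64 and STRING columns. A hedged usage sketch: the query is a placeholder, credentials are assumed to be configured, pandas and db_dtypes must be installed, and pandas.Float64Dtype/StringDtype require a reasonably recent pandas release.

# Usage sketch for the new keyword arguments (available once this change ships).
from google.cloud import bigquery
import pandas

client = bigquery.Client()  # assumes application default credentials
job = client.query("SELECT TRUE AS flag, 1 AS n, 1.5 AS x, 'a' AS s")

df = job.to_dataframe(
    bool_dtype=pandas.BooleanDtype(),   # the default, shown for clarity
    int_dtype=pandas.Int64Dtype(),      # the default, shown for clarity
    float_dtype=pandas.Float64Dtype(),  # opt in to nullable floats
    string_dtype=pandas.StringDtype(),  # opt in to the pandas string dtype
)
print(df.dtypes)

Passing an explicit None instead keeps the plain numpy dtypes described in the docstring above.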

google/cloud/bigquery/table.py

+94 −5

@@ -34,6 +34,11 @@
 except ImportError:  # pragma: NO COVER
     pyarrow = None

+try:
+    import db_dtypes  # type: ignore
+except ImportError:  # pragma: NO COVER
+    db_dtypes = None
+
 try:
     import geopandas  # type: ignore
 except ImportError:
@@ -55,6 +60,7 @@
 import google.cloud._helpers  # type: ignore
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pandas_helpers
+from google.cloud.bigquery.enums import DefaultPandasDTypes
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
@@ -88,6 +94,11 @@

 _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'

+_NO_SUPPORTED_DTYPE = (
+    "The dtype cannot to be converted to a pandas ExtensionArray "
+    "because the necessary `__from_arrow__` attribute is missing."
+)
+

 def _reference_getter(table):
     """A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1931,10 @@ def to_dataframe(
         progress_bar_type: str = None,
         create_bqstorage_client: bool = True,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Create a pandas DataFrame by loading all pages of a query.

@@ -1958,6 +1973,7 @@ def to_dataframe(
                 progress bar as a graphical dialog box.

                 .. versionadded:: 1.11.0
+
             create_bqstorage_client (Optional[bool]):
                 If ``True`` (default), create a BigQuery Storage API client
                 using the default API settings. The BigQuery Storage API
@@ -1975,6 +1991,46 @@ def to_dataframe(

                 .. versionadded:: 2.24.0

+            bool_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to convert BigQuery Boolean type, instead of relying on the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to convert BigQuery Integer types, instead of relying on the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to convert BigQuery Float type, instead of relying on the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("float64")``. BigQuery Float
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
+                convert BigQuery String type, instead of relying on the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("object")``. BigQuery String
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data and column
@@ -1987,14 +2043,34 @@ def to_dataframe(
                 the :mod:`google.cloud.bigquery_storage_v1` module is
                 required but cannot be imported. Also if
                 `geography_as_object` is `True`, but the
-                :mod:`shapely` library cannot be imported.
+                :mod:`shapely` library cannot be imported. Also if
+                `bool_dtype`, `int_dtype` or other dtype parameters
+                is not supported dtype.

        """
        _pandas_helpers.verify_pandas_imports()

        if geography_as_object and shapely is None:
            raise ValueError(_NO_SHAPELY_ERROR)

+        if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
+            bool_dtype = pandas.BooleanDtype()
+
+        if int_dtype is DefaultPandasDTypes.INT_DTYPE:
+            int_dtype = pandas.Int64Dtype()
+
+        if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
+            raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
+
+        if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
+            raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
+
+        if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
+            raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
+
+        if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
+            raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
+
        if dtypes is None:
            dtypes = {}

@@ -2019,15 +2095,15 @@ def to_dataframe(
                for col in record_batch
                # Type can be date32 or date64 (plus units).
                # See: https://arrow.apache.org/docs/python/api/datatypes.html
-                if str(col.type).startswith("date")
+                if pyarrow.types.is_date(col.type)
            )

            timestamp_as_object = not all(
                self.__can_cast_timestamp_ns(col)
                for col in record_batch
-                # Type can be timestamp (plus units and time zone).
+                # Type can be datetime and timestamp (plus units and time zone).
                # See: https://arrow.apache.org/docs/python/api/datatypes.html
-                if str(col.type).startswith("timestamp")
+                if pyarrow.types.is_timestamp(col.type)
            )

            if len(record_batch) > 0:
@@ -2036,7 +2112,11 @@ def to_dataframe(
                    timestamp_as_object=timestamp_as_object,
                    integer_object_nulls=True,
                    types_mapper=_pandas_helpers.default_types_mapper(
-                        date_as_object=date_as_object
+                        date_as_object=date_as_object,
+                        bool_dtype=bool_dtype,
+                        int_dtype=int_dtype,
+                        float_dtype=float_dtype,
+                        string_dtype=string_dtype,
                    ),
                )
            else:
@@ -2233,6 +2313,10 @@ def to_dataframe(
        progress_bar_type=None,
        create_bqstorage_client=True,
        geography_as_object=False,
+        bool_dtype=None,
+        int_dtype=None,
+        float_dtype=None,
+        string_dtype=None,
    ) -> "pandas.DataFrame":
        """Create an empty dataframe.

@@ -2241,6 +2325,11 @@ def to_dataframe(
            dtypes (Any): Ignored. Added for compatibility with RowIterator.
            progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
            create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
+            geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
+            bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            int_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            float_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            string_dtype (Any): Ignored. Added for compatibility with RowIterator.

        Returns:
            pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
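The hasattr(dtype, "__from_arrow__") checks reject dtypes that cannot be built directly from Arrow arrays, such as plain numpy dtypes. A small illustration of that rule, where check_dtype is a stand-in for the inline checks above, not a library function:

# Sketch of the dtype validation: pandas ExtensionDtypes implement __from_arrow__,
# plain numpy dtypes do not, so they are rejected before any conversion starts.
import numpy
import pandas

def check_dtype(name, dtype):
    if dtype is not None and not hasattr(dtype, "__from_arrow__"):
        raise ValueError(name, "missing __from_arrow__")

check_dtype("int_dtype", pandas.Int64Dtype())   # ok: nullable extension dtype
check_dtype("string_dtype", None)               # ok: None means "use the numpy default"
try:
    check_dtype("float_dtype", numpy.dtype("float64"))  # rejected: no __from_arrow__
except ValueError as exc:
    print(exc)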