
Commit 5e4465d

feat: add bool, int, float, string dtype to to_dataframe (#1529)
1 parent a2520ca commit 5e4465d

File tree

5 files changed: +294 −12 lines changed


google/cloud/bigquery/_pandas_helpers.py

+19 −6

@@ -21,6 +21,7 @@
 import logging
 import queue
 import warnings
+from typing import Any, Union

 from packaging import version

@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
     return pyarrow.schema(arrow_fields)


-def default_types_mapper(date_as_object: bool = False):
+def default_types_mapper(
+    date_as_object: bool = False,
+    bool_dtype: Union[Any, None] = None,
+    int_dtype: Union[Any, None] = None,
+    float_dtype: Union[Any, None] = None,
+    string_dtype: Union[Any, None] = None,
+):
     """Create a mapping from pyarrow types to pandas types.

     This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
     """

     def types_mapper(arrow_data_type):
-        if pyarrow.types.is_boolean(arrow_data_type):
-            return pandas.BooleanDtype()
+        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
+            return bool_dtype
+
+        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
+            return int_dtype
+
+        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
+            return float_dtype
+
+        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
+            return string_dtype

         elif (
             # If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@ def types_mapper(arrow_data_type):
         ):
             return db_dtypes.DateDtype()

-        elif pyarrow.types.is_integer(arrow_data_type):
-            return pandas.Int64Dtype()
-
         elif pyarrow.types.is_time(arrow_data_type):
             return db_dtypes.TimeDtype()
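The new default_types_mapper only overrides pyarrow's defaults for the Arrow types whose corresponding dtype argument is not None; everything else falls through to the later branches or to pyarrow itself. Below is a rough standalone sketch of that behaviour, assuming only that pyarrow and pandas are installed (make_types_mapper and the sample table are illustrative, not part of the library):

# Rough sketch: only override pyarrow's defaults for types whose dtype argument is set.
import pandas
import pyarrow

def make_types_mapper(bool_dtype=None, int_dtype=None, float_dtype=None, string_dtype=None):
    def types_mapper(arrow_data_type):
        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
            return bool_dtype
        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
            return int_dtype
        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
            return float_dtype
        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
            return string_dtype
        return None  # fall back to pyarrow's default conversion

    return types_mapper

table = pyarrow.table({"flag": [True, None], "n": [1, None], "s": ["a", None]})
df = table.to_pandas(
    types_mapper=make_types_mapper(
        bool_dtype=pandas.BooleanDtype(),
        int_dtype=pandas.Int64Dtype(),
        string_dtype=pandas.StringDtype(),
    )
)
print(df.dtypes)  # flag -> boolean, n -> Int64, s -> string

Returning None from the mapper is what lets unmatched types keep pyarrow's stock pandas conversion.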

google/cloud/bigquery/enums.py

+14

@@ -77,6 +77,20 @@ class CreateDisposition(object):
     returned in the job result."""


+class DefaultPandasDTypes(enum.Enum):
+    """Default Pandas DataFrem DTypes to convert BigQuery data. These
+    Sentinel values are used instead of None to maintain backward compatibility,
+    and allow Pandas package is not available. For more information:
+    https://stackoverflow.com/a/60605919/101923
+    """
+
+    BOOL_DTYPE = object()
+    """Specifies default bool dtype"""
+
+    INT_DTYPE = object()
+    """Specifies default integer dtype"""
+
+
 class DestinationFormat(object):
     """The exported file format. The default value is :attr:`CSV`.
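BOOL_DTYPE and INT_DTYPE are sentinel objects rather than None so that to_dataframe() can tell "use the library default" apart from an explicit None, and so that the default can be declared without importing pandas at module import time. A minimal sketch of the pattern under those assumptions (DefaultDTypes and resolve_bool_dtype are illustrative names, not the library's API):

# Sentinel pattern sketch: enum members are unique objects standing in for
# "use the default"; the real pandas dtype is only built when the call happens.
import enum

class DefaultDTypes(enum.Enum):  # hypothetical stand-in for DefaultPandasDTypes
    BOOL_DTYPE = object()
    INT_DTYPE = object()

def resolve_bool_dtype(bool_dtype=DefaultDTypes.BOOL_DTYPE):
    if bool_dtype is DefaultDTypes.BOOL_DTYPE:
        import pandas  # deferred import: pandas may not be installed at import time
        return pandas.BooleanDtype()
    return bool_dtype  # caller passed an explicit dtype, or None for numpy bool

print(resolve_bool_dtype())      # -> pandas.BooleanDtype()
print(resolve_bool_dtype(None))  # -> None, i.e. numpy.dtype("bool") downstream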

google/cloud/bigquery/job/query.py

+54 −1

@@ -28,7 +28,7 @@
 from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
-from google.cloud.bigquery.enums import KeyResultStatementKind
+from google.cloud.bigquery.enums import KeyResultStatementKind, DefaultPandasDTypes
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery.query import (
@@ -53,6 +53,11 @@
 from google.cloud.bigquery.job.base import _JobConfig
 from google.cloud.bigquery.job.base import _JobReference

+try:
+    import pandas  # type: ignore
+except ImportError:  # pragma: NO COVER
+    pandas = None
+
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     # Assumption: type checks are only used by library developers and CI environments
     # that have all optional dependencies installed, thus no conditional imports.
@@ -1620,6 +1625,10 @@ def to_dataframe(
         create_bqstorage_client: bool = True,
         max_results: Optional[int] = None,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Return a pandas DataFrame from a QueryJob

@@ -1672,6 +1681,46 @@ def to_dataframe(

                 .. versionadded:: 2.24.0

+            bool_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to convert BigQuery Boolean type, instead of relying on the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to convert BigQuery Integer types, instead of relying on the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to convert BigQuery Float type, instead of relying on the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("float64")``. BigQuery Float
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
+                convert BigQuery String type, instead of relying on the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("object")``. BigQuery String
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data
@@ -1694,6 +1743,10 @@ def to_dataframe(
             progress_bar_type=progress_bar_type,
             create_bqstorage_client=create_bqstorage_client,
             geography_as_object=geography_as_object,
+            bool_dtype=bool_dtype,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            string_dtype=string_dtype,
         )

         # If changing the signature of this method, make sure to apply the same
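With these parameters plumbed through QueryJob.to_dataframe, a caller can choose the pandas dtypes used for BOOL, INT64, FLOAT64 and STRING columns. A hedged usage sketch: the query is a placeholder, credentials are assumed to be configured, pandas and db_dtypes must be installed, and pandas.Float64Dtype/StringDtype require a reasonably recent pandas release.

# Usage sketch for the new keyword arguments (available once this change ships).
from google.cloud import bigquery
import pandas

client = bigquery.Client()  # assumes application default credentials
job = client.query("SELECT TRUE AS flag, 1 AS n, 1.5 AS x, 'a' AS s")

df = job.to_dataframe(
    bool_dtype=pandas.BooleanDtype(),   # the default, shown for clarity
    int_dtype=pandas.Int64Dtype(),      # the default, shown for clarity
    float_dtype=pandas.Float64Dtype(),  # opt in to nullable floats
    string_dtype=pandas.StringDtype(),  # opt in to the pandas string dtype
)
print(df.dtypes)

Passing an explicit None instead keeps the plain numpy dtypes described in the docstring above.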

google/cloud/bigquery/table.py

+94 −5

@@ -34,6 +34,11 @@
 except ImportError:  # pragma: NO COVER
     pyarrow = None

+try:
+    import db_dtypes  # type: ignore
+except ImportError:  # pragma: NO COVER
+    db_dtypes = None
+
 try:
     import geopandas  # type: ignore
 except ImportError:
@@ -55,6 +60,7 @@
 import google.cloud._helpers  # type: ignore
 from google.cloud.bigquery import _helpers
 from google.cloud.bigquery import _pandas_helpers
+from google.cloud.bigquery.enums import DefaultPandasDTypes
 from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
@@ -88,6 +94,11 @@

 _TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'

+_NO_SUPPORTED_DTYPE = (
+    "The dtype cannot to be converted to a pandas ExtensionArray "
+    "because the necessary `__from_arrow__` attribute is missing."
+)
+

 def _reference_getter(table):
     """A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1931,10 @@ def to_dataframe(
         progress_bar_type: str = None,
         create_bqstorage_client: bool = True,
         geography_as_object: bool = False,
+        bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
+        int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
+        float_dtype: Union[Any, None] = None,
+        string_dtype: Union[Any, None] = None,
     ) -> "pandas.DataFrame":
         """Create a pandas DataFrame by loading all pages of a query.

@@ -1958,6 +1973,7 @@ def to_dataframe(
                 progress bar as a graphical dialog box.

                 .. versionadded:: 1.11.0
+
             create_bqstorage_client (Optional[bool]):
                 If ``True`` (default), create a BigQuery Storage API client
                 using the default API settings. The BigQuery Storage API
@@ -1975,6 +1991,46 @@ def to_dataframe(

                 .. versionadded:: 2.24.0

+            bool_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
+                to convert BigQuery Boolean type, instead of relying on the default
+                ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
+
+                .. versionadded:: 3.7.1
+
+            int_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
+                to convert BigQuery Integer types, instead of relying on the default
+                ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
+                Integer types can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+
+                .. versionadded:: 3.7.1
+
+            float_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
+                to convert BigQuery Float type, instead of relying on the default
+                ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("float64")``. BigQuery Float
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+
+                .. versionadded:: 3.7.1
+
+            string_dtype (Optional[pandas.Series.dtype, None]):
+                If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
+                convert BigQuery String type, instead of relying on the default
+                ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
+                then the data type will be ``numpy.dtype("object")``. BigQuery String
+                type can be found at:
+                https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
+
+                .. versionadded:: 3.7.1
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data and column
@@ -1987,14 +2043,34 @@ def to_dataframe(
                 the :mod:`google.cloud.bigquery_storage_v1` module is
                 required but cannot be imported. Also if
                 `geography_as_object` is `True`, but the
-                :mod:`shapely` library cannot be imported.
+                :mod:`shapely` library cannot be imported. Also if
+                `bool_dtype`, `int_dtype` or other dtype parameters
+                is not supported dtype.

        """
        _pandas_helpers.verify_pandas_imports()

        if geography_as_object and shapely is None:
            raise ValueError(_NO_SHAPELY_ERROR)

+        if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
+            bool_dtype = pandas.BooleanDtype()
+
+        if int_dtype is DefaultPandasDTypes.INT_DTYPE:
+            int_dtype = pandas.Int64Dtype()
+
+        if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
+            raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
+
+        if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
+            raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
+
+        if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
+            raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
+
+        if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
+            raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
+
        if dtypes is None:
            dtypes = {}

@@ -2019,15 +2095,15 @@ def to_dataframe(
                for col in record_batch
                # Type can be date32 or date64 (plus units).
                # See: https://arrow.apache.org/docs/python/api/datatypes.html
-                if str(col.type).startswith("date")
+                if pyarrow.types.is_date(col.type)
            )

            timestamp_as_object = not all(
                self.__can_cast_timestamp_ns(col)
                for col in record_batch
-                # Type can be timestamp (plus units and time zone).
+                # Type can be datetime and timestamp (plus units and time zone).
                # See: https://arrow.apache.org/docs/python/api/datatypes.html
-                if str(col.type).startswith("timestamp")
+                if pyarrow.types.is_timestamp(col.type)
            )

            if len(record_batch) > 0:
@@ -2036,7 +2112,11 @@ def to_dataframe(
                    timestamp_as_object=timestamp_as_object,
                    integer_object_nulls=True,
                    types_mapper=_pandas_helpers.default_types_mapper(
-                        date_as_object=date_as_object
+                        date_as_object=date_as_object,
+                        bool_dtype=bool_dtype,
+                        int_dtype=int_dtype,
+                        float_dtype=float_dtype,
+                        string_dtype=string_dtype,
                    ),
                )
            else:
@@ -2233,6 +2313,10 @@ def to_dataframe(
        progress_bar_type=None,
        create_bqstorage_client=True,
        geography_as_object=False,
+        bool_dtype=None,
+        int_dtype=None,
+        float_dtype=None,
+        string_dtype=None,
    ) -> "pandas.DataFrame":
        """Create an empty dataframe.

@@ -2241,6 +2325,11 @@ def to_dataframe(
            dtypes (Any): Ignored. Added for compatibility with RowIterator.
            progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
            create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
+            geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
+            bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            int_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            float_dtype (Any): Ignored. Added for compatibility with RowIterator.
+            string_dtype (Any): Ignored. Added for compatibility with RowIterator.

        Returns:
            pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
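The hasattr(dtype, "__from_arrow__") checks reject dtypes that cannot be built directly from Arrow arrays, such as plain numpy dtypes. A small illustration of that rule, where check_dtype is a stand-in for the inline checks above, not a library function:

# Sketch of the dtype validation: pandas ExtensionDtypes implement __from_arrow__,
# plain numpy dtypes do not, so they are rejected before any conversion starts.
import numpy
import pandas

def check_dtype(name, dtype):
    if dtype is not None and not hasattr(dtype, "__from_arrow__"):
        raise ValueError(name, "missing __from_arrow__")

check_dtype("int_dtype", pandas.Int64Dtype())   # ok: nullable extension dtype
check_dtype("string_dtype", None)               # ok: None means "use the numpy default"
try:
    check_dtype("float_dtype", numpy.dtype("float64"))  # rejected: no __from_arrow__
except ValueError as exc:
    print(exc)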