Skip to content

Commit 64e913d

Browse files
authored
feat: add date, datetime, time, timestamp dtype to to_dataframe (#1547)
1 parent 6458bbd commit 64e913d

File tree

6 files changed

+494
-38
lines changed

6 files changed

+494
-38
lines changed

google/cloud/bigquery/_pandas_helpers.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,10 @@ def default_types_mapper(
290290
int_dtype: Union[Any, None] = None,
291291
float_dtype: Union[Any, None] = None,
292292
string_dtype: Union[Any, None] = None,
293+
date_dtype: Union[Any, None] = None,
294+
datetime_dtype: Union[Any, None] = None,
295+
time_dtype: Union[Any, None] = None,
296+
timestamp_dtype: Union[Any, None] = None,
293297
):
294298
"""Create a mapping from pyarrow types to pandas types.
295299
@@ -321,13 +325,28 @@ def types_mapper(arrow_data_type):
321325
elif (
322326
# If date_as_object is True, we know some DATE columns are
323327
# out-of-bounds of what is supported by pandas.
324-
not date_as_object
328+
date_dtype is not None
329+
and not date_as_object
325330
and pyarrow.types.is_date(arrow_data_type)
326331
):
327-
return db_dtypes.DateDtype()
332+
return date_dtype
328333

329-
elif pyarrow.types.is_time(arrow_data_type):
330-
return db_dtypes.TimeDtype()
334+
elif (
335+
datetime_dtype is not None
336+
and pyarrow.types.is_timestamp(arrow_data_type)
337+
and arrow_data_type.tz is None
338+
):
339+
return datetime_dtype
340+
341+
elif (
342+
timestamp_dtype is not None
343+
and pyarrow.types.is_timestamp(arrow_data_type)
344+
and arrow_data_type.tz is not None
345+
):
346+
return timestamp_dtype
347+
348+
elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
349+
return time_dtype
331350

332351
return types_mapper
333352

google/cloud/bigquery/enums.py

+6
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ class DefaultPandasDTypes(enum.Enum):
9090
INT_DTYPE = object()
9191
"""Specifies default integer dtype"""
9292

93+
DATE_DTYPE = object()
94+
"""Specifies default date dtype"""
95+
96+
TIME_DTYPE = object()
97+
"""Specifies default time dtype"""
98+
9399

94100
class DestinationFormat(object):
95101
"""The exported file format. The default value is :attr:`CSV`.

google/cloud/bigquery/job/query.py

+60-4
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@
5858
except ImportError: # pragma: NO COVER
5959
pandas = None
6060

61+
try:
62+
import db_dtypes # type: ignore
63+
except ImportError: # pragma: NO COVER
64+
db_dtypes = None
65+
6166
if typing.TYPE_CHECKING: # pragma: NO COVER
6267
# Assumption: type checks are only used by library developers and CI environments
6368
# that have all optional dependencies installed, thus no conditional imports.
@@ -1637,6 +1642,10 @@ def to_dataframe(
16371642
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
16381643
float_dtype: Union[Any, None] = None,
16391644
string_dtype: Union[Any, None] = None,
1645+
date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
1646+
datetime_dtype: Union[Any, None] = None,
1647+
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
1648+
timestamp_dtype: Union[Any, None] = None,
16401649
) -> "pandas.DataFrame":
16411650
"""Return a pandas DataFrame from a QueryJob
16421651
@@ -1697,7 +1706,7 @@ def to_dataframe(
16971706
type can be found at:
16981707
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
16991708
1700-
.. versionadded:: 3.7.1
1709+
.. versionadded:: 3.8.0
17011710
17021711
int_dtype (Optional[pandas.Series.dtype, None]):
17031712
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
@@ -1707,7 +1716,7 @@ def to_dataframe(
17071716
Integer types can be found at:
17081717
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
17091718
1710-
.. versionadded:: 3.7.1
1719+
.. versionadded:: 3.8.0
17111720
17121721
float_dtype (Optional[pandas.Series.dtype, None]):
17131722
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
@@ -1717,7 +1726,7 @@ def to_dataframe(
17171726
type can be found at:
17181727
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
17191728
1720-
.. versionadded:: 3.7.1
1729+
.. versionadded:: 3.8.0
17211730
17221731
string_dtype (Optional[pandas.Series.dtype, None]):
17231732
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
@@ -1727,7 +1736,50 @@ def to_dataframe(
17271736
type can be found at:
17281737
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
17291738
1730-
.. versionadded:: 3.7.1
1739+
.. versionadded:: 3.8.0
1740+
1741+
date_dtype (Optional[pandas.Series.dtype, None]):
1742+
If set, indicate a pandas ExtensionDtype (e.g.
1743+
``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
1744+
type, instead of relying on the default ``db_dtypes.DateDtype()``.
1745+
If you explicitly set the value to ``None``, then the data type will be
1746+
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
1747+
Date type can be found at:
1748+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
1749+
1750+
.. versionadded:: 3.10.0
1751+
1752+
datetime_dtype (Optional[pandas.Series.dtype, None]):
1753+
If set, indicate a pandas ExtensionDtype (e.g.
1754+
``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
1755+
type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
1756+
If you explicitly set the value to ``None``, then the data type will be
1757+
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
1758+
Datetime type can be found at:
1759+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
1760+
1761+
.. versionadded:: 3.10.0
1762+
1763+
time_dtype (Optional[pandas.Series.dtype, None]):
1764+
If set, indicate a pandas ExtensionDtype (e.g.
1765+
``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
1766+
type, instead of relying on the default ``db_dtypes.TimeDtype()``.
1767+
If you explicitly set the value to ``None``, then the data type will be
1768+
``numpy.dtype("object")``. BigQuery Time type can be found at:
1769+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
1770+
1771+
.. versionadded:: 3.10.0
1772+
1773+
timestamp_dtype (Optional[pandas.Series.dtype, None]):
1774+
If set, indicate a pandas ExtensionDtype (e.g.
1775+
``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
1776+
type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
1777+
If you explicitly set the value to ``None``, then the data type will be
1778+
``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
1779+
Timestamp type can be found at:
1780+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
1781+
1782+
.. versionadded:: 3.10.0
17311783
17321784
Returns:
17331785
pandas.DataFrame:
@@ -1755,6 +1807,10 @@ def to_dataframe(
17551807
int_dtype=int_dtype,
17561808
float_dtype=float_dtype,
17571809
string_dtype=string_dtype,
1810+
date_dtype=date_dtype,
1811+
datetime_dtype=datetime_dtype,
1812+
time_dtype=time_dtype,
1813+
timestamp_dtype=timestamp_dtype,
17581814
)
17591815

17601816
# If changing the signature of this method, make sure to apply the same

google/cloud/bigquery/table.py

+106-22
Original file line numberDiff line numberDiff line change
@@ -1935,6 +1935,10 @@ def to_dataframe(
19351935
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
19361936
float_dtype: Union[Any, None] = None,
19371937
string_dtype: Union[Any, None] = None,
1938+
date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
1939+
datetime_dtype: Union[Any, None] = None,
1940+
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
1941+
timestamp_dtype: Union[Any, None] = None,
19381942
) -> "pandas.DataFrame":
19391943
"""Create a pandas DataFrame by loading all pages of a query.
19401944
@@ -1999,7 +2003,7 @@ def to_dataframe(
19992003
type can be found at:
20002004
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
20012005
2002-
.. versionadded:: 3.7.1
2006+
.. versionadded:: 3.8.0
20032007
20042008
int_dtype (Optional[pandas.Series.dtype, None]):
20052009
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
@@ -2009,7 +2013,7 @@ def to_dataframe(
20092013
Integer types can be found at:
20102014
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
20112015
2012-
.. versionadded:: 3.7.1
2016+
.. versionadded:: 3.8.0
20132017
20142018
float_dtype (Optional[pandas.Series.dtype, None]):
20152019
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
@@ -2019,7 +2023,7 @@ def to_dataframe(
20192023
type can be found at:
20202024
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
20212025
2022-
.. versionadded:: 3.7.1
2026+
.. versionadded:: 3.8.0
20232027
20242028
string_dtype (Optional[pandas.Series.dtype, None]):
20252029
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
@@ -2029,7 +2033,50 @@ def to_dataframe(
20292033
type can be found at:
20302034
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
20312035
2032-
.. versionadded:: 3.7.1
2036+
.. versionadded:: 3.8.0
2037+
2038+
date_dtype (Optional[pandas.Series.dtype, None]):
2039+
If set, indicate a pandas ExtensionDtype (e.g.
2040+
``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
2041+
type, instead of relying on the default ``db_dtypes.DateDtype()``.
2042+
If you explicitly set the value to ``None``, then the data type will be
2043+
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
2044+
Date type can be found at:
2045+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
2046+
2047+
.. versionadded:: 3.10.0
2048+
2049+
datetime_dtype (Optional[pandas.Series.dtype, None]):
2050+
If set, indicate a pandas ExtensionDtype (e.g.
2051+
``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
2052+
type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
2053+
If you explicitly set the value to ``None``, then the data type will be
2054+
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
2055+
Datetime type can be found at:
2056+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
2057+
2058+
.. versionadded:: 3.10.0
2059+
2060+
time_dtype (Optional[pandas.Series.dtype, None]):
2061+
If set, indicate a pandas ExtensionDtype (e.g.
2062+
``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
2063+
type, instead of relying on the default ``db_dtypes.TimeDtype()``.
2064+
If you explicitly set the value to ``None``, then the data type will be
2065+
``numpy.dtype("object")``. BigQuery Time type can be found at:
2066+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
2067+
2068+
.. versionadded:: 3.10.0
2069+
2070+
timestamp_dtype (Optional[pandas.Series.dtype, None]):
2071+
If set, indicate a pandas ExtensionDtype (e.g.
2072+
``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
2073+
type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
2074+
If you explicitly set the value to ``None``, then the data type will be
2075+
``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
2076+
Timestamp type can be found at:
2077+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
2078+
2079+
.. versionadded:: 3.10.0
20332080
20342081
Returns:
20352082
pandas.DataFrame:
@@ -2059,6 +2106,9 @@ def to_dataframe(
20592106
if int_dtype is DefaultPandasDTypes.INT_DTYPE:
20602107
int_dtype = pandas.Int64Dtype()
20612108

2109+
if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
2110+
time_dtype = db_dtypes.TimeDtype()
2111+
20622112
if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
20632113
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
20642114

@@ -2071,6 +2121,24 @@ def to_dataframe(
20712121
if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
20722122
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
20732123

2124+
if (
2125+
date_dtype is not None
2126+
and date_dtype is not DefaultPandasDTypes.DATE_DTYPE
2127+
and not hasattr(date_dtype, "__from_arrow__")
2128+
):
2129+
raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE)
2130+
2131+
if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"):
2132+
raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE)
2133+
2134+
if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"):
2135+
raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE)
2136+
2137+
if timestamp_dtype is not None and not hasattr(
2138+
timestamp_dtype, "__from_arrow__"
2139+
):
2140+
raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE)
2141+
20742142
if dtypes is None:
20752143
dtypes = {}
20762144

@@ -2086,25 +2154,29 @@ def to_dataframe(
20862154
create_bqstorage_client=create_bqstorage_client,
20872155
)
20882156

2089-
# When converting date or timestamp values to nanosecond precision, the result
2090-
# can be out of pyarrow bounds. To avoid the error when converting to
2091-
# Pandas, we set the date_as_object or timestamp_as_object parameter to True,
2092-
# if necessary.
2093-
date_as_object = not all(
2094-
self.__can_cast_timestamp_ns(col)
2095-
for col in record_batch
2096-
# Type can be date32 or date64 (plus units).
2097-
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2098-
if pyarrow.types.is_date(col.type)
2099-
)
2157+
# The default date dtype is `db_dtypes.DateDtype()`, which can cause an out-of-bounds error
2158+
# when pyarrow converts date values to nanosecond precision. To avoid the error, we
2159+
# set the date_as_object parameter to True, if necessary.
2160+
date_as_object = False
2161+
if date_dtype is DefaultPandasDTypes.DATE_DTYPE:
2162+
date_dtype = db_dtypes.DateDtype()
2163+
date_as_object = not all(
2164+
self.__can_cast_timestamp_ns(col)
2165+
for col in record_batch
2166+
# Type can be date32 or date64 (plus units).
2167+
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2168+
if pyarrow.types.is_date(col.type)
2169+
)
21002170

2101-
timestamp_as_object = not all(
2102-
self.__can_cast_timestamp_ns(col)
2103-
for col in record_batch
2104-
# Type can be datetime and timestamp (plus units and time zone).
2105-
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2106-
if pyarrow.types.is_timestamp(col.type)
2107-
)
2171+
timestamp_as_object = False
2172+
if datetime_dtype is None and timestamp_dtype is None:
2173+
timestamp_as_object = not all(
2174+
self.__can_cast_timestamp_ns(col)
2175+
for col in record_batch
2176+
# Type can be datetime and timestamp (plus units and time zone).
2177+
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2178+
if pyarrow.types.is_timestamp(col.type)
2179+
)
21082180

21092181
if len(record_batch) > 0:
21102182
df = record_batch.to_pandas(
@@ -2117,6 +2189,10 @@ def to_dataframe(
21172189
int_dtype=int_dtype,
21182190
float_dtype=float_dtype,
21192191
string_dtype=string_dtype,
2192+
date_dtype=date_dtype,
2193+
datetime_dtype=datetime_dtype,
2194+
time_dtype=time_dtype,
2195+
timestamp_dtype=timestamp_dtype,
21202196
),
21212197
)
21222198
else:
@@ -2317,6 +2393,10 @@ def to_dataframe(
23172393
int_dtype=None,
23182394
float_dtype=None,
23192395
string_dtype=None,
2396+
date_dtype=None,
2397+
datetime_dtype=None,
2398+
time_dtype=None,
2399+
timestamp_dtype=None,
23202400
) -> "pandas.DataFrame":
23212401
"""Create an empty dataframe.
23222402
@@ -2330,6 +2410,10 @@ def to_dataframe(
23302410
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
23312411
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
23322412
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
2413+
date_dtype (Any): Ignored. Added for compatibility with RowIterator.
2414+
datetime_dtype (Any): Ignored. Added for compatibility with RowIterator.
2415+
time_dtype (Any): Ignored. Added for compatibility with RowIterator.
2416+
timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator.
23332417
23342418
Returns:
23352419
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.

0 commit comments

Comments
 (0)