Skip to content

Commit 16f65e6

Browse files
author
Jim Fulton
authored
feat: Support using GeoPandas for GEOGRAPHY columns (#848)
1 parent 5c5b4b8 commit 16f65e6

16 files changed

+1102
-29
lines changed

docs/conf.py

+2
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,8 @@
366366
"grpc": ("https://grpc.github.io/grpc/python/", None),
367367
"proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None),
368368
"protobuf": ("https://googleapis.dev/python/protobuf/latest/", None),
369+
"pandas": ("http://pandas.pydata.org/pandas-docs/dev", None),
370+
"geopandas": ("https://geopandas.org/", None),
369371
}
370372

371373

docs/usage/pandas.rst

+15
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,21 @@ To retrieve table rows as a :class:`pandas.DataFrame`:
3737
:start-after: [START bigquery_list_rows_dataframe]
3838
:end-before: [END bigquery_list_rows_dataframe]
3939

40+
41+
Retrieve BigQuery GEOGRAPHY data as a GeoPandas GeoDataFrame
42+
------------------------------------------------------------
43+
44+
`GeoPandas <https://geopandas.org/>`_ adds geospatial analytics
45+
capabilities to Pandas. To retrieve query results containing
46+
GEOGRAPHY data as a :class:`geopandas.GeoDataFrame`:
47+
48+
.. literalinclude:: ../samples/geography/to_geodataframe.py
49+
:language: python
50+
:dedent: 4
51+
:start-after: [START bigquery_query_results_geodataframe]
52+
:end-before: [END bigquery_query_results_geodataframe]
53+
54+
4055
Load a Pandas DataFrame to a BigQuery Table
4156
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4257

google/cloud/bigquery/_pandas_helpers.py

+67-4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,36 @@
2424
import pandas
2525
except ImportError: # pragma: NO COVER
2626
pandas = None
27+
else:
28+
import numpy
29+
30+
try:
31+
# _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
32+
from shapely.geometry.base import BaseGeometry as _BaseGeometry
33+
except ImportError: # pragma: NO COVER
34+
# No shapely, use NoneType for _BaseGeometry as a placeholder.
35+
_BaseGeometry = type(None)
36+
else:
37+
if pandas is not None: # pragma: NO COVER
38+
39+
def _to_wkb():
40+
# Create a closure that:
41+
# - Adds a not-null check. This allows the returned function to
42+
# be used directly with apply, unlike `shapely.wkb.dumps`.
43+
# - Avoids extra work done by `shapely.wkb.dumps` that we don't need.
44+
# - Caches the WKBWriter (and write method lookup :) )
45+
# - Avoids adding WKBWriter, lgeos, and notnull to the module namespace.
46+
from shapely.geos import WKBWriter, lgeos
47+
48+
write = WKBWriter(lgeos).write
49+
notnull = pandas.notnull
50+
51+
def _to_wkb(v):
52+
return write(v) if notnull(v) else v
53+
54+
return _to_wkb
55+
56+
_to_wkb = _to_wkb()
2757

2858
try:
2959
import pyarrow
@@ -69,6 +99,7 @@
6999
"uint8": "INTEGER",
70100
"uint16": "INTEGER",
71101
"uint32": "INTEGER",
102+
"geometry": "GEOGRAPHY",
72103
}
73104

74105

@@ -193,14 +224,16 @@ def bq_to_arrow_data_type(field):
193224
return data_type_constructor()
194225

195226

196-
def bq_to_arrow_field(bq_field):
227+
def bq_to_arrow_field(bq_field, array_type=None):
197228
"""Return the Arrow field, corresponding to a given BigQuery column.
198229
199230
Returns:
200231
None: if the Arrow type cannot be determined.
201232
"""
202233
arrow_type = bq_to_arrow_data_type(bq_field)
203-
if arrow_type:
234+
if arrow_type is not None:
235+
if array_type is not None:
236+
arrow_type = array_type # For GEOGRAPHY, at least initially
204237
is_nullable = bq_field.mode.upper() == "NULLABLE"
205238
return pyarrow.field(bq_field.name, arrow_type, nullable=is_nullable)
206239

@@ -225,7 +258,24 @@ def bq_to_arrow_schema(bq_schema):
225258

226259

227260
def bq_to_arrow_array(series, bq_field):
228-
arrow_type = bq_to_arrow_data_type(bq_field)
261+
if bq_field.field_type.upper() == "GEOGRAPHY":
262+
arrow_type = None
263+
first = _first_valid(series)
264+
if first is not None:
265+
if series.dtype.name == "geometry" or isinstance(first, _BaseGeometry):
266+
arrow_type = pyarrow.binary()
267+
# Convert shapely geometry to WKB binary format:
268+
series = series.apply(_to_wkb)
269+
elif isinstance(first, bytes):
270+
arrow_type = pyarrow.binary()
271+
elif series.dtype.name == "geometry":
272+
# We have a GeoSeries containing all nulls, convert it to a pandas series
273+
series = pandas.Series(numpy.array(series))
274+
275+
if arrow_type is None:
276+
arrow_type = bq_to_arrow_data_type(bq_field)
277+
else:
278+
arrow_type = bq_to_arrow_data_type(bq_field)
229279

230280
field_type_upper = bq_field.field_type.upper() if bq_field.field_type else ""
231281

@@ -279,6 +329,12 @@ def list_columns_and_indexes(dataframe):
279329
return columns_and_indexes
280330

281331

332+
def _first_valid(series):
333+
first_valid_index = series.first_valid_index()
334+
if first_valid_index is not None:
335+
return series.at[first_valid_index]
336+
337+
282338
def dataframe_to_bq_schema(dataframe, bq_schema):
283339
"""Convert a pandas DataFrame schema to a BigQuery schema.
284340
@@ -319,6 +375,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
319375
# Otherwise, try to automatically determine the type based on the
320376
# pandas dtype.
321377
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
378+
if bq_type is None:
379+
sample_data = _first_valid(dataframe[column])
380+
if (
381+
isinstance(sample_data, _BaseGeometry)
382+
and sample_data is not None # Paranoia
383+
):
384+
bq_type = "GEOGRAPHY"
322385
bq_field = schema.SchemaField(column, bq_type)
323386
bq_schema_out.append(bq_field)
324387

@@ -450,11 +513,11 @@ def dataframe_to_arrow(dataframe, bq_schema):
450513
arrow_names = []
451514
arrow_fields = []
452515
for bq_field in bq_schema:
453-
arrow_fields.append(bq_to_arrow_field(bq_field))
454516
arrow_names.append(bq_field.name)
455517
arrow_arrays.append(
456518
bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
457519
)
520+
arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type))
458521

459522
if all((field is not None for field in arrow_fields)):
460523
return pyarrow.Table.from_arrays(

google/cloud/bigquery/job/query.py

+115-4
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
# Assumption: type checks are only used by library developers and CI environments
5454
# that have all optional dependencies installed, thus no conditional imports.
5555
import pandas
56+
import geopandas
5657
import pyarrow
5758
from google.api_core import retry as retries
5859
from google.cloud import bigquery_storage
@@ -1487,6 +1488,7 @@ def to_dataframe(
14871488
create_bqstorage_client: bool = True,
14881489
date_as_object: bool = True,
14891490
max_results: Optional[int] = None,
1491+
geography_as_object: bool = False,
14901492
) -> "pandas.DataFrame":
14911493
"""Return a pandas DataFrame from a QueryJob
14921494
@@ -1538,13 +1540,27 @@ def to_dataframe(
15381540
15391541
.. versionadded:: 2.21.0
15401542
1543+
geography_as_object (Optional[bool]):
1544+
If ``True``, convert GEOGRAPHY data to :mod:`shapely`
1545+
geometry objects. If ``False`` (default), don't cast
1546+
geography data to :mod:`shapely` geometry objects.
1547+
1548+
.. versionadded:: 2.24.0
1549+
15411550
Returns:
1542-
A :class:`~pandas.DataFrame` populated with row data and column
1543-
headers from the query results. The column headers are derived
1544-
from the destination table's schema.
1551+
pandas.DataFrame:
1552+
A :class:`~pandas.DataFrame` populated with row data
1553+
and column headers from the query results. The column
1554+
headers are derived from the destination table's
1555+
schema.
15451556
15461557
Raises:
1547-
ValueError: If the `pandas` library cannot be imported.
1558+
ValueError:
1559+
If the :mod:`pandas` library cannot be imported, or
1560+
the :mod:`google.cloud.bigquery_storage_v1` module is
1561+
required but cannot be imported. Also if
1562+
`geography_as_object` is `True`, but the
1563+
:mod:`shapely` library cannot be imported.
15481564
"""
15491565
query_result = wait_for_query(self, progress_bar_type, max_results=max_results)
15501566
return query_result.to_dataframe(
@@ -1553,6 +1569,101 @@ def to_dataframe(
15531569
progress_bar_type=progress_bar_type,
15541570
create_bqstorage_client=create_bqstorage_client,
15551571
date_as_object=date_as_object,
1572+
geography_as_object=geography_as_object,
1573+
)
1574+
1575+
# If changing the signature of this method, make sure to apply the same
1576+
# changes to table.RowIterator.to_dataframe(), except for the max_results parameter
1577+
# that should only exist here in the QueryJob method.
1578+
def to_geodataframe(
1579+
self,
1580+
bqstorage_client: "bigquery_storage.BigQueryReadClient" = None,
1581+
dtypes: Dict[str, Any] = None,
1582+
progress_bar_type: str = None,
1583+
create_bqstorage_client: bool = True,
1584+
date_as_object: bool = True,
1585+
max_results: Optional[int] = None,
1586+
geography_column: Optional[str] = None,
1587+
) -> "geopandas.GeoDataFrame":
1588+
"""Return a GeoPandas GeoDataFrame from a QueryJob
1589+
1590+
Args:
1591+
bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
1592+
A BigQuery Storage API client. If supplied, use the faster
1593+
BigQuery Storage API to fetch rows from BigQuery. This
1594+
API is a billable API.
1595+
1596+
This method requires the ``fastavro`` and
1597+
``google-cloud-bigquery-storage`` libraries.
1598+
1599+
Reading from a specific partition or snapshot is not
1600+
currently supported by this method.
1601+
1602+
dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
1603+
A dictionary of column names to pandas ``dtype``s. The provided
1604+
``dtype`` is used when constructing the series for the column
1605+
specified. Otherwise, the default pandas behavior is used.
1606+
1607+
progress_bar_type (Optional[str]):
1608+
If set, use the `tqdm <https://tqdm.github.io/>`_ library to
1609+
display a progress bar while the data downloads. Install the
1610+
``tqdm`` package to use this feature.
1611+
1612+
See
1613+
:func:`~google.cloud.bigquery.table.RowIterator.to_dataframe`
1614+
for details.
1615+
1616+
.. versionadded:: 1.11.0
1617+
create_bqstorage_client (Optional[bool]):
1618+
If ``True`` (default), create a BigQuery Storage API client
1619+
using the default API settings. The BigQuery Storage API
1620+
is a faster way to fetch rows from BigQuery. See the
1621+
``bqstorage_client`` parameter for more information.
1622+
1623+
This argument does nothing if ``bqstorage_client`` is supplied.
1624+
1625+
.. versionadded:: 1.24.0
1626+
1627+
date_as_object (Optional[bool]):
1628+
If ``True`` (default), cast dates to objects. If ``False``, convert
1629+
to datetime64[ns] dtype.
1630+
1631+
.. versionadded:: 1.26.0
1632+
1633+
max_results (Optional[int]):
1634+
Maximum number of rows to include in the result. No limit by default.
1635+
1636+
.. versionadded:: 2.21.0
1637+
1638+
geography_column (Optional[str]):
1639+
If there are more than one GEOGRAPHY column,
1640+
identifies which one to use to construct a GeoPandas
1641+
GeoDataFrame. This option can be omitted if there's
1642+
only one GEOGRAPHY column.
1643+
1644+
Returns:
1645+
geopandas.GeoDataFrame:
1646+
A :class:`geopandas.GeoDataFrame` populated with row
1647+
data and column headers from the query results. The
1648+
column headers are derived from the destination
1649+
table's schema.
1650+
1651+
Raises:
1652+
ValueError:
1653+
If the :mod:`geopandas` library cannot be imported, or the
1654+
:mod:`google.cloud.bigquery_storage_v1` module is
1655+
required but cannot be imported.
1656+
1657+
.. versionadded:: 2.24.0
1658+
"""
1659+
query_result = wait_for_query(self, progress_bar_type, max_results=max_results)
1660+
return query_result.to_geodataframe(
1661+
bqstorage_client=bqstorage_client,
1662+
dtypes=dtypes,
1663+
progress_bar_type=progress_bar_type,
1664+
create_bqstorage_client=create_bqstorage_client,
1665+
date_as_object=date_as_object,
1666+
geography_column=geography_column,
15561667
)
15571668

15581669
def __iter__(self):

0 commit comments

Comments
 (0)