Commit e1aa921

steffnay, parthea, and tswast authored
deps: update dependencies (#1282)
* update dependencies
* deps: pyarrow extras
* clean up comments
* add test pyarrow skips
* replace storage checks
* update tests
* update tests
* Update setup.py
* update system tests
* update verify_pandas_imports
* add pyarrow guards
* add datetime check
* change pyarrow import
* update
* add pyarrow skips
* fix types
* lint
* Update google/cloud/bigquery/client.py (Co-authored-by: Tim Swast <[email protected]>)
* update pyarrow version
* update test
* lint
* update pyarrow req
* update noxfile
* remove bignum check
* remove comments
* add test importorskip
* update test
* update test
* update dependency
* change version
* update imports

Co-authored-by: Anthonios Partheniou <[email protected]>
Co-authored-by: Tim Swast <[email protected]>
1 parent 589c8bd commit e1aa921

23 files changed: +1013 −118 lines

docs/snippets.py (+5)

@@ -31,6 +31,11 @@
 except (ImportError, AttributeError):
     pandas = None
 
+try:
+    import pyarrow
+except (ImportError, AttributeError):
+    pyarrow = None
+
 from google.api_core.exceptions import InternalServerError
 from google.api_core.exceptions import ServiceUnavailable
 from google.api_core.exceptions import TooManyRequests
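The snippets module now guards pyarrow the same way it already guards pandas: bind the name to None when the extra is missing so individual snippets can skip cleanly. A minimal sketch of how a snippet might use the guard (the pytest skip is illustrative, not part of this diff):

import pytest

def test_pyarrow_snippet():
    # `pyarrow` is the module-level name set to None by the guard above.
    if pyarrow is None:
        pytest.skip("pyarrow is not installed")
    table = pyarrow.table({"x": [1, 2, 3]})  # hypothetical snippet body
    assert table.num_rows == 3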

google/cloud/bigquery/__init__.py (+5)

@@ -42,6 +42,8 @@
 from google.cloud.bigquery.enums import KeyResultStatementKind
 from google.cloud.bigquery.enums import SqlTypeNames
 from google.cloud.bigquery.enums import StandardSqlTypeNames
+from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
+from google.cloud.bigquery.exceptions import LegacyPyarrowError
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery.external_config import BigtableOptions
 from google.cloud.bigquery.external_config import BigtableColumnFamily
@@ -195,6 +197,9 @@
     "WriteDisposition",
     # EncryptionConfiguration
     "EncryptionConfiguration",
+    # Custom exceptions
+    "LegacyBigQueryStorageError",
+    "LegacyPyarrowError",
 ]
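Re-exporting the exceptions from the package root gives callers a public way to handle dependency-version failures. A hedged sketch (the query and the handling are illustrative, not from this commit):

from google.cloud import bigquery

client = bigquery.Client()
try:
    df = client.query("SELECT 1 AS x").to_dataframe()
except (bigquery.LegacyBigQueryStorageError, bigquery.LegacyPyarrowError) as exc:
    # Raised at runtime when pip resolved an optional extra below the minimums
    # this commit enforces (bigquery-storage >= 2.0.0, pyarrow >= 3.0.0).
    raise SystemExit(f"Upgrade the optional extras: {exc}")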
google/cloud/bigquery/_helpers.py (+72 −2)

@@ -20,7 +20,7 @@
 import math
 import re
 import os
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
 from dateutil import relativedelta
 from google.cloud._helpers import UTC  # type: ignore
@@ -32,6 +32,11 @@
 
 import packaging.version
 
+from google.cloud.bigquery.exceptions import (
+    LegacyBigQueryStorageError,
+    LegacyPyarrowError,
+)
+
 _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
 _TIMEONLY_WO_MICROS = "%H:%M:%S"
 _TIMEONLY_W_MICROS = "%H:%M:%S.%f"
@@ -50,6 +55,10 @@
     r"(?P<time_sign>-?)(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)\.?(?P<fraction>\d*)?$"
 )
 
+_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
+
+_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
+
 _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")
 
 BIGQUERY_EMULATOR_HOST = "BIGQUERY_EMULATOR_HOST"
@@ -83,7 +92,7 @@ def installed_version(self) -> packaging.version.Version:
             getattr(bigquery_storage, "__version__", "0.0.0")
         )
 
-        return self._installed_version
+        return self._installed_version  # type: ignore
 
     @property
     def is_read_session_optional(self) -> bool:
@@ -93,6 +102,29 @@ def is_read_session_optional(self) -> bool:
         """
         return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION
 
+    def verify_version(self):
+        """Verify that a recent enough version of BigQuery Storage extra is
+        installed.
+
+        The function assumes that google-cloud-bigquery-storage extra is
+        installed, and should thus be used in places where this assumption
+        holds.
+
+        Because `pip` can install an outdated version of this extra despite the
+        constraints in `setup.py`, the calling code can use this helper to
+        verify the version compatibility at runtime.
+
+        Raises:
+            LegacyBigQueryStorageError:
+                If the google-cloud-bigquery-storage package is outdated.
+        """
+        if self.installed_version < _MIN_BQ_STORAGE_VERSION:
+            msg = (
+                "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
+                f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})."
+            )
+            raise LegacyBigQueryStorageError(msg)
+
 
 class PyarrowVersions:
     """Version comparisons for pyarrow package."""
@@ -120,6 +152,44 @@ def installed_version(self) -> packaging.version.Version:
     def use_compliant_nested_type(self) -> bool:
         return self.installed_version.major >= 4
 
+    def try_import(self, raise_if_error: bool = False) -> Any:
+        """Verify that a recent enough version of pyarrow extra is
+        installed.
+
+        The function assumes that pyarrow extra is installed, and should thus
+        be used in places where this assumption holds.
+
+        Because `pip` can install an outdated version of this extra despite the
+        constraints in `setup.py`, the calling code can use this helper to
+        verify the version compatibility at runtime.
+
+        Returns:
+            The ``pyarrow`` module or ``None``.
+
+        Raises:
+            LegacyPyarrowError:
+                If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
+        """
+        try:
+            import pyarrow
+        except ImportError as exc:  # pragma: NO COVER
+            if raise_if_error:
+                raise LegacyPyarrowError(
+                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
+                ) from exc
+            return None
+
+        if self.installed_version < _MIN_PYARROW_VERSION:
+            if raise_if_error:
+                msg = (
+                    "Dependency pyarrow is outdated, please upgrade "
+                    f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
+                )
+                raise LegacyPyarrowError(msg)
+            return None
+
+        return pyarrow
+
 
 BQ_STORAGE_VERSIONS = BQStorageVersions()
 PYARROW_VERSIONS = PyarrowVersions()
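Together these helpers let call sites fail fast with an actionable message instead of a confusing AttributeError deep inside a code path. A sketch of the intended usage, based on the docstrings above (the call sites themselves are illustrative):

from google.cloud.bigquery import _helpers

# BigQuery Storage extra: assumed importable; enforce the runtime minimum.
# Raises LegacyBigQueryStorageError for versions below 2.0.0.
_helpers.BQ_STORAGE_VERSIONS.verify_version()

# pyarrow extra: a soft dependency. try_import() returns the module, or None
# when pyarrow is missing or older than 3.0.0; with raise_if_error=True it
# raises LegacyPyarrowError instead of returning None.
pyarrow = _helpers.PYARROW_VERSIONS.try_import()
if pyarrow is None:
    print("pyarrow unavailable; Arrow-based features are disabled")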

google/cloud/bigquery/_pandas_helpers.py (+74 −53)

@@ -22,6 +22,11 @@
 import queue
 import warnings
 
+from packaging import version
+
+from google.cloud.bigquery import _helpers
+from google.cloud.bigquery import schema
+
 try:
     import pandas  # type: ignore
 
@@ -43,9 +48,7 @@
     db_dtypes_import_exception = exc
     date_dtype_name = time_dtype_name = ""  # Use '' rather than None because pytype
 
-
-import pyarrow  # type: ignore
-import pyarrow.parquet  # type: ignore
+pyarrow = _helpers.PYARROW_VERSIONS.try_import()
 
 try:
     # _BaseGeometry is used to detect shapely objevys in `bq_to_arrow_array`
@@ -77,10 +80,6 @@ def _to_wkb(v):
     # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
     _ARROW_COMPRESSION_SUPPORT = True
 
-from google.cloud.bigquery import _helpers
-from google.cloud.bigquery import schema
-
-
 _LOGGER = logging.getLogger(__name__)
 
 _PROGRESS_INTERVAL = 0.2  # Maximum time between download status checks, in seconds.
@@ -141,52 +140,65 @@ def pyarrow_timestamp():
     return pyarrow.timestamp("us", tz="UTC")
 
 
-# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
-# When modifying it be sure to update it there as well.
-BQ_TO_ARROW_SCALARS = {
-    "BIGNUMERIC": pyarrow_bignumeric,
-    "BOOL": pyarrow.bool_,
-    "BOOLEAN": pyarrow.bool_,
-    "BYTES": pyarrow.binary,
-    "DATE": pyarrow.date32,
-    "DATETIME": pyarrow_datetime,
-    "FLOAT": pyarrow.float64,
-    "FLOAT64": pyarrow.float64,
-    "GEOGRAPHY": pyarrow.string,
-    "INT64": pyarrow.int64,
-    "INTEGER": pyarrow.int64,
-    "NUMERIC": pyarrow_numeric,
-    "STRING": pyarrow.string,
-    "TIME": pyarrow_time,
-    "TIMESTAMP": pyarrow_timestamp,
-}
-ARROW_SCALAR_IDS_TO_BQ = {
-    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
-    pyarrow.bool_().id: "BOOL",
-    pyarrow.int8().id: "INT64",
-    pyarrow.int16().id: "INT64",
-    pyarrow.int32().id: "INT64",
-    pyarrow.int64().id: "INT64",
-    pyarrow.uint8().id: "INT64",
-    pyarrow.uint16().id: "INT64",
-    pyarrow.uint32().id: "INT64",
-    pyarrow.uint64().id: "INT64",
-    pyarrow.float16().id: "FLOAT64",
-    pyarrow.float32().id: "FLOAT64",
-    pyarrow.float64().id: "FLOAT64",
-    pyarrow.time32("ms").id: "TIME",
-    pyarrow.time64("ns").id: "TIME",
-    pyarrow.timestamp("ns").id: "TIMESTAMP",
-    pyarrow.date32().id: "DATE",
-    pyarrow.date64().id: "DATETIME",  # because millisecond resolution
-    pyarrow.binary().id: "BYTES",
-    pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
-    # The exact scale and precision don't matter, see below.
-    pyarrow.decimal128(38, scale=9).id: "NUMERIC",
-    # The exact decimal's scale and precision are not important, as only
-    # the type ID matters, and it's the same for all decimal256 instances.
-    pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
-}
+if pyarrow:
+    # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
+    # When modifying it be sure to update it there as well.
+    BQ_TO_ARROW_SCALARS = {
+        "BOOL": pyarrow.bool_,
+        "BOOLEAN": pyarrow.bool_,
+        "BYTES": pyarrow.binary,
+        "DATE": pyarrow.date32,
+        "DATETIME": pyarrow_datetime,
+        "FLOAT": pyarrow.float64,
+        "FLOAT64": pyarrow.float64,
+        "GEOGRAPHY": pyarrow.string,
+        "INT64": pyarrow.int64,
+        "INTEGER": pyarrow.int64,
+        "NUMERIC": pyarrow_numeric,
+        "STRING": pyarrow.string,
+        "TIME": pyarrow_time,
+        "TIMESTAMP": pyarrow_timestamp,
+    }
+    ARROW_SCALAR_IDS_TO_BQ = {
+        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
+        pyarrow.bool_().id: "BOOL",
+        pyarrow.int8().id: "INT64",
+        pyarrow.int16().id: "INT64",
+        pyarrow.int32().id: "INT64",
+        pyarrow.int64().id: "INT64",
+        pyarrow.uint8().id: "INT64",
+        pyarrow.uint16().id: "INT64",
+        pyarrow.uint32().id: "INT64",
+        pyarrow.uint64().id: "INT64",
+        pyarrow.float16().id: "FLOAT64",
+        pyarrow.float32().id: "FLOAT64",
+        pyarrow.float64().id: "FLOAT64",
+        pyarrow.time32("ms").id: "TIME",
+        pyarrow.time64("ns").id: "TIME",
+        pyarrow.timestamp("ns").id: "TIMESTAMP",
+        pyarrow.date32().id: "DATE",
+        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
+        pyarrow.binary().id: "BYTES",
+        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
+        # The exact scale and precision don't matter, see below.
+        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
+    }
+
+    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
+        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
+        # The exact decimal's scale and precision are not important, as only
+        # the type ID matters, and it's the same for all decimal256 instances.
+        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
+        _BIGNUMERIC_SUPPORT = True
+    else:
+        _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER
+
+else:  # pragma: NO COVER
+    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
+    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
+    _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER
+
 
 BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
     "GEOGRAPHY": {
         b"ARROW:extension:name": b"google:sqlType:geography",
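Guarding the scalar maps lets the module import cleanly without pyarrow, and BIGNUMERIC is mapped only on pyarrow >= 3.0.0, the first release with decimal256. A short illustrative lookup against these maps (private module, shown only to make the guard's effect concrete):

from google.cloud.bigquery import _pandas_helpers

# The maps are empty dicts when pyarrow is absent, so .get() degrades to None.
numeric_factory = _pandas_helpers.BQ_TO_ARROW_SCALARS.get("NUMERIC")
if numeric_factory is not None:
    print(numeric_factory())  # decimal128(38, 9)

if _pandas_helpers._BIGNUMERIC_SUPPORT:
    print(_pandas_helpers.BQ_TO_ARROW_SCALARS["BIGNUMERIC"]())  # decimal256(76, 38)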
@@ -480,6 +492,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
     # If schema detection was not successful for all columns, also try with
     # pyarrow, if available.
     if unknown_type_fields:
+        if not pyarrow:
+            msg = "Could not determine the type of columns: {}".format(
+                ", ".join(field.name for field in unknown_type_fields)
+            )
+            warnings.warn(msg)
+            return None  # We cannot detect the schema in full.
+
         # The augment_schema() helper itself will also issue unknown type
         # warnings if detection still fails for any of the fields.
         bq_schema_out = augment_schema(dataframe, bq_schema_out)
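With this check, schema autodetection degrades gracefully: when pyarrow is unavailable and some columns could not be typed, the helper warns and returns None instead of crashing in the Arrow-based fallback. A sketch of the observable behavior (private helper; the DataFrame content is illustrative):

import warnings
import pandas
from google.cloud.bigquery._pandas_helpers import dataframe_to_bq_schema

df = pandas.DataFrame({"mystery": [object(), object()]})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    result = dataframe_to_bq_schema(df, bq_schema=())
# Without pyarrow installed: result is None and the last warning reads
# "Could not determine the type of columns: mystery".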
@@ -654,6 +673,8 @@ def dataframe_to_parquet(
 
         This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
     """
+    pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
+
     import pyarrow.parquet  # type: ignore
 
     kwargs = (
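dataframe_to_parquet, by contrast, hard-requires the extra, so it calls try_import(raise_if_error=True) before importing pyarrow.parquet. A hedged sketch of the failure mode when pyarrow is missing or outdated (file path and schema are illustrative):

import pandas
from google.cloud.bigquery import LegacyPyarrowError, _pandas_helpers, schema

df = pandas.DataFrame({"x": [1, 2, 3]})
bq_schema = [schema.SchemaField("x", "INTEGER")]
try:
    _pandas_helpers.dataframe_to_parquet(df, bq_schema, "/tmp/example.parquet")
except LegacyPyarrowError as exc:
    # Reachable only when pyarrow is absent or older than 3.0.0.
    print(f"cannot serialize to parquet: {exc}")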
