
Commit 9cd7554

plamut and tswast authored
deps!: BigQuery Storage and pyarrow are required dependencies (#776)
* process: make BQ Storage and pyarrow required
* Make pyarrow required in _pandas_helpers.py
* Make pyarrow required in client.py
* Make pyarrow required in table.py
* Make pyarrow required in job/query.py
* Make pyarrow required in DBAPI tests
* Make pyarrow required in snippets tests
* Make BQ Storage required in client.py
* Make BQ Storage required in table.py
* Make BQ Storage required in DB API tests
* Make BQ Storage required in magics.py
* Make BQ Storage required in test__helpers.py
* Make BQ Storage required in test__pandas_helpers.py
* Make BQ Storage required in test_query_pandas.py
* Make method signatures compatible again: the annotations caused a mismatch
* Remove checks for minimum BQ Storage version: now that BQ Storage is a required dependency, there should be no more pip quirks that allowed installing it as an extra without respecting its minimum version pin
* Remove LegacyBigQueryStorageError: since this ships in a major version bump, it can be a breaking change without a deprecation period
* Bump minimum pyarrow version to 3.0.0
* Remove unneeded pytest.importorskip for BQ Storage
* Remove pyarrow version checks in pandas helpers tests
* Conditionally skip pandas tests where needed
* Remove unneeded conditional pyarrow version paths
* Cover schema autodetect failed code path in test
* Fix bad merge

Co-authored-by: Tim Swast <[email protected]>
1 parent c293e3c commit 9cd7554

22 files changed: 196 additions and 998 deletions

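Because this commit makes google-cloud-bigquery-storage and pyarrow required dependencies, downstream code no longer needs the optional-import guards that the hunks below remove from the library itself, and anything importing the deleted LegacyBigQueryStorageError has to drop that import. A minimal before/after sketch (the downstream module is hypothetical; the guard pattern mirrors the ones removed below):

    # Before: treat BigQuery Storage as an optional extra.
    try:
        from google.cloud import bigquery_storage
    except ImportError:
        bigquery_storage = None

    # After: both packages install together with google-cloud-bigquery, so the
    # imports can be unconditional, and
    # google.cloud.bigquery.exceptions.LegacyBigQueryStorageError no longer exists.
    import pyarrow
    from google.cloud import bigquery_storage
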
docs/snippets.py

4 deletions

@@ -30,10 +30,6 @@
     import pandas
 except (ImportError, AttributeError):
     pandas = None
-try:
-    import pyarrow
-except (ImportError, AttributeError):
-    pyarrow = None

 from google.api_core.exceptions import InternalServerError
 from google.api_core.exceptions import ServiceUnavailable

google/cloud/bigquery/__init__.py

3 deletions

@@ -42,7 +42,6 @@
 from google.cloud.bigquery.enums import KeyResultStatementKind
 from google.cloud.bigquery.enums import SqlTypeNames
 from google.cloud.bigquery.enums import StandardSqlDataTypes
-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.external_config import ExternalConfig
 from google.cloud.bigquery.external_config import BigtableOptions
 from google.cloud.bigquery.external_config import BigtableColumnFamily

@@ -171,8 +170,6 @@
     "WriteDisposition",
     # EncryptionConfiguration
     "EncryptionConfiguration",
-    # Custom exceptions
-    "LegacyBigQueryStorageError",
 ]
google/cloud/bigquery/_helpers.py

26 deletions

@@ -28,8 +28,6 @@
 from google.cloud._helpers import _to_bytes
 import packaging.version

-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
-

 _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
 _TIMEONLY_WO_MICROS = "%H:%M:%S"

@@ -41,7 +39,6 @@
     re.VERBOSE,
 )

-_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
 _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")

@@ -75,29 +72,6 @@ def is_read_session_optional(self) -> bool:
         """
         return self.installed_version >= _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION

-    def verify_version(self):
-        """Verify that a recent enough version of BigQuery Storage extra is
-        installed.
-
-        The function assumes that google-cloud-bigquery-storage extra is
-        installed, and should thus be used in places where this assumption
-        holds.
-
-        Because `pip` can install an outdated version of this extra despite the
-        constraints in `setup.py`, the calling code can use this helper to
-        verify the version compatibility at runtime.
-
-        Raises:
-            LegacyBigQueryStorageError:
-                If the google-cloud-bigquery-storage package is outdated.
-        """
-        if self.installed_version < _MIN_BQ_STORAGE_VERSION:
-            msg = (
-                "Dependency google-cloud-bigquery-storage is outdated, please upgrade "
-                f"it to version >= 2.0.0 (version found: {self.installed_version})."
-            )
-            raise LegacyBigQueryStorageError(msg)
-

 BQ_STORAGE_VERSIONS = BQStorageVersions()

google/cloud/bigquery/_pandas_helpers.py

48 additions, 74 deletions

@@ -20,18 +20,13 @@
 import queue
 import warnings

-from packaging import version
-
 try:
     import pandas
 except ImportError:  # pragma: NO COVER
     pandas = None

-try:
-    import pyarrow
-    import pyarrow.parquet
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
+import pyarrow
+import pyarrow.parquet

 try:
     from google.cloud.bigquery_storage import ArrowSerializationOptions

@@ -106,63 +101,52 @@ def pyarrow_timestamp():
     return pyarrow.timestamp("us", tz="UTC")


-if pyarrow:
-    # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
-    # When modifying it be sure to update it there as well.
-    BQ_TO_ARROW_SCALARS = {
-        "BOOL": pyarrow.bool_,
-        "BOOLEAN": pyarrow.bool_,
-        "BYTES": pyarrow.binary,
-        "DATE": pyarrow.date32,
-        "DATETIME": pyarrow_datetime,
-        "FLOAT": pyarrow.float64,
-        "FLOAT64": pyarrow.float64,
-        "GEOGRAPHY": pyarrow.string,
-        "INT64": pyarrow.int64,
-        "INTEGER": pyarrow.int64,
-        "NUMERIC": pyarrow_numeric,
-        "STRING": pyarrow.string,
-        "TIME": pyarrow_time,
-        "TIMESTAMP": pyarrow_timestamp,
-    }
-    ARROW_SCALAR_IDS_TO_BQ = {
-        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
-        pyarrow.bool_().id: "BOOL",
-        pyarrow.int8().id: "INT64",
-        pyarrow.int16().id: "INT64",
-        pyarrow.int32().id: "INT64",
-        pyarrow.int64().id: "INT64",
-        pyarrow.uint8().id: "INT64",
-        pyarrow.uint16().id: "INT64",
-        pyarrow.uint32().id: "INT64",
-        pyarrow.uint64().id: "INT64",
-        pyarrow.float16().id: "FLOAT64",
-        pyarrow.float32().id: "FLOAT64",
-        pyarrow.float64().id: "FLOAT64",
-        pyarrow.time32("ms").id: "TIME",
-        pyarrow.time64("ns").id: "TIME",
-        pyarrow.timestamp("ns").id: "TIMESTAMP",
-        pyarrow.date32().id: "DATE",
-        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
-        pyarrow.binary().id: "BYTES",
-        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
-        # The exact scale and precision don't matter, see below.
-        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
-    }
-
-    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
-        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
-        # The exact decimal's scale and precision are not important, as only
-        # the type ID matters, and it's the same for all decimal256 instances.
-        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
-        _BIGNUMERIC_SUPPORT = True
-    else:
-        _BIGNUMERIC_SUPPORT = False
-
-else:  # pragma: NO COVER
-    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
-    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO_COVER
-    _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER
+# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
+# When modifying it be sure to update it there as well.
+BQ_TO_ARROW_SCALARS = {
+    "BIGNUMERIC": pyarrow_bignumeric,
+    "BOOL": pyarrow.bool_,
+    "BOOLEAN": pyarrow.bool_,
+    "BYTES": pyarrow.binary,
+    "DATE": pyarrow.date32,
+    "DATETIME": pyarrow_datetime,
+    "FLOAT": pyarrow.float64,
+    "FLOAT64": pyarrow.float64,
+    "GEOGRAPHY": pyarrow.string,
+    "INT64": pyarrow.int64,
+    "INTEGER": pyarrow.int64,
+    "NUMERIC": pyarrow_numeric,
+    "STRING": pyarrow.string,
+    "TIME": pyarrow_time,
+    "TIMESTAMP": pyarrow_timestamp,
+}
+ARROW_SCALAR_IDS_TO_BQ = {
+    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
+    pyarrow.bool_().id: "BOOL",
+    pyarrow.int8().id: "INT64",
+    pyarrow.int16().id: "INT64",
+    pyarrow.int32().id: "INT64",
+    pyarrow.int64().id: "INT64",
+    pyarrow.uint8().id: "INT64",
+    pyarrow.uint16().id: "INT64",
+    pyarrow.uint32().id: "INT64",
+    pyarrow.uint64().id: "INT64",
+    pyarrow.float16().id: "FLOAT64",
+    pyarrow.float32().id: "FLOAT64",
+    pyarrow.float64().id: "FLOAT64",
+    pyarrow.time32("ms").id: "TIME",
+    pyarrow.time64("ns").id: "TIME",
+    pyarrow.timestamp("ns").id: "TIMESTAMP",
+    pyarrow.date32().id: "DATE",
+    pyarrow.date64().id: "DATETIME",  # because millisecond resolution
+    pyarrow.binary().id: "BYTES",
+    pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
+    # The exact scale and precision don't matter, see below.
+    pyarrow.decimal128(38, scale=9).id: "NUMERIC",
+    # The exact decimal's scale and precision are not important, as only
+    # the type ID matters, and it's the same for all decimal256 instances.
+    pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
+}


 def bq_to_arrow_struct_data_type(field):

@@ -346,13 +330,6 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
     # If schema detection was not successful for all columns, also try with
     # pyarrow, if available.
     if unknown_type_fields:
-        if not pyarrow:
-            msg = u"Could not determine the type of columns: {}".format(
-                ", ".join(field.name for field in unknown_type_fields)
-            )
-            warnings.warn(msg)
-            return None  # We cannot detect the schema in full.
-
         # The augment_schema() helper itself will also issue unknown type
         # warnings if detection still fails for any of the fields.
         bq_schema_out = augment_schema(dataframe, bq_schema_out)

@@ -494,9 +471,6 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
             serializing method. Defaults to "SNAPPY".
             https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
     """
-    if pyarrow is None:
-        raise ValueError("pyarrow is required for BigQuery schema conversion.")
-
     bq_schema = schema._to_schema_fields(bq_schema)
     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
     pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)

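Because pyarrow 3.0.0 is now the minimum supported version, the BIGNUMERIC entries above sit directly in the two lookup tables instead of behind a runtime version check. A small illustrative sketch of the lookups (note this touches the private _pandas_helpers module; values are taken from the hunk above):

    import pyarrow

    from google.cloud.bigquery import _pandas_helpers

    # BigQuery type name -> pyarrow type factory; BIGNUMERIC maps to a decimal256 type.
    bignumeric_type = _pandas_helpers.BQ_TO_ARROW_SCALARS["BIGNUMERIC"]()

    # pyarrow type id -> BigQuery type name; scale and precision do not change the id.
    assert _pandas_helpers.ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, 38).id] == "BIGNUMERIC"
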
google/cloud/bigquery/client.py

7 additions, 60 deletions

@@ -27,19 +27,11 @@
 import json
 import math
 import os
-import packaging.version
 import tempfile
 from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
 import uuid
 import warnings

-try:
-    import pyarrow
-
-    _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
-
 from google import resumable_media  # type: ignore
 from google.resumable_media.requests import MultipartUpload
 from google.resumable_media.requests import ResumableUpload

@@ -53,26 +45,21 @@
 from google.cloud import exceptions  # pytype: disable=import-error
 from google.cloud.client import ClientWithProject  # pytype: disable=import-error

-try:
-    from google.cloud.bigquery_storage_v1.services.big_query_read.client import (
-        DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO,
-    )
-except ImportError:
-    DEFAULT_BQSTORAGE_CLIENT_INFO = None
+from google.cloud.bigquery_storage_v1.services.big_query_read.client import (
+    DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO,
+)

 from google.cloud.bigquery._helpers import _del_sub_prop
 from google.cloud.bigquery._helpers import _get_sub_prop
 from google.cloud.bigquery._helpers import _record_field_to_json
 from google.cloud.bigquery._helpers import _str_or_none
-from google.cloud.bigquery._helpers import BQ_STORAGE_VERSIONS
 from google.cloud.bigquery._helpers import _verify_job_config_type
 from google.cloud.bigquery._http import Connection
 from google.cloud.bigquery import _pandas_helpers
 from google.cloud.bigquery.dataset import Dataset
 from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
 from google.cloud.bigquery.enums import AutoRowIDs
-from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
 from google.cloud.bigquery.opentelemetry_tracing import create_span
 from google.cloud.bigquery import job
 from google.cloud.bigquery.job import (

@@ -121,9 +108,6 @@
 # https://github.com/googleapis/python-bigquery/issues/438
 _MIN_GET_QUERY_RESULTS_TIMEOUT = 120

-# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
-_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])
-

 class Project(object):
     """Wrapper for resource describing a BigQuery project.

@@ -483,17 +467,10 @@ def _ensure_bqstorage_client(
     ) -> Optional["google.cloud.bigquery_storage.BigQueryReadClient"]:
         """Create a BigQuery Storage API client using this client's credentials.

-        If a client cannot be created due to a missing or outdated dependency
-        `google-cloud-bigquery-storage`, raise a warning and return ``None``.
-
-        If the `bqstorage_client` argument is not ``None``, still perform the version
-        check and return the argument back to the caller if the check passes. If it
-        fails, raise a warning and return ``None``.
-
         Args:
             bqstorage_client:
-                An existing BigQuery Storage client instance to check for version
-                compatibility. If ``None``, a new instance is created and returned.
+                An existing BigQuery Storage client instance. If ``None``, a new
+                instance is created and returned.
             client_options:
                 Custom options used with a new BigQuery Storage client instance if one
                 is created.

@@ -504,20 +481,7 @@ def _ensure_bqstorage_client(
         Returns:
            A BigQuery Storage API client.
         """
-        try:
-            from google.cloud import bigquery_storage
-        except ImportError:
-            warnings.warn(
-                "Cannot create BigQuery Storage client, the dependency "
-                "google-cloud-bigquery-storage is not installed."
-            )
-            return None
-
-        try:
-            BQ_STORAGE_VERSIONS.verify_version()
-        except LegacyBigQueryStorageError as exc:
-            warnings.warn(str(exc))
-            return None
+        from google.cloud import bigquery_storage

         if bqstorage_client is None:
             bqstorage_client = bigquery_storage.BigQueryReadClient(

@@ -2496,7 +2460,7 @@ def load_table_from_dataframe(
                 :attr:`~google.cloud.bigquery.job.LoadJobConfig.schema` with
                 column names matching those of the dataframe. The BigQuery
                 schema is used to determine the correct data type conversion.
-                Indexes are not loaded. Requires the :mod:`pyarrow` library.
+                Indexes are not loaded.

                 By default, this method uses the parquet source format. To
                 override this, supply a value for

@@ -2526,9 +2490,6 @@
             google.cloud.bigquery.job.LoadJob: A new load job.

         Raises:
-            ValueError:
-                If a usable parquet engine cannot be found. This method
-                requires :mod:`pyarrow` to be installed.
             TypeError:
                 If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.LoadJobConfig`
                 class.

@@ -2556,10 +2517,6 @@
                 )
             )

-        if pyarrow is None and job_config.source_format == job.SourceFormat.PARQUET:
-            # pyarrow is now the only supported parquet engine.
-            raise ValueError("This method requires pyarrow to be installed")
-
         if location is None:
             location = self.location

@@ -2615,16 +2572,6 @@ def load_table_from_dataframe(
         try:

             if job_config.source_format == job.SourceFormat.PARQUET:
-                if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
-                    msg = (
-                        "Loading dataframe data in PARQUET format with pyarrow "
-                        f"{_PYARROW_VERSION} can result in data corruption. It is "
-                        "therefore *strongly* advised to use a different pyarrow "
-                        "version or a different source format. "
-                        "See: https://github.com/googleapis/python-bigquery/issues/781"
-                    )
-                    warnings.warn(msg, category=RuntimeWarning)
-
                 if job_config.schema:
                     if parquet_compression == "snappy":  # adjust the default value
                         parquet_compression = parquet_compression.upper()

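With the pyarrow guards gone from client.py, calling load_table_from_dataframe with the default parquet source format needs no extra checks on the caller's side either. A minimal usage sketch (project, dataset, and table names are placeholders):

    import pandas

    from google.cloud import bigquery

    client = bigquery.Client()
    dataframe = pandas.DataFrame({"name": ["alpha", "beta"], "value": [1, 2]})

    # The default source format is PARQUET; pyarrow is always available now.
    load_job = client.load_table_from_dataframe(
        dataframe, "my-project.my_dataset.my_table"
    )
    load_job.result()  # wait for the load job to complete
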