22 | 22 | import queue
23 | 23 | import warnings
24 | 24 |
    | 25 | +from packaging import version
    | 26 | +
    | 27 | +from google.cloud.bigquery import _helpers
    | 28 | +from google.cloud.bigquery import schema
    | 29 | +
25 | 30 | try:
26 | 31 |     import pandas  # type: ignore
27 | 32 |

43 | 48 |     db_dtypes_import_exception = exc
44 | 49 |     date_dtype_name = time_dtype_name = ""  # Use '' rather than None because pytype
45 | 50 |
46 |    | -
47 |    | -import pyarrow  # type: ignore
48 |    | -import pyarrow.parquet  # type: ignore
    | 51 | +pyarrow = _helpers.PYARROW_VERSIONS.try_import()
49 | 52 |
50 | 53 | try:
51 | 54 |     # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
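With this change, the unconditional `import pyarrow` becomes a guarded lookup, so the module imports cleanly even when pyarrow is absent. A minimal sketch of what a `try_import` helper like this can look like — hedged, since the real `_helpers.PYARROW_VERSIONS` also enforces a minimum supported version, which is omitted here:

```python
# Illustrative sketch only; the real helper lives in
# google.cloud.bigquery._helpers and also validates version bounds.
class _PyarrowVersions:
    def try_import(self, raise_if_error: bool = False):
        """Return the pyarrow module, or None when it is not installed."""
        try:
            import pyarrow  # type: ignore
        except ImportError as exc:
            if raise_if_error:
                # Required code paths fail fast with an actionable message.
                raise ImportError("pyarrow is required; pip install pyarrow") from exc
            return None
        return pyarrow


PYARROW_VERSIONS = _PyarrowVersions()
```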
|
@@ -77,10 +80,6 @@ def _to_wkb(v):
77 | 80 |     # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
78 | 81 |     _ARROW_COMPRESSION_SUPPORT = True
79 | 82 |
80 |    | -from google.cloud.bigquery import _helpers
81 |    | -from google.cloud.bigquery import schema
82 |    | -
83 |    | -
84 | 83 | _LOGGER = logging.getLogger(__name__)
85 | 84 |
86 | 85 | _PROGRESS_INTERVAL = 0.2  # Maximum time between download status checks, in seconds.
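The context lines above belong to the BQ Storage feature-detection block; the relocated imports now live at the top of the module instead of below it. For orientation, a hedged reconstruction of the try/except idiom those context lines sit in (the frame itself is outside this hunk, so its exact shape is an assumption):

```python
# Assumed surrounding frame; only the `else` body appears in the hunk above.
try:
    from google.cloud import bigquery_storage  # type: ignore
except ImportError:
    bigquery_storage = None
    _ARROW_COMPRESSION_SUPPORT = False
else:
    # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
    _ARROW_COMPRESSION_SUPPORT = True
```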
|
@@ -141,52 +140,65 @@ def pyarrow_timestamp():
141 | 140 |     return pyarrow.timestamp("us", tz="UTC")
142 | 141 |
143 | 142 |
144 |     | -# This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
145 |     | -# When modifying it be sure to update it there as well.
146 |     | -BQ_TO_ARROW_SCALARS = {
147 |     | -    "BIGNUMERIC": pyarrow_bignumeric,
148 |     | -    "BOOL": pyarrow.bool_,
149 |     | -    "BOOLEAN": pyarrow.bool_,
150 |     | -    "BYTES": pyarrow.binary,
151 |     | -    "DATE": pyarrow.date32,
152 |     | -    "DATETIME": pyarrow_datetime,
153 |     | -    "FLOAT": pyarrow.float64,
154 |     | -    "FLOAT64": pyarrow.float64,
155 |     | -    "GEOGRAPHY": pyarrow.string,
156 |     | -    "INT64": pyarrow.int64,
157 |     | -    "INTEGER": pyarrow.int64,
158 |     | -    "NUMERIC": pyarrow_numeric,
159 |     | -    "STRING": pyarrow.string,
160 |     | -    "TIME": pyarrow_time,
161 |     | -    "TIMESTAMP": pyarrow_timestamp,
162 |     | -}
163 |     | -ARROW_SCALAR_IDS_TO_BQ = {
164 |     | -    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
165 |     | -    pyarrow.bool_().id: "BOOL",
166 |     | -    pyarrow.int8().id: "INT64",
167 |     | -    pyarrow.int16().id: "INT64",
168 |     | -    pyarrow.int32().id: "INT64",
169 |     | -    pyarrow.int64().id: "INT64",
170 |     | -    pyarrow.uint8().id: "INT64",
171 |     | -    pyarrow.uint16().id: "INT64",
172 |     | -    pyarrow.uint32().id: "INT64",
173 |     | -    pyarrow.uint64().id: "INT64",
174 |     | -    pyarrow.float16().id: "FLOAT64",
175 |     | -    pyarrow.float32().id: "FLOAT64",
176 |     | -    pyarrow.float64().id: "FLOAT64",
177 |     | -    pyarrow.time32("ms").id: "TIME",
178 |     | -    pyarrow.time64("ns").id: "TIME",
179 |     | -    pyarrow.timestamp("ns").id: "TIMESTAMP",
180 |     | -    pyarrow.date32().id: "DATE",
181 |     | -    pyarrow.date64().id: "DATETIME",  # because millisecond resolution
182 |     | -    pyarrow.binary().id: "BYTES",
183 |     | -    pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
184 |     | -    # The exact scale and precision don't matter, see below.
185 |     | -    pyarrow.decimal128(38, scale=9).id: "NUMERIC",
186 |     | -    # The exact decimal's scale and precision are not important, as only
187 |     | -    # the type ID matters, and it's the same for all decimal256 instances.
188 |     | -    pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
189 |     | -}
    | 143 | +if pyarrow:
    | 144 | +    # This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
    | 145 | +    # When modifying it be sure to update it there as well.
    | 146 | +    BQ_TO_ARROW_SCALARS = {
    | 147 | +        "BOOL": pyarrow.bool_,
    | 148 | +        "BOOLEAN": pyarrow.bool_,
    | 149 | +        "BYTES": pyarrow.binary,
    | 150 | +        "DATE": pyarrow.date32,
    | 151 | +        "DATETIME": pyarrow_datetime,
    | 152 | +        "FLOAT": pyarrow.float64,
    | 153 | +        "FLOAT64": pyarrow.float64,
    | 154 | +        "GEOGRAPHY": pyarrow.string,
    | 155 | +        "INT64": pyarrow.int64,
    | 156 | +        "INTEGER": pyarrow.int64,
    | 157 | +        "NUMERIC": pyarrow_numeric,
    | 158 | +        "STRING": pyarrow.string,
    | 159 | +        "TIME": pyarrow_time,
    | 160 | +        "TIMESTAMP": pyarrow_timestamp,
    | 161 | +    }
    | 162 | +    ARROW_SCALAR_IDS_TO_BQ = {
    | 163 | +        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
    | 164 | +        pyarrow.bool_().id: "BOOL",
    | 165 | +        pyarrow.int8().id: "INT64",
    | 166 | +        pyarrow.int16().id: "INT64",
    | 167 | +        pyarrow.int32().id: "INT64",
    | 168 | +        pyarrow.int64().id: "INT64",
    | 169 | +        pyarrow.uint8().id: "INT64",
    | 170 | +        pyarrow.uint16().id: "INT64",
    | 171 | +        pyarrow.uint32().id: "INT64",
    | 172 | +        pyarrow.uint64().id: "INT64",
    | 173 | +        pyarrow.float16().id: "FLOAT64",
    | 174 | +        pyarrow.float32().id: "FLOAT64",
    | 175 | +        pyarrow.float64().id: "FLOAT64",
    | 176 | +        pyarrow.time32("ms").id: "TIME",
    | 177 | +        pyarrow.time64("ns").id: "TIME",
    | 178 | +        pyarrow.timestamp("ns").id: "TIMESTAMP",
    | 179 | +        pyarrow.date32().id: "DATE",
    | 180 | +        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
    | 181 | +        pyarrow.binary().id: "BYTES",
    | 182 | +        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
    | 183 | +        # The exact scale and precision don't matter, see below.
    | 184 | +        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
    | 185 | +    }
    | 186 | +
    | 187 | +    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
    | 188 | +        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
    | 189 | +        # The exact decimal's scale and precision are not important, as only
    | 190 | +        # the type ID matters, and it's the same for all decimal256 instances.
    | 191 | +        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
    | 192 | +        _BIGNUMERIC_SUPPORT = True
    | 193 | +    else:
    | 194 | +        _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER
    | 195 | +
    | 196 | +else:  # pragma: NO COVER
    | 197 | +    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
    | 198 | +    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO COVER
    | 199 | +    _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER
    | 200 | +
    | 201 | +
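The mappings are now built only when pyarrow imported successfully, and BIGNUMERIC support is gated on pyarrow >= 3.0.0, the version the diff requires for `decimal256`. A hedged usage sketch of the result — `bq_scalar_to_arrow` is an invented name, and the snippet assumes the `pyarrow`, `BQ_TO_ARROW_SCALARS`, and `_BIGNUMERIC_SUPPORT` names defined above are in scope:

```python
def bq_scalar_to_arrow(bq_type_name: str):
    """Return the pyarrow.DataType for a BigQuery scalar type, or None."""
    factory = BQ_TO_ARROW_SCALARS.get(bq_type_name.upper())
    # Entries hold factory callables (e.g. pyarrow.int64), not type instances.
    return factory() if factory is not None else None


assert bq_scalar_to_arrow("INT64") == pyarrow.int64()
if not _BIGNUMERIC_SUPPORT:
    # On pyarrow < 3.0.0 the key was never added, so the lookup misses cleanly.
    assert bq_scalar_to_arrow("BIGNUMERIC") is None
```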
190 | 202 | BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
191 | 203 |     "GEOGRAPHY": {
192 | 204 |         b"ARROW:extension:name": b"google:sqlType:geography",

@@ -480,6 +492,13 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
480 | 492 |     # If schema detection was not successful for all columns, also try with
481 | 493 |     # pyarrow, if available.
482 | 494 |     if unknown_type_fields:
    | 495 | +        if not pyarrow:
    | 496 | +            msg = "Could not determine the type of columns: {}".format(
    | 497 | +                ", ".join(field.name for field in unknown_type_fields)
    | 498 | +            )
    | 499 | +            warnings.warn(msg)
    | 500 | +            return None  # We cannot detect the schema in full.
    | 501 | +
483 | 502 |         # The augment_schema() helper itself will also issue unknown type
484 | 503 |         # warnings if detection still fails for any of the fields.
485 | 504 |         bq_schema_out = augment_schema(dataframe, bq_schema_out)
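A hedged illustration of the behavior this hunk adds, assuming the module is `google.cloud.bigquery._pandas_helpers` and pyarrow is not installed: detection of object-dtype columns now warns and returns None instead of failing on the missing dependency. The sample DataFrame is invented:

```python
import warnings

import pandas
from google.cloud.bigquery import _pandas_helpers

# "anything" has object dtype, so it needs pyarrow-based type detection.
df = pandas.DataFrame({"ints": [1, 2], "anything": [object(), object()]})

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    detected = _pandas_helpers.dataframe_to_bq_schema(df, bq_schema=None)

if detected is None:
    # e.g. "Could not determine the type of columns: anything"
    print(caught[-1].message)
```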
|
@@ -654,6 +673,8 @@ def dataframe_to_parquet(
654 | 673 |
655 | 674 |         This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
656 | 675 |     """
    | 676 | +    pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
    | 677 | +
657 | 678 |     import pyarrow.parquet  # type: ignore
658 | 679 |
659 | 680 |     kwargs = (