@@ -20,18 +20,13 @@
 import queue
 import warnings
 
-from packaging import version
-
 try:
     import pandas
 except ImportError:  # pragma: NO COVER
     pandas = None
 
-try:
-    import pyarrow
-    import pyarrow.parquet
-except ImportError:  # pragma: NO COVER
-    pyarrow = None
+import pyarrow
+import pyarrow.parquet
 
 try:
     from google.cloud.bigquery_storage import ArrowSerializationOptions
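Note: pyarrow and pyarrow.parquet are now imported unconditionally, so pyarrow becomes a hard dependency of this module rather than an optional one that degrades to `pyarrow = None`. A minimal illustration of the behavioral difference, with the module path assumed rather than taken from this diff:

```python
# Illustrative only: with the try/except removed, a missing pyarrow now
# surfaces as an ImportError at module import time, not as disabled
# features at the call sites that used to check `if pyarrow`.
try:
    from google.cloud.bigquery import _pandas_helpers  # assumed module path
except ImportError as exc:
    print(f"hard dependency missing: {exc}")
```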
@@ -106,63 +101,52 @@ def pyarrow_timestamp():
     return pyarrow.timestamp("us", tz="UTC")
 
 
-if pyarrow:
-    # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
-    # When modifying it be sure to update it there as well.
-    BQ_TO_ARROW_SCALARS = {
-        "BOOL": pyarrow.bool_,
-        "BOOLEAN": pyarrow.bool_,
-        "BYTES": pyarrow.binary,
-        "DATE": pyarrow.date32,
-        "DATETIME": pyarrow_datetime,
-        "FLOAT": pyarrow.float64,
-        "FLOAT64": pyarrow.float64,
-        "GEOGRAPHY": pyarrow.string,
-        "INT64": pyarrow.int64,
-        "INTEGER": pyarrow.int64,
-        "NUMERIC": pyarrow_numeric,
-        "STRING": pyarrow.string,
-        "TIME": pyarrow_time,
-        "TIMESTAMP": pyarrow_timestamp,
-    }
-    ARROW_SCALAR_IDS_TO_BQ = {
-        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
-        pyarrow.bool_().id: "BOOL",
-        pyarrow.int8().id: "INT64",
-        pyarrow.int16().id: "INT64",
-        pyarrow.int32().id: "INT64",
-        pyarrow.int64().id: "INT64",
-        pyarrow.uint8().id: "INT64",
-        pyarrow.uint16().id: "INT64",
-        pyarrow.uint32().id: "INT64",
-        pyarrow.uint64().id: "INT64",
-        pyarrow.float16().id: "FLOAT64",
-        pyarrow.float32().id: "FLOAT64",
-        pyarrow.float64().id: "FLOAT64",
-        pyarrow.time32("ms").id: "TIME",
-        pyarrow.time64("ns").id: "TIME",
-        pyarrow.timestamp("ns").id: "TIMESTAMP",
-        pyarrow.date32().id: "DATE",
-        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
-        pyarrow.binary().id: "BYTES",
-        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
-        # The exact scale and precision don't matter, see below.
-        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
-    }
-
-    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
-        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
-        # The exact decimal's scale and precision are not important, as only
-        # the type ID matters, and it's the same for all decimal256 instances.
-        ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
-        _BIGNUMERIC_SUPPORT = True
-    else:
-        _BIGNUMERIC_SUPPORT = False
-
-else:  # pragma: NO COVER
-    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER
-    ARROW_SCALAR_IDS_TO_BQ = {}  # pragma: NO_COVER
-    _BIGNUMERIC_SUPPORT = False  # pragma: NO COVER
+# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
+# When modifying it be sure to update it there as well.
+BQ_TO_ARROW_SCALARS = {
+    "BIGNUMERIC": pyarrow_bignumeric,
+    "BOOL": pyarrow.bool_,
+    "BOOLEAN": pyarrow.bool_,
+    "BYTES": pyarrow.binary,
+    "DATE": pyarrow.date32,
+    "DATETIME": pyarrow_datetime,
+    "FLOAT": pyarrow.float64,
+    "FLOAT64": pyarrow.float64,
+    "GEOGRAPHY": pyarrow.string,
+    "INT64": pyarrow.int64,
+    "INTEGER": pyarrow.int64,
+    "NUMERIC": pyarrow_numeric,
+    "STRING": pyarrow.string,
+    "TIME": pyarrow_time,
+    "TIMESTAMP": pyarrow_timestamp,
+}
+ARROW_SCALAR_IDS_TO_BQ = {
+    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
+    pyarrow.bool_().id: "BOOL",
+    pyarrow.int8().id: "INT64",
+    pyarrow.int16().id: "INT64",
+    pyarrow.int32().id: "INT64",
+    pyarrow.int64().id: "INT64",
+    pyarrow.uint8().id: "INT64",
+    pyarrow.uint16().id: "INT64",
+    pyarrow.uint32().id: "INT64",
+    pyarrow.uint64().id: "INT64",
+    pyarrow.float16().id: "FLOAT64",
+    pyarrow.float32().id: "FLOAT64",
+    pyarrow.float64().id: "FLOAT64",
+    pyarrow.time32("ms").id: "TIME",
+    pyarrow.time64("ns").id: "TIME",
+    pyarrow.timestamp("ns").id: "TIMESTAMP",
+    pyarrow.date32().id: "DATE",
+    pyarrow.date64().id: "DATETIME",  # because millisecond resolution
+    pyarrow.binary().id: "BYTES",
+    pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
+    # The exact scale and precision don't matter, see below.
+    pyarrow.decimal128(38, scale=9).id: "NUMERIC",
+    # The exact decimal's scale and precision are not important, as only
+    # the type ID matters, and it's the same for all decimal256 instances.
+    pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
+}
 
 
 def bq_to_arrow_struct_data_type(field):
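Note: both tables are plain dicts, so type conversion reduces to a lookup. Also, by making the BIGNUMERIC entries unconditional (previously gated on `pyarrow >= 3.0.0`, where decimal256 first appeared), the module now assumes a pyarrow new enough to provide decimal256. A minimal sketch of how the two tables resolve types; the helper names here are hypothetical, not this module's API:

```python
import pyarrow

# Assumes BQ_TO_ARROW_SCALARS and ARROW_SCALAR_IDS_TO_BQ from the diff
# above are in scope.

def bq_scalar_to_arrow(bq_type):
    # Values are zero-argument factories (pyarrow.int64, pyarrow_timestamp,
    # ...), so the looked-up value must be called to get a DataType.
    return BQ_TO_ARROW_SCALARS[bq_type.upper()]()

def arrow_scalar_to_bq(arrow_type):
    # Keys are pyarrow type IDs; every decimal128 shares one ID regardless
    # of precision/scale, which is why decimal128(38, 9) stands in for all.
    return ARROW_SCALAR_IDS_TO_BQ.get(arrow_type.id)

assert bq_scalar_to_arrow("INT64") == pyarrow.int64()
assert arrow_scalar_to_bq(pyarrow.decimal128(10, 2)) == "NUMERIC"
assert arrow_scalar_to_bq(pyarrow.decimal256(50, 10)) == "BIGNUMERIC"
```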
@@ -346,13 +330,6 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
     # If schema detection was not successful for all columns, also try with
     # pyarrow, if available.
     if unknown_type_fields:
-        if not pyarrow:
-            msg = u"Could not determine the type of columns: {}".format(
-                ", ".join(field.name for field in unknown_type_fields)
-            )
-            warnings.warn(msg)
-            return None  # We cannot detect the schema in full.
-
         # The augment_schema() helper itself will also issue unknown type
         # warnings if detection still fails for any of the fields.
         bq_schema_out = augment_schema(dataframe, bq_schema_out)
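Note: the deleted branch was the "no pyarrow" escape hatch, where detection warned and returned None when pandas dtypes alone could not type every column. Now the pyarrow-backed augment_schema() fallback always runs. A hedged usage sketch, assuming bq_schema=None means "detect all columns":

```python
import decimal

import pandas

# "amount" gets the generic object dtype, so dtype-based detection cannot
# resolve it and it ends up in unknown_type_fields.
df = pandas.DataFrame(
    {
        "ints": [1, 2],
        "amount": [decimal.Decimal("1.25"), decimal.Decimal("2.50")],
    }
)

# Previously this could warn and return None without pyarrow installed;
# now it always proceeds to the pyarrow-based augment_schema() fallback.
bq_schema = dataframe_to_bq_schema(df, bq_schema=None)
```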
@@ -494,9 +471,6 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
             serializing method. Defaults to "SNAPPY".
             https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
     """
-    if pyarrow is None:
-        raise ValueError("pyarrow is required for BigQuery schema conversion.")
-
     bq_schema = schema._to_schema_fields(bq_schema)
     arrow_table = dataframe_to_arrow(dataframe, bq_schema)
     pyarrow.parquet.write_table(arrow_table, filepath, compression=parquet_compression)
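Note: dataframe_to_parquet() likewise drops its runtime guard, since pyarrow.parquet is importable whenever the module is. A short usage sketch under the signature shown above:

```python
import pandas
from google.cloud.bigquery import SchemaField

df = pandas.DataFrame({"name": ["a", "b"], "score": [1.0, 2.5]})
bq_schema = [
    SchemaField("name", "STRING"),
    SchemaField("score", "FLOAT64"),
]

# Coerces the DataFrame to the given BigQuery schema and writes it as
# Parquet; compression defaults to "SNAPPY" per the docstring above.
dataframe_to_parquet(df, bq_schema, "/tmp/scores.parquet")
```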