@@ -187,6 +187,49 @@ def bq_to_arrow_array(series, bq_field):
187
187
return pyarrow .array (series , type = arrow_type )
188
188
189
189
190
def get_column_or_index(dataframe, name):
    """Fetch a column, or failing that an index level, as a pandas Series.

    Columns take precedence over index levels of the same name. The
    returned Series always carries a fresh positional index
    (``reset_index(drop=True)``), so values coming from a column and from
    an index line up the same way.

    Args:
        dataframe (pandas.DataFrame): DataFrame to read from.
        name: Label of the column or index level to fetch.

    Returns:
        pandas.Series: The matching values with a default integer index.

    Raises:
        ValueError: If ``name`` is neither a column nor an index name.
    """
    if name in dataframe.columns:
        return dataframe[name].reset_index(drop=True)

    # Resolve which index object (if any) holds the requested values.
    index = dataframe.index
    if isinstance(index, pandas.MultiIndex):
        matched = index.get_level_values(name) if name in index.names else None
    else:
        matched = index if name == index.name else None

    if matched is None:
        raise ValueError("column or index '{}' not found.".format(name))

    return matched.to_series().reset_index(drop=True)
207
+
208
+
209
def list_columns_and_indexes(dataframe):
    """Return all named index levels and columns with their dtypes.

    Args:
        dataframe (pandas.DataFrame): DataFrame to inspect.

    Returns:
        Sequence[Tuple[str, dtype]]:
            ``(name, dtype)`` pairs. Index levels come first, in level
            order, followed by the columns in DataFrame order (the result
            is not sorted). An index level is omitted when its name is
            falsy (for example, an unnamed index) or when it has the same
            name as a column.
    """
    column_names = frozenset(dataframe.columns)
    index = dataframe.index

    if isinstance(index, pandas.MultiIndex):
        columns_and_indexes = [
            (name, index.get_level_values(name).dtype)
            for name in index.names
            if name and name not in column_names
        ]
    elif index.name and index.name not in column_names:
        columns_and_indexes = [(index.name, index.dtype)]
    else:
        columns_and_indexes = []

    columns_and_indexes.extend(zip(dataframe.columns, dataframe.dtypes))
    return columns_and_indexes
231
+
232
+
190
233
def dataframe_to_bq_schema (dataframe , bq_schema ):
191
234
"""Convert a pandas DataFrame schema to a BigQuery schema.
192
235
@@ -217,7 +260,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
217
260
bq_schema_unused = set ()
218
261
219
262
bq_schema_out = []
220
- for column , dtype in zip (dataframe . columns , dataframe . dtypes ):
263
+ for column , dtype in list_columns_and_indexes (dataframe ):
221
264
# Use provided type from schema, if present.
222
265
bq_field = bq_schema_index .get (column )
223
266
if bq_field :
@@ -229,7 +272,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
229
272
# pandas dtype.
230
273
bq_type = _PANDAS_DTYPE_TO_BQ .get (dtype .name )
231
274
if not bq_type :
232
- warnings .warn ("Unable to determine type of column '{}'." .format (column ))
275
+ warnings .warn (u "Unable to determine type of column '{}'." .format (column ))
233
276
return None
234
277
bq_field = schema .SchemaField (column , bq_type )
235
278
bq_schema_out .append (bq_field )
@@ -238,7 +281,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
238
281
# column, but it was not found.
239
282
if bq_schema_unused :
240
283
raise ValueError (
241
- "bq_schema contains fields not present in dataframe: {}" .format (
284
+ u "bq_schema contains fields not present in dataframe: {}" .format (
242
285
bq_schema_unused
243
286
)
244
287
)
@@ -261,20 +304,25 @@ def dataframe_to_arrow(dataframe, bq_schema):
261
304
BigQuery schema.
262
305
"""
263
306
column_names = set (dataframe .columns )
307
+ column_and_index_names = set (
308
+ name for name , _ in list_columns_and_indexes (dataframe )
309
+ )
264
310
bq_field_names = set (field .name for field in bq_schema )
265
311
266
- extra_fields = bq_field_names - column_names
312
+ extra_fields = bq_field_names - column_and_index_names
267
313
if extra_fields :
268
314
raise ValueError (
269
- "bq_schema contains fields not present in dataframe: {}" .format (
315
+ u "bq_schema contains fields not present in dataframe: {}" .format (
270
316
extra_fields
271
317
)
272
318
)
273
319
320
+ # It's okay for indexes to be missing from bq_schema, but it's not okay to
321
+ # be missing columns.
274
322
missing_fields = column_names - bq_field_names
275
323
if missing_fields :
276
324
raise ValueError (
277
- "bq_schema is missing fields from dataframe: {}" .format (missing_fields )
325
+ u "bq_schema is missing fields from dataframe: {}" .format (missing_fields )
278
326
)
279
327
280
328
arrow_arrays = []
@@ -283,7 +331,9 @@ def dataframe_to_arrow(dataframe, bq_schema):
283
331
for bq_field in bq_schema :
284
332
arrow_fields .append (bq_to_arrow_field (bq_field ))
285
333
arrow_names .append (bq_field .name )
286
- arrow_arrays .append (bq_to_arrow_array (dataframe [bq_field .name ], bq_field ))
334
+ arrow_arrays .append (
335
+ bq_to_arrow_array (get_column_or_index (dataframe , bq_field .name ), bq_field )
336
+ )
287
337
288
338
if all ((field is not None for field in arrow_fields )):
289
339
return pyarrow .Table .from_arrays (
0 commit comments