
Commit 3511e87

alixhami authored and Jon Wayne Parrott committed
BigQuery: Add ability to get query results as a Pandas dataframe. (#4354)
1 parent ad1174b commit 3511e87

File tree: 9 files changed, +437 -16 lines changed


bigquery/google/cloud/bigquery/_helpers.py (+1 -1)

@@ -468,7 +468,7 @@ def _rows_page_start(iterator, page, response):
     total_rows = response.get('totalRows')
     if total_rows is not None:
         total_rows = int(total_rows)
-    iterator.total_rows = total_rows
+    iterator._total_rows = total_rows
     # pylint: enable=unused-argument
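Note the rename to a private attribute here: the new RowIterator class (in table.py below) owns _total_rows and re-exposes it through a read-only total_rows property, so callers keep reading iterator.total_rows as before but can no longer overwrite it.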

bigquery/google/cloud/bigquery/client.py (+5 -12)

@@ -33,9 +33,6 @@
 from google.cloud.bigquery._helpers import DEFAULT_RETRY
 from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_ROW
-from google.cloud.bigquery._helpers import _field_to_index_mapping
-from google.cloud.bigquery._helpers import _item_to_row
-from google.cloud.bigquery._helpers import _rows_page_start
 from google.cloud.bigquery._helpers import _snake_to_camel_case
 from google.cloud.bigquery._http import Connection
 from google.cloud.bigquery.dataset import Dataset

@@ -48,6 +45,7 @@
 from google.cloud.bigquery.table import Table
 from google.cloud.bigquery.table import TableListItem
 from google.cloud.bigquery.table import TableReference
+from google.cloud.bigquery.table import RowIterator
 from google.cloud.bigquery.table import _TABLE_HAS_NO_SCHEMA
 from google.cloud.bigquery.table import _row_from_mapping

@@ -1189,7 +1187,7 @@ def list_rows(self, table, selected_fields=None, max_results=None,
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the RPC.

-        :rtype: :class:`~google.api_core.page_iterator.Iterator`
+        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
         :returns: Iterator of row data
             :class:`~google.cloud.bigquery.table.Row`-s. During each
             page, the iterator will have the ``total_rows`` attribute

@@ -1217,20 +1215,15 @@ def list_rows(self, table, selected_fields=None, max_results=None,
         if start_index is not None:
             params['startIndex'] = start_index

-        iterator = page_iterator.HTTPIterator(
+        row_iterator = RowIterator(
             client=self,
             api_request=functools.partial(self._call_api, retry),
             path='%s/data' % (table.path,),
-            item_to_value=_item_to_row,
-            items_key='rows',
+            schema=schema,
             page_token=page_token,
-            next_token='pageToken',
             max_results=max_results,
-            page_start=_rows_page_start,
             extra_params=params)
-        iterator.schema = schema
-        iterator._field_to_index = _field_to_index_mapping(schema)
-        return iterator
+        return row_iterator

     def list_partitions(self, table, retry=DEFAULT_RETRY):
         """List the partitions in a table.

bigquery/google/cloud/bigquery/job.py (+14 -1)

@@ -1929,7 +1929,7 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the call that retrieves rows.

-        :rtype: :class:`~google.api_core.page_iterator.Iterator`
+        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
         :returns:
             Iterator of row data :class:`~google.cloud.bigquery.table.Row`-s.
             During each page, the iterator will have the ``total_rows``

@@ -1949,6 +1949,19 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         return self._client.list_rows(dest_table, selected_fields=schema,
                                       retry=retry)

+    def to_dataframe(self):
+        """Return a pandas DataFrame from a QueryJob.
+
+        Returns:
+            A :class:`~pandas.DataFrame` populated with row data and column
+            headers from the query results. The column headers are derived
+            from the destination table's schema.
+
+        Raises:
+            ValueError: If the `pandas` library cannot be imported.
+        """
+        return self.result().to_dataframe()
+
     def __iter__(self):
         return iter(self.result())

bigquery/google/cloud/bigquery/table.py (+78 -0)

@@ -21,10 +21,19 @@
 import operator

 import six
+try:
+    import pandas
+except ImportError:  # pragma: NO COVER
+    pandas = None
+
+from google.api_core.page_iterator import HTTPIterator

 from google.cloud._helpers import _datetime_from_microseconds
 from google.cloud._helpers import _millis_from_datetime
+from google.cloud.bigquery._helpers import _item_to_row
+from google.cloud.bigquery._helpers import _rows_page_start
 from google.cloud.bigquery._helpers import _snake_to_camel_case
+from google.cloud.bigquery._helpers import _field_to_index_mapping
 from google.cloud.bigquery.schema import SchemaField
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource

@@ -1023,3 +1032,72 @@ def __repr__(self):
                      key=operator.itemgetter(1))
         f2i = '{' + ', '.join('%r: %d' % item for item in items) + '}'
         return 'Row({}, {})'.format(self._xxx_values, f2i)
+
+
+class RowIterator(HTTPIterator):
+    """A class for iterating through HTTP/JSON API row list responses.
+
+    Args:
+        client (google.cloud.bigquery.Client): The API client.
+        api_request (Callable[google.cloud._http.JSONConnection.api_request]):
+            The function to use to make API requests.
+        path (str): The method path to query for the list of items.
+        schema (list of :class:`~google.cloud.bigquery.schema.SchemaField`):
+            The table's schema.
+        page_token (str): A token identifying a page in a result set to start
+            fetching results from.
+        max_results (int): The maximum number of results to fetch.
+        extra_params (dict): Extra query string parameters for the API call.
+
+    .. autoattribute:: pages
+    """
+
+    def __init__(self, client, api_request, path, schema, page_token=None,
+                 max_results=None, extra_params=None):
+        super(RowIterator, self).__init__(
+            client, api_request, path, item_to_value=_item_to_row,
+            items_key='rows', page_token=page_token, max_results=max_results,
+            extra_params=extra_params, page_start=_rows_page_start,
+            next_token='pageToken')
+        self._schema = schema
+        self._field_to_index = _field_to_index_mapping(schema)
+        self._total_rows = None
+
+    @property
+    def schema(self):
+        """Schema for the table containing the rows.
+
+        Returns:
+            list of :class:`~google.cloud.bigquery.schema.SchemaField`:
+                fields describing the schema
+        """
+        return list(self._schema)
+
+    @property
+    def total_rows(self):
+        """The total number of rows in the table.
+
+        Returns:
+            int: the row count.
+        """
+        return self._total_rows
+
+    def to_dataframe(self):
+        """Create a pandas DataFrame from the query results.
+
+        Returns:
+            A :class:`~pandas.DataFrame` populated with row data and column
+            headers from the query results. The column headers are derived
+            from the destination table's schema.
+
+        Raises:
+            ValueError: If the `pandas` library cannot be imported.
+        """
+        if pandas is None:
+            raise ValueError('The pandas library is not installed, please '
+                             'install pandas to use the to_dataframe() '
+                             'function.')
+
+        column_headers = [field.name for field in self.schema]
+        rows = [row.values() for row in iter(self)]
+
+        return pandas.DataFrame(rows, columns=column_headers)
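Because list_rows() now returns this RowIterator, plain table reads get the same conversion path as query results. A minimal sketch, assuming an existing client and table; the dataset and table ids are placeholders:

table_ref = client.dataset('my_dataset').table('people')
table = client.get_table(table_ref)  # fetches the table's schema
rows = client.list_rows(table)       # RowIterator
df = rows.to_dataframe()             # column names come from rows.schema

Note that to_dataframe() iterates every page before building the frame, so a very large result set is materialized fully in memory.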

bigquery/nox.py (+2 -2)

@@ -36,7 +36,7 @@ def default(session):
     """
     # Install all test dependencies, then install this package in-place.
    session.install('mock', 'pytest', 'pytest-cov', *LOCAL_DEPS)
-    session.install('-e', '.')
+    session.install('-e', '.[pandas]')

     # Run py.test against the unit tests.
     session.run(

@@ -89,7 +89,7 @@ def system(session, py):
         os.path.join('..', 'storage'),
         os.path.join('..', 'test_utils'),
     )
-    session.install('-e', '.')
+    session.install('-e', '.[pandas]')

     # Run py.test against the system tests.
     session.run(

bigquery/setup.py (+5 -0)

@@ -58,6 +58,10 @@
     'requests >= 2.18.0',
 ]

+EXTRAS_REQUIREMENTS = {
+    'pandas': ['pandas >= 0.17.1'],
+}
+
 setup(
     name='google-cloud-bigquery',
     version='0.28.1.dev1',

@@ -69,5 +73,6 @@
     ],
     packages=find_packages(exclude=('tests*',)),
     install_requires=REQUIREMENTS,
+    extras_require=EXTRAS_REQUIREMENTS,
     **SETUP_BASE
 )
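With the extra declared, users opt in at install time with `pip install google-cloud-bigquery[pandas]`; a plain install stays pandas-free, and to_dataframe() raises ValueError until pandas is importable.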

bigquery/tests/system.py (+76 -0)

@@ -24,6 +24,10 @@
 import uuid

 import six
+try:
+    import pandas
+except ImportError:  # pragma: NO COVER
+    pandas = None

 from google.api_core.exceptions import PreconditionFailed
 from google.cloud import bigquery

@@ -1244,6 +1248,28 @@ def test_query_iter(self):
         row_tuples = [r.values() for r in query_job]
         self.assertEqual(row_tuples, [(1,)])

+    @unittest.skipIf(pandas is None, 'Requires `pandas`')
+    def test_query_results_to_dataframe(self):
+        QUERY = """
+            SELECT id, author, time_ts, dead
+            from `bigquery-public-data.hacker_news.comments`
+            LIMIT 10
+        """
+
+        df = Config.CLIENT.query(QUERY).result().to_dataframe()
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 10)  # verify the number of rows
+        column_names = ['id', 'author', 'time_ts', 'dead']
+        self.assertEqual(list(df), column_names)  # verify the column names
+        exp_datatypes = {'id': int, 'author': str,
+                         'time_ts': pandas.Timestamp, 'dead': bool}
+        for index, row in df.iterrows():
+            for col in column_names:
+                # all the schema fields are nullable, so None is acceptable
+                if row[col] is not None:
+                    self.assertIsInstance(row[col], exp_datatypes[col])
+
     def test_query_table_def(self):
         gs_url = self._write_csv_to_storage(
             'bq_external_test' + unique_resource_id(), 'person_ages.csv',

@@ -1419,6 +1445,56 @@ def test_create_table_rows_fetch_nested_schema(self):
             e_favtime = datetime.datetime(*parts[0:6])
         self.assertEqual(found[7], e_favtime)

+    def _fetch_dataframe(self, query):
+        return Config.CLIENT.query(query).result().to_dataframe()
+
+    @unittest.skipIf(pandas is None, 'Requires `pandas`')
+    def test_nested_table_to_dataframe(self):
+        SF = bigquery.SchemaField
+        schema = [
+            SF('string_col', 'STRING', mode='NULLABLE'),
+            SF('record_col', 'RECORD', mode='NULLABLE', fields=[
+                SF('nested_string', 'STRING', mode='NULLABLE'),
+                SF('nested_repeated', 'INTEGER', mode='REPEATED'),
+                SF('nested_record', 'RECORD', mode='NULLABLE', fields=[
+                    SF('nested_nested_string', 'STRING', mode='NULLABLE'),
+                ]),
+            ]),
+        ]
+        record = {
+            'nested_string': 'another string value',
+            'nested_repeated': [0, 1, 2],
+            'nested_record': {'nested_nested_string': 'some deep insight'},
+        }
+        to_insert = [
+            ('Some value', record)
+        ]
+        table_id = 'test_table'
+        dataset = self.temp_dataset(_make_dataset_id('nested_df'))
+        table_arg = Table(dataset.table(table_id), schema=schema)
+        table = retry_403(Config.CLIENT.create_table)(table_arg)
+        self.to_delete.insert(0, table)
+        Config.CLIENT.create_rows(table, to_insert)
+        QUERY = 'SELECT * from `{}.{}.{}`'.format(
+            Config.CLIENT.project, dataset.dataset_id, table_id)
+
+        retry = RetryResult(_has_rows, max_tries=8)
+        df = retry(self._fetch_dataframe)(QUERY)
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 1)  # verify the number of rows
+        exp_columns = ['string_col', 'record_col']
+        self.assertEqual(list(df), exp_columns)  # verify the column names
+        row = df.iloc[0]
+        # verify the row content
+        self.assertEqual(row['string_col'], 'Some value')
+        self.assertEqual(row['record_col'], record)
+        # verify that nested data can be accessed with indices/keys
+        self.assertEqual(row['record_col']['nested_repeated'][0], 0)
+        self.assertEqual(
+            row['record_col']['nested_record']['nested_nested_string'],
+            'some deep insight')
+
     def temp_dataset(self, dataset_id):
         dataset = retry_403(Config.CLIENT.create_dataset)(
             Dataset(Config.CLIENT.dataset(dataset_id)))
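As the nested-table test asserts, RECORD columns arrive in the DataFrame as plain Python dicts and REPEATED fields as lists, so nested values are reached with ordinary dict/list indexing on the cell rather than through flattened columns.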

bigquery/tests/unit/test_job.py (+39 -0)

@@ -16,6 +16,10 @@
 from six.moves import http_client
 import unittest
+try:
+    import pandas
+except ImportError:  # pragma: NO COVER
+    pandas = None

 from google.cloud.bigquery.job import ExtractJobConfig, CopyJobConfig
 from google.cloud.bigquery.job import LoadJobConfig

@@ -2720,6 +2724,41 @@ def test_reload_w_alternate_client(self):
         self.assertEqual(req['path'], PATH)
         self._verifyResourceProperties(job, RESOURCE)

+    @unittest.skipIf(pandas is None, 'Requires `pandas`')
+    def test_to_dataframe(self):
+        begun_resource = self._make_resource()
+        query_resource = {
+            'jobComplete': True,
+            'jobReference': {
+                'projectId': self.PROJECT,
+                'jobId': self.JOB_ID,
+            },
+            'schema': {
+                'fields': [
+                    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
+                    {'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
+                ],
+            },
+            'rows': [
+                {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
+                {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
+                {'f': [{'v': 'Wylma Phlyntstone'}, {'v': '29'}]},
+                {'f': [{'v': 'Bhettye Rhubble'}, {'v': '27'}]},
+            ],
+        }
+        done_resource = copy.deepcopy(begun_resource)
+        done_resource['status'] = {'state': 'DONE'}
+        connection = _Connection(
+            begun_resource, query_resource, done_resource, query_resource)
+        client = _make_client(project=self.PROJECT, connection=connection)
+        job = self._make_one(self.JOB_ID, self.QUERY, client)
+
+        df = job.to_dataframe()
+
+        self.assertIsInstance(df, pandas.DataFrame)
+        self.assertEqual(len(df), 4)  # verify the number of rows
+        self.assertEqual(list(df), ['name', 'age'])  # verify the column names
+
     def test_iter(self):
         import types
