Skip to content

Commit 1d58a73

Browse files
committed
moves to_dataframe() to RowIterator
1 parent 51801f3 commit 1d58a73

File tree

6 files changed

+192
-99
lines changed

6 files changed

+192
-99
lines changed

bigquery/google/cloud/bigquery/job.py

-27
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,6 @@
1919

2020
import six
2121
from six.moves import http_client
22-
try:
23-
import pandas
24-
except ImportError: # pragma: NO COVER
25-
pandas = None
2622

2723
import google.api_core.future.polling
2824
from google.cloud import exceptions
@@ -1953,29 +1949,6 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
19531949
return self._client.list_rows(dest_table, selected_fields=schema,
19541950
retry=retry)
19551951

1956-
def to_dataframe(self):
1957-
"""Create a pandas DataFrame from the query results.
1958-
1959-
Returns:
1960-
A :class:`~pandas.DataFrame` populated with row data and column
1961-
headers from the query results. The column headers are derived
1962-
from the destination table's schema.
1963-
1964-
Raises:
1965-
ValueError: If the `pandas` library cannot be imported.
1966-
1967-
"""
1968-
if pandas is None:
1969-
raise ValueError('The pandas library is not installed, please '
1970-
'install pandas to use the to_dataframe() '
1971-
'function.')
1972-
1973-
query_results = self.result()
1974-
column_headers = [field.name for field in query_results.schema]
1975-
rows = [row.values() for row in query_results]
1976-
1977-
return pandas.DataFrame(rows, columns=column_headers)
1978-
19791952
def __iter__(self):
    """Return an iterator over whatever ``self.result()`` returns."""
    rows = self.result()
    return iter(rows)
19811954

bigquery/google/cloud/bigquery/table.py

+26
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
import operator
2121

2222
import six
23+
try:
24+
import pandas
25+
except ImportError: # pragma: NO COVER
26+
pandas = None
2327

2428
from google.api_core.page_iterator import HTTPIterator
2529

@@ -869,3 +873,25 @@ def total_rows(self):
869873
int: the row count.
870874
"""
871875
return self._total_rows
876+
877+
def to_dataframe(self):
    """Create a pandas DataFrame from the rows of this iterator.

    Every row yielded by iterating this object is collected into a list
    before the DataFrame is built.

    Returns:
        A :class:`~pandas.DataFrame` with one row per item yielded by
        this iterator. The column headers are the names of the fields
        in ``self.schema``, in schema order.

    Raises:
        ValueError: If the `pandas` library cannot be imported.

    """
    if pandas is None:
        raise ValueError('The pandas library is not installed, please '
                         'install pandas to use the to_dataframe() '
                         'function.')

    column_headers = [field.name for field in self.schema]
    # A ``for`` clause already calls ``__iter__`` on its iterable, so
    # wrapping ``self`` in ``iter()`` first is unnecessary.
    rows = [row.values() for row in self]

    return pandas.DataFrame(rows, columns=column_headers)

bigquery/nox.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def system(session, py):
8989
os.path.join('..', 'storage'),
9090
os.path.join('..', 'test_utils'),
9191
)
92-
session.install('-e', '.')
92+
session.install('-e', '.[pandas]')
9393

9494
# Run py.test against the system tests.
9595
session.run(

bigquery/tests/system.py

+20
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
import uuid
2525

2626
import six
27+
try:
28+
import pandas
29+
except ImportError: # pragma: NO COVER
30+
pandas = None
2731

2832
from google.api_core.exceptions import PreconditionFailed
2933
from google.cloud import bigquery
@@ -1242,6 +1246,22 @@ def test_query_iter(self):
12421246
row_tuples = [r.values() for r in query_job]
12431247
self.assertEqual(row_tuples, [(1,)])
12441248

1249+
@unittest.skipIf(pandas is None, 'Requires `pandas`')
def test_query_results_to_dataframe(self):
    # Runs a LIMIT-ed query against a public sample table and checks
    # that the resulting rows convert to a DataFrame with the expected
    # length and column names.
    project = 'bigquery-public-data'
    dataset_id = 'samples'
    table_name = 'natality'
    row_limit = 1000
    expected_columns = ['year', 'weight_pounds']
    sql = 'SELECT year, weight_pounds from `{}.{}.{}` LIMIT {}'.format(
        project, dataset_id, table_name, row_limit)

    query_job = Config.CLIENT.query(sql)
    row_iterator = query_job.result()
    df = row_iterator.to_dataframe()

    self.assertIsInstance(df, pandas.DataFrame)
    # verify the number of rows
    self.assertEqual(len(df), row_limit)
    # verify the column names
    self.assertEqual(list(df), expected_columns)
1264+
12451265
def test_query_table_def(self):
12461266
gs_url = self._write_csv_to_storage(
12471267
'bq_external_test' + unique_resource_id(), 'person_ages.csv',

bigquery/tests/unit/test_job.py

-71
Original file line numberDiff line numberDiff line change
@@ -2724,77 +2724,6 @@ def test_reload_w_alternate_client(self):
27242724
self.assertEqual(req['path'], PATH)
27252725
self._verifyResourceProperties(job, RESOURCE)
27262726

2727-
@unittest.skipIf(pandas is None, 'Requires `pandas`')
2728-
def test_to_dataframe(self):
2729-
begun_resource = self._make_resource()
2730-
query_resource = {
2731-
'jobComplete': True,
2732-
'jobReference': {
2733-
'projectId': self.PROJECT,
2734-
'jobId': self.JOB_ID,
2735-
},
2736-
'schema': {
2737-
'fields': [
2738-
{'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
2739-
{'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
2740-
],
2741-
},
2742-
'rows': [
2743-
{'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
2744-
{'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
2745-
{'f': [{'v': 'Wylma Phlyntstone'}, {'v': '29'}]},
2746-
{'f': [{'v': 'Bhettye Rhubble'}, {'v': '27'}]},
2747-
],
2748-
}
2749-
done_resource = copy.deepcopy(begun_resource)
2750-
done_resource['status'] = {'state': 'DONE'}
2751-
connection = _Connection(
2752-
begun_resource, query_resource, done_resource, query_resource)
2753-
client = _make_client(project=self.PROJECT, connection=connection)
2754-
job = self._make_one(self.JOB_ID, self.QUERY, client)
2755-
df = job.to_dataframe()
2756-
2757-
self.assertIsInstance(df, pandas.DataFrame)
2758-
self.assertEqual(len(df), 4) # verify the number of rows
2759-
self.assertEqual(list(df), ['name', 'age']) # verify the column names
2760-
2761-
@unittest.skipIf(pandas is None, 'Requires `pandas`')
2762-
def test_to_dataframe_w_empty_results(self):
2763-
begun_resource = self._make_resource()
2764-
query_resource = {
2765-
'jobComplete': True,
2766-
'jobReference': {
2767-
'projectId': self.PROJECT,
2768-
'jobId': self.JOB_ID,
2769-
},
2770-
'schema': {
2771-
'fields': [
2772-
{'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
2773-
{'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
2774-
],
2775-
},
2776-
}
2777-
done_resource = copy.deepcopy(begun_resource)
2778-
done_resource['status'] = {'state': 'DONE'}
2779-
connection = _Connection(
2780-
begun_resource, query_resource, done_resource, query_resource)
2781-
client = _make_client(project=self.PROJECT, connection=connection)
2782-
job = self._make_one(self.JOB_ID, self.QUERY, client)
2783-
df = job.to_dataframe()
2784-
2785-
self.assertIsInstance(df, pandas.DataFrame)
2786-
self.assertEqual(len(df), 0) # verify the number of rows
2787-
self.assertEqual(list(df), ['name', 'age']) # verify the column names
2788-
2789-
@mock.patch('google.cloud.bigquery.job.pandas', new=None)
2790-
def test_to_dataframe_error_if_pandas_is_none(self):
2791-
connection = _Connection({})
2792-
client = _make_client(project=self.PROJECT, connection=connection)
2793-
job = self._make_one(self.JOB_ID, self.QUERY, client)
2794-
2795-
with self.assertRaises(ValueError):
2796-
job.to_dataframe()
2797-
27982727
def test_iter(self):
27992728
import types
28002729

bigquery/tests/unit/test_table.py

+145
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
import unittest
1616

1717
import mock
18+
import six
19+
try:
20+
import pandas
21+
except ImportError: # pragma: NO COVER
22+
pandas = None
1823

1924
from google.cloud.bigquery.dataset import DatasetReference
2025

@@ -773,3 +778,143 @@ def test_row(self):
773778
row.z
774779
with self.assertRaises(KeyError):
775780
row['z']
781+
782+
783+
class TestRowIterator(unittest.TestCase):
    """Unit tests for ``google.cloud.bigquery.table.RowIterator``."""

    # Request path handed to every iterator built by these tests.
    PATH = '/foo'

    @staticmethod
    def _get_target_class():
        """Return the class under test, imported lazily."""
        from google.cloud.bigquery.table import RowIterator

        return RowIterator

    def _make_one(self, *args, **kw):
        """Instantiate the class under test with the given arguments."""
        return self._get_target_class()(*args, **kw)

    def _make_one_with_rows(self, rows):
        """Build a ``RowIterator`` whose API response contains ``rows``.

        The iterator is given a two-field schema (``name``, ``age``).

        Args:
            rows: list of row resources to return under the ``'rows'``
                key of the mocked API response.

        Returns:
            A ``(row_iterator, api_request)`` tuple; ``api_request`` is
            the ``mock.Mock`` the iterator calls, so tests can assert on
            the requests that were made.
        """
        from google.cloud.bigquery.table import SchemaField
        from google.cloud.bigquery._helpers import _field_to_index_mapping

        schema = [
            SchemaField('name', 'STRING', mode='REQUIRED'),
            SchemaField('age', 'INTEGER', mode='REQUIRED'),
        ]
        api_request = mock.Mock(return_value={'rows': rows})
        row_iterator = self._make_one(
            mock.sentinel.client, api_request, path=self.PATH)
        # The constructor is called without a schema here, so the schema
        # and its name-to-index mapping are set on the instance directly.
        row_iterator._schema = schema
        row_iterator._field_to_index = _field_to_index_mapping(schema)
        return row_iterator, api_request

    def test_constructor(self):
        from google.cloud.bigquery._helpers import _item_to_row
        from google.cloud.bigquery._helpers import _rows_page_start

        client = mock.sentinel.client
        api_request = mock.sentinel.api_request
        iterator = self._make_one(client, api_request, self.PATH)

        self.assertFalse(iterator._started)
        self.assertIs(iterator.client, client)
        self.assertEqual(iterator.path, self.PATH)
        self.assertIs(iterator._item_to_value, _item_to_row)
        self.assertEqual(iterator._items_key, 'rows')
        self.assertIsNone(iterator.max_results)
        self.assertEqual(iterator.extra_params, {})
        self.assertEqual(iterator._page_start, _rows_page_start)
        # Changing attributes.
        self.assertEqual(iterator.page_number, 0)
        self.assertIsNone(iterator.next_page_token)
        self.assertEqual(iterator.num_results, 0)

    def test_iterate(self):
        rows = [
            {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
            {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
        ]
        row_iterator, api_request = self._make_one_with_rows(rows)
        self.assertEqual(row_iterator.num_results, 0)

        rows_iter = iter(row_iterator)

        val1 = six.next(rows_iter)
        self.assertEqual(val1.name, 'Phred Phlyntstone')
        self.assertEqual(row_iterator.num_results, 1)

        val2 = six.next(rows_iter)
        self.assertEqual(val2.name, 'Bharney Rhubble')
        self.assertEqual(row_iterator.num_results, 2)

        with self.assertRaises(StopIteration):
            six.next(rows_iter)

        api_request.assert_called_once_with(
            method='GET', path=self.PATH, query_params={})

    @unittest.skipIf(pandas is None, 'Requires `pandas`')
    def test_to_dataframe(self):
        rows = [
            {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
            {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
            {'f': [{'v': 'Wylma Phlyntstone'}, {'v': '29'}]},
            {'f': [{'v': 'Bhettye Rhubble'}, {'v': '27'}]},
        ]
        row_iterator, _ = self._make_one_with_rows(rows)

        df = row_iterator.to_dataframe()

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 4)  # verify the number of rows
        self.assertEqual(list(df), ['name', 'age'])  # verify the column names

    @unittest.skipIf(pandas is None, 'Requires `pandas`')
    def test_to_dataframe_w_empty_results(self):
        row_iterator, _ = self._make_one_with_rows([])

        df = row_iterator.to_dataframe()

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 0)  # verify the number of rows
        self.assertEqual(list(df), ['name', 'age'])  # verify the column names

    @mock.patch('google.cloud.bigquery.table.pandas', new=None)
    def test_to_dataframe_error_if_pandas_is_none(self):
        rows = [
            {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
            {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
        ]
        row_iterator, _ = self._make_one_with_rows(rows)

        with self.assertRaises(ValueError):
            row_iterator.to_dataframe()

0 commit comments

Comments
 (0)