
Commit 19635ba

Revert "BigQuery: Add ability to get query results as a Pandas dataframe. (#4354)"
This reverts commit 3511e87.
1 parent b5b473d · commit 19635ba

9 files changed (+16 −437 lines)


bigquery/google/cloud/bigquery/_helpers.py (+1 −1)

@@ -468,7 +468,7 @@ def _rows_page_start(iterator, page, response):
     total_rows = response.get('totalRows')
     if total_rows is not None:
         total_rows = int(total_rows)
-    iterator._total_rows = total_rows
+    iterator.total_rows = total_rows
 # pylint: enable=unused-argument
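The underscore is the point of this hunk: the reverted change had paired a private `_total_rows` with a read-only `total_rows` property on the `RowIterator` class (removed from `table.py` below), so the revert goes back to assigning the public attribute on a plain `HTTPIterator`. A rough usage sketch, with `client` and `table` as hypothetical stand-ins for a configured client and an existing table; the attribute only appears once `_rows_page_start` has run for the first fetched page:

    rows = client.list_rows(table)    # a plain page_iterator.HTTPIterator again
    # rows.total_rows is not set yet; _rows_page_start assigns it the
    # first time a page of results is fetched.
    first_page = next(rows.pages)
    print(rows.total_rows)            # int, or None if 'totalRows' was absent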

bigquery/google/cloud/bigquery/client.py (+12 −5)

@@ -33,6 +33,9 @@
 
 from google.cloud.bigquery._helpers import DEFAULT_RETRY
 from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_ROW
+from google.cloud.bigquery._helpers import _field_to_index_mapping
+from google.cloud.bigquery._helpers import _item_to_row
+from google.cloud.bigquery._helpers import _rows_page_start
 from google.cloud.bigquery._helpers import _snake_to_camel_case
 from google.cloud.bigquery._http import Connection
 from google.cloud.bigquery.dataset import Dataset
@@ -45,7 +48,6 @@
 from google.cloud.bigquery.table import Table
 from google.cloud.bigquery.table import TableListItem
 from google.cloud.bigquery.table import TableReference
-from google.cloud.bigquery.table import RowIterator
 from google.cloud.bigquery.table import _TABLE_HAS_NO_SCHEMA
 from google.cloud.bigquery.table import _row_from_mapping
 
@@ -1187,7 +1189,7 @@ def list_rows(self, table, selected_fields=None, max_results=None,
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the RPC.
 
-        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
+        :rtype: :class:`~google.api_core.page_iterator.Iterator`
         :returns: Iterator of row data
             :class:`~google.cloud.bigquery.table.Row`-s. During each
             page, the iterator will have the ``total_rows`` attribute
@@ -1215,15 +1217,20 @@ def list_rows(self, table, selected_fields=None, max_results=None,
         if start_index is not None:
             params['startIndex'] = start_index
 
-        row_iterator = RowIterator(
+        iterator = page_iterator.HTTPIterator(
             client=self,
             api_request=functools.partial(self._call_api, retry),
             path='%s/data' % (table.path,),
-            schema=schema,
+            item_to_value=_item_to_row,
+            items_key='rows',
             page_token=page_token,
+            next_token='pageToken',
             max_results=max_results,
+            page_start=_rows_page_start,
             extra_params=params)
-        return row_iterator
+        iterator.schema = schema
+        iterator._field_to_index = _field_to_index_mapping(schema)
+        return iterator
 
     def list_partitions(self, table, retry=DEFAULT_RETRY):
         """List the partitions in a table.

bigquery/google/cloud/bigquery/job.py (+1 −14)

@@ -1929,7 +1929,7 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the call that retrieves rows.
 
-        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
+        :rtype: :class:`~google.api_core.page_iterator.Iterator`
         :returns:
             Iterator of row data :class:`~google.cloud.bigquery.table.Row`-s.
             During each page, the iterator will have the ``total_rows``
@@ -1949,19 +1949,6 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         return self._client.list_rows(dest_table, selected_fields=schema,
                                       retry=retry)
 
-    def to_dataframe(self):
-        """Return a pandas DataFrame from a QueryJob
-
-        Returns:
-            A :class:`~pandas.DataFrame` populated with row data and column
-            headers from the query results. The column headers are derived
-            from the destination table's schema.
-
-        Raises:
-            ValueError: If the `pandas` library cannot be imported.
-        """
-        return self.result().to_dataframe()
-
     def __iter__(self):
         return iter(self.result())
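With `QueryJob.to_dataframe()` gone, a DataFrame has to be assembled by hand from `result()`. A sketch of a rough equivalent of the deleted convenience method, assuming pandas is installed separately (the `[pandas]` extra is dropped below) and with `client` and `QUERY` as hypothetical names:

    import pandas

    iterator = client.query(QUERY).result()
    headers = [field.name for field in iterator.schema]  # attached by list_rows()
    rows = [row.values() for row in iterator]
    df = pandas.DataFrame(rows, columns=headers)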

bigquery/google/cloud/bigquery/table.py (−78)

@@ -21,19 +21,10 @@
 import operator
 
 import six
-try:
-    import pandas
-except ImportError:  # pragma: NO COVER
-    pandas = None
-
-from google.api_core.page_iterator import HTTPIterator
 
 from google.cloud._helpers import _datetime_from_microseconds
 from google.cloud._helpers import _millis_from_datetime
-from google.cloud.bigquery._helpers import _item_to_row
-from google.cloud.bigquery._helpers import _rows_page_start
 from google.cloud.bigquery._helpers import _snake_to_camel_case
-from google.cloud.bigquery._helpers import _field_to_index_mapping
 from google.cloud.bigquery.schema import SchemaField
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
@@ -1032,72 +1023,3 @@ def __repr__(self):
                        key=operator.itemgetter(1))
         f2i = '{' + ', '.join('%r: %d' % item for item in items) + '}'
         return 'Row({}, {})'.format(self._xxx_values, f2i)
-
-
-class RowIterator(HTTPIterator):
-    """A class for iterating through HTTP/JSON API row list responses.
-
-    Args:
-        client (google.cloud.bigquery.Client): The API client.
-        api_request (Callable[google.cloud._http.JSONConnection.api_request]):
-            The function to use to make API requests.
-        path (str): The method path to query for the list of items.
-        page_token (str): A token identifying a page in a result set to start
-            fetching results from.
-        max_results (int): The maximum number of results to fetch.
-        extra_params (dict): Extra query string parameters for the API call.
-
-    .. autoattribute:: pages
-    """
-
-    def __init__(self, client, api_request, path, schema, page_token=None,
-                 max_results=None, extra_params=None):
-        super(RowIterator, self).__init__(
-            client, api_request, path, item_to_value=_item_to_row,
-            items_key='rows', page_token=page_token, max_results=max_results,
-            extra_params=extra_params, page_start=_rows_page_start,
-            next_token='pageToken')
-        self._schema = schema
-        self._field_to_index = _field_to_index_mapping(schema)
-        self._total_rows = None
-
-    @property
-    def schema(self):
-        """Schema for the table containing the rows
-
-        Returns:
-            list of :class:`~google.cloud.bigquery.schema.SchemaField`:
-                fields describing the schema
-        """
-        return list(self._schema)
-
-    @property
-    def total_rows(self):
-        """The total number of rows in the table.
-
-        Returns:
-            int: the row count.
-        """
-        return self._total_rows
-
-    def to_dataframe(self):
-        """Create a pandas DataFrame from the query results.
-
-        Returns:
-            A :class:`~pandas.DataFrame` populated with row data and column
-            headers from the query results. The column headers are derived
-            from the destination table's schema.
-
-        Raises:
-            ValueError: If the `pandas` library cannot be imported.
-
-        """
-        if pandas is None:
-            raise ValueError('The pandas library is not installed, please '
-                             'install pandas to use the to_dataframe() '
-                             'function.')
-
-        column_headers = [field.name for field in self.schema]
-        rows = [row.values() for row in iter(self)]
-
-        return pandas.DataFrame(rows, columns=column_headers)

bigquery/nox.py (+2 −2)

@@ -36,7 +36,7 @@ def default(session):
     """
     # Install all test dependencies, then install this package in-place.
    session.install('mock', 'pytest', 'pytest-cov', *LOCAL_DEPS)
-    session.install('-e', '.[pandas]')
+    session.install('-e', '.')
 
     # Run py.test against the unit tests.
     session.run(
@@ -89,7 +89,7 @@ def system(session, py):
         os.path.join('..', 'storage'),
         os.path.join('..', 'test_utils'),
     )
-    session.install('-e', '.[pandas]')
+    session.install('-e', '.')
 
     # Run py.test against the system tests.
     session.run(

bigquery/setup.py (−5)

@@ -58,10 +58,6 @@
     'requests >= 2.18.0',
 ]
 
-EXTRAS_REQUIREMENTS = {
-    'pandas': ['pandas >= 0.17.1'],
-}
-
 setup(
     name='google-cloud-bigquery',
     version='0.28.1.dev1',
@@ -73,6 +69,5 @@
     ],
     packages=find_packages(exclude=('tests*',)),
     install_requires=REQUIREMENTS,
-    extras_require=EXTRAS_REQUIREMENTS,
     **SETUP_BASE
 )
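Dropping `EXTRAS_REQUIREMENTS` retires the `pip install google-cloud-bigquery[pandas]` install target, which is why the `nox.py` sessions above fall back to a plain `session.install('-e', '.')`.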

bigquery/tests/system.py (−76)

@@ -24,10 +24,6 @@
 import uuid
 
 import six
-try:
-    import pandas
-except ImportError:  # pragma: NO COVER
-    pandas = None
 
 from google.api_core.exceptions import PreconditionFailed
 from google.cloud import bigquery
@@ -1248,28 +1244,6 @@ def test_query_iter(self):
         row_tuples = [r.values() for r in query_job]
         self.assertEqual(row_tuples, [(1,)])
 
-    @unittest.skipIf(pandas is None, 'Requires `pandas`')
-    def test_query_results_to_dataframe(self):
-        QUERY = """
-            SELECT id, author, time_ts, dead
-            from `bigquery-public-data.hacker_news.comments`
-            LIMIT 10
-        """
-
-        df = Config.CLIENT.query(QUERY).result().to_dataframe()
-
-        self.assertIsInstance(df, pandas.DataFrame)
-        self.assertEqual(len(df), 10)  # verify the number of rows
-        column_names = ['id', 'author', 'time_ts', 'dead']
-        self.assertEqual(list(df), column_names)  # verify the column names
-        exp_datatypes = {'id': int, 'author': str,
-                         'time_ts': pandas.Timestamp, 'dead': bool}
-        for index, row in df.iterrows():
-            for col in column_names:
-                # all the schema fields are nullable, so None is acceptable
-                if not row[col] is None:
-                    self.assertIsInstance(row[col], exp_datatypes[col])
-
     def test_query_table_def(self):
         gs_url = self._write_csv_to_storage(
             'bq_external_test' + unique_resource_id(), 'person_ages.csv',
@@ -1445,56 +1419,6 @@ def test_create_table_rows_fetch_nested_schema(self):
         e_favtime = datetime.datetime(*parts[0:6])
         self.assertEqual(found[7], e_favtime)
 
-    def _fetch_dataframe(self, query):
-        return Config.CLIENT.query(query).result().to_dataframe()
-
-    @unittest.skipIf(pandas is None, 'Requires `pandas`')
-    def test_nested_table_to_dataframe(self):
-        SF = bigquery.SchemaField
-        schema = [
-            SF('string_col', 'STRING', mode='NULLABLE'),
-            SF('record_col', 'RECORD', mode='NULLABLE', fields=[
-                SF('nested_string', 'STRING', mode='NULLABLE'),
-                SF('nested_repeated', 'INTEGER', mode='REPEATED'),
-                SF('nested_record', 'RECORD', mode='NULLABLE', fields=[
-                    SF('nested_nested_string', 'STRING', mode='NULLABLE'),
-                ]),
-            ]),
-        ]
-        record = {
-            'nested_string': 'another string value',
-            'nested_repeated': [0, 1, 2],
-            'nested_record': {'nested_nested_string': 'some deep insight'},
-        }
-        to_insert = [
-            ('Some value', record)
-        ]
-        table_id = 'test_table'
-        dataset = self.temp_dataset(_make_dataset_id('nested_df'))
-        table_arg = Table(dataset.table(table_id), schema=schema)
-        table = retry_403(Config.CLIENT.create_table)(table_arg)
-        self.to_delete.insert(0, table)
-        Config.CLIENT.create_rows(table, to_insert)
-        QUERY = 'SELECT * from `{}.{}.{}`'.format(
-            Config.CLIENT.project, dataset.dataset_id, table_id)
-
-        retry = RetryResult(_has_rows, max_tries=8)
-        df = retry(self._fetch_dataframe)(QUERY)
-
-        self.assertIsInstance(df, pandas.DataFrame)
-        self.assertEqual(len(df), 1)  # verify the number of rows
-        exp_columns = ['string_col', 'record_col']
-        self.assertEqual(list(df), exp_columns)  # verify the column names
-        row = df.iloc[0]
-        # verify the row content
-        self.assertEqual(row['string_col'], 'Some value')
-        self.assertEqual(row['record_col'], record)
-        # verify that nested data can be accessed with indices/keys
-        self.assertEqual(row['record_col']['nested_repeated'][0], 0)
-        self.assertEqual(
-            row['record_col']['nested_record']['nested_nested_string'],
-            'some deep insight')
-
     def temp_dataset(self, dataset_id):
         dataset = retry_403(Config.CLIENT.create_dataset)(
             Dataset(Config.CLIENT.dataset(dataset_id)))

bigquery/tests/unit/test_job.py (−39)

@@ -16,10 +16,6 @@
 
 from six.moves import http_client
 import unittest
-try:
-    import pandas
-except ImportError:  # pragma: NO COVER
-    pandas = None
 
 from google.cloud.bigquery.job import ExtractJobConfig, CopyJobConfig
 from google.cloud.bigquery.job import LoadJobConfig
@@ -2724,41 +2720,6 @@ def test_reload_w_alternate_client(self):
         self.assertEqual(req['path'], PATH)
         self._verifyResourceProperties(job, RESOURCE)
 
-    @unittest.skipIf(pandas is None, 'Requires `pandas`')
-    def test_to_dataframe(self):
-        begun_resource = self._make_resource()
-        query_resource = {
-            'jobComplete': True,
-            'jobReference': {
-                'projectId': self.PROJECT,
-                'jobId': self.JOB_ID,
-            },
-            'schema': {
-                'fields': [
-                    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
-                    {'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
-                ],
-            },
-            'rows': [
-                {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
-                {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
-                {'f': [{'v': 'Wylma Phlyntstone'}, {'v': '29'}]},
-                {'f': [{'v': 'Bhettye Rhubble'}, {'v': '27'}]},
-            ],
-        }
-        done_resource = copy.deepcopy(begun_resource)
-        done_resource['status'] = {'state': 'DONE'}
-        connection = _Connection(
-            begun_resource, query_resource, done_resource, query_resource)
-        client = _make_client(project=self.PROJECT, connection=connection)
-        job = self._make_one(self.JOB_ID, self.QUERY, client)
-
-        df = job.to_dataframe()
-
-        self.assertIsInstance(df, pandas.DataFrame)
-        self.assertEqual(len(df), 4)  # verify the number of rows
-        self.assertEqual(list(df), ['name', 'age'])  # verify the column names
-
     def test_iter(self):
         import types
