
Commit 19635ba

Revert "BigQuery: Add ability to get query results as a Pandas dataframe. (#4354)"
This reverts commit 3511e87.
1 parent b5b473d · commit 19635ba

9 files changed (+16 −437 lines)


bigquery/google/cloud/bigquery/_helpers.py (+1 −1)

@@ -468,7 +468,7 @@ def _rows_page_start(iterator, page, response):
     total_rows = response.get('totalRows')
     if total_rows is not None:
         total_rows = int(total_rows)
-    iterator._total_rows = total_rows
+    iterator.total_rows = total_rows
 # pylint: enable=unused-argument
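The underscore is the point of this hunk: the reverted change had paired a private `_total_rows` with a read-only `total_rows` property on the `RowIterator` class (removed from `table.py` below), so the revert goes back to assigning the public attribute on a plain `HTTPIterator`. A rough usage sketch, with `client` and `table` as hypothetical stand-ins for a configured client and an existing table; the attribute only appears once `_rows_page_start` has run for the first fetched page:

    rows = client.list_rows(table)    # a plain page_iterator.HTTPIterator again
    # rows.total_rows is not set yet; _rows_page_start assigns it the
    # first time a page of results is fetched.
    first_page = next(rows.pages)
    print(rows.total_rows)            # int, or None if 'totalRows' was absent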

bigquery/google/cloud/bigquery/client.py (+12 −5)

@@ -33,6 +33,9 @@
 
 from google.cloud.bigquery._helpers import DEFAULT_RETRY
 from google.cloud.bigquery._helpers import _SCALAR_VALUE_TO_JSON_ROW
+from google.cloud.bigquery._helpers import _field_to_index_mapping
+from google.cloud.bigquery._helpers import _item_to_row
+from google.cloud.bigquery._helpers import _rows_page_start
 from google.cloud.bigquery._helpers import _snake_to_camel_case
 from google.cloud.bigquery._http import Connection
 from google.cloud.bigquery.dataset import Dataset
@@ -45,7 +48,6 @@
 from google.cloud.bigquery.table import Table
 from google.cloud.bigquery.table import TableListItem
 from google.cloud.bigquery.table import TableReference
-from google.cloud.bigquery.table import RowIterator
 from google.cloud.bigquery.table import _TABLE_HAS_NO_SCHEMA
 from google.cloud.bigquery.table import _row_from_mapping
 
@@ -1187,7 +1189,7 @@ def list_rows(self, table, selected_fields=None, max_results=None,
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the RPC.
 
-        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
+        :rtype: :class:`~google.api_core.page_iterator.Iterator`
         :returns: Iterator of row data
             :class:`~google.cloud.bigquery.table.Row`-s. During each
             page, the iterator will have the ``total_rows`` attribute
@@ -1215,15 +1217,20 @@ def list_rows(self, table, selected_fields=None, max_results=None,
         if start_index is not None:
             params['startIndex'] = start_index
 
-        row_iterator = RowIterator(
+        iterator = page_iterator.HTTPIterator(
             client=self,
             api_request=functools.partial(self._call_api, retry),
             path='%s/data' % (table.path,),
-            schema=schema,
+            item_to_value=_item_to_row,
+            items_key='rows',
             page_token=page_token,
+            next_token='pageToken',
             max_results=max_results,
+            page_start=_rows_page_start,
             extra_params=params)
-        return row_iterator
+        iterator.schema = schema
+        iterator._field_to_index = _field_to_index_mapping(schema)
+        return iterator
 
     def list_partitions(self, table, retry=DEFAULT_RETRY):
         """List the partitions in a table.

bigquery/google/cloud/bigquery/job.py (+1 −14)

@@ -1929,7 +1929,7 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         :type retry: :class:`google.api_core.retry.Retry`
         :param retry: (Optional) How to retry the call that retrieves rows.
 
-        :rtype: :class:`~google.cloud.bigquery.table.RowIterator`
+        :rtype: :class:`~google.api_core.page_iterator.Iterator`
         :returns:
             Iterator of row data :class:`~google.cloud.bigquery.table.Row`-s.
             During each page, the iterator will have the ``total_rows``
@@ -1949,19 +1949,6 @@ def result(self, timeout=None, retry=DEFAULT_RETRY):
         return self._client.list_rows(dest_table, selected_fields=schema,
                                       retry=retry)
 
-    def to_dataframe(self):
-        """Return a pandas DataFrame from a QueryJob
-
-        Returns:
-            A :class:`~pandas.DataFrame` populated with row data and column
-            headers from the query results. The column headers are derived
-            from the destination table's schema.
-
-        Raises:
-            ValueError: If the `pandas` library cannot be imported.
-        """
-        return self.result().to_dataframe()
-
     def __iter__(self):
         return iter(self.result())
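With `QueryJob.to_dataframe()` gone, a DataFrame has to be assembled by hand from `result()`. A sketch of a rough equivalent of the deleted convenience method, assuming pandas is installed separately (the `[pandas]` extra is dropped below) and with `client` and `QUERY` as hypothetical names:

    import pandas

    iterator = client.query(QUERY).result()
    headers = [field.name for field in iterator.schema]  # attached by list_rows()
    rows = [row.values() for row in iterator]
    df = pandas.DataFrame(rows, columns=headers)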

bigquery/google/cloud/bigquery/table.py (−78)

@@ -21,19 +21,10 @@
 import operator
 
 import six
-try:
-    import pandas
-except ImportError:  # pragma: NO COVER
-    pandas = None
-
-from google.api_core.page_iterator import HTTPIterator
 
 from google.cloud._helpers import _datetime_from_microseconds
 from google.cloud._helpers import _millis_from_datetime
-from google.cloud.bigquery._helpers import _item_to_row
-from google.cloud.bigquery._helpers import _rows_page_start
 from google.cloud.bigquery._helpers import _snake_to_camel_case
-from google.cloud.bigquery._helpers import _field_to_index_mapping
 from google.cloud.bigquery.schema import SchemaField
 from google.cloud.bigquery.schema import _build_schema_resource
 from google.cloud.bigquery.schema import _parse_schema_resource
@@ -1032,72 +1023,3 @@ def __repr__(self):
                        key=operator.itemgetter(1))
         f2i = '{' + ', '.join('%r: %d' % item for item in items) + '}'
         return 'Row({}, {})'.format(self._xxx_values, f2i)
-
-
-class RowIterator(HTTPIterator):
-    """A class for iterating through HTTP/JSON API row list responses.
-
-    Args:
-        client (google.cloud.bigquery.Client): The API client.
-        api_request (Callable[google.cloud._http.JSONConnection.api_request]):
-            The function to use to make API requests.
-        path (str): The method path to query for the list of items.
-        page_token (str): A token identifying a page in a result set to start
-            fetching results from.
-        max_results (int): The maximum number of results to fetch.
-        extra_params (dict): Extra query string parameters for the API call.
-
-    .. autoattribute:: pages
-    """
-
-    def __init__(self, client, api_request, path, schema, page_token=None,
-                 max_results=None, extra_params=None):
-        super(RowIterator, self).__init__(
-            client, api_request, path, item_to_value=_item_to_row,
-            items_key='rows', page_token=page_token, max_results=max_results,
-            extra_params=extra_params, page_start=_rows_page_start,
-            next_token='pageToken')
-        self._schema = schema
-        self._field_to_index = _field_to_index_mapping(schema)
-        self._total_rows = None
-
-    @property
-    def schema(self):
-        """Schema for the table containing the rows
-
-        Returns:
-            list of :class:`~google.cloud.bigquery.schema.SchemaField`:
-                fields describing the schema
-        """
-        return list(self._schema)
-
-    @property
-    def total_rows(self):
-        """The total number of rows in the table.
-
-        Returns:
-            int: the row count.
-        """
-        return self._total_rows
-
-    def to_dataframe(self):
-        """Create a pandas DataFrame from the query results.
-
-        Returns:
-            A :class:`~pandas.DataFrame` populated with row data and column
-            headers from the query results. The column headers are derived
-            from the destination table's schema.
-
-        Raises:
-            ValueError: If the `pandas` library cannot be imported.
-
-        """
-        if pandas is None:
-            raise ValueError('The pandas library is not installed, please '
-                             'install pandas to use the to_dataframe() '
-                             'function.')
-
-        column_headers = [field.name for field in self.schema]
-        rows = [row.values() for row in iter(self)]
-
-        return pandas.DataFrame(rows, columns=column_headers)

bigquery/nox.py (+2 −2)

@@ -36,7 +36,7 @@ def default(session):
     """
     # Install all test dependencies, then install this package in-place.
    session.install('mock', 'pytest', 'pytest-cov', *LOCAL_DEPS)
-    session.install('-e', '.[pandas]')
+    session.install('-e', '.')
 
     # Run py.test against the unit tests.
     session.run(
@@ -89,7 +89,7 @@ def system(session, py):
         os.path.join('..', 'storage'),
         os.path.join('..', 'test_utils'),
     )
-    session.install('-e', '.[pandas]')
+    session.install('-e', '.')
 
     # Run py.test against the system tests.
     session.run(

bigquery/setup.py (−5)

@@ -58,10 +58,6 @@
     'requests >= 2.18.0',
 ]
 
-EXTRAS_REQUIREMENTS = {
-    'pandas': ['pandas >= 0.17.1'],
-}
-
 setup(
     name='google-cloud-bigquery',
     version='0.28.1.dev1',
@@ -73,6 +69,5 @@
     ],
     packages=find_packages(exclude=('tests*',)),
     install_requires=REQUIREMENTS,
-    extras_require=EXTRAS_REQUIREMENTS,
     **SETUP_BASE
 )
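Dropping `EXTRAS_REQUIREMENTS` retires the `pip install google-cloud-bigquery[pandas]` install target, which is why the `nox.py` sessions above fall back to a plain `session.install('-e', '.')`.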

bigquery/tests/system.py (−76)

@@ -24,10 +24,6 @@
 import uuid
 
 import six
-try:
-    import pandas
-except ImportError:  # pragma: NO COVER
-    pandas = None
 
 from google.api_core.exceptions import PreconditionFailed
 from google.cloud import bigquery
@@ -1248,28 +1244,6 @@ def test_query_iter(self):
         row_tuples = [r.values() for r in query_job]
         self.assertEqual(row_tuples, [(1,)])
 
-    @unittest.skipIf(pandas is None, 'Requires `pandas`')
-    def test_query_results_to_dataframe(self):
-        QUERY = """
-            SELECT id, author, time_ts, dead
-            from `bigquery-public-data.hacker_news.comments`
-            LIMIT 10
-        """
-
-        df = Config.CLIENT.query(QUERY).result().to_dataframe()
-
-        self.assertIsInstance(df, pandas.DataFrame)
-        self.assertEqual(len(df), 10)  # verify the number of rows
-        column_names = ['id', 'author', 'time_ts', 'dead']
-        self.assertEqual(list(df), column_names)  # verify the column names
-        exp_datatypes = {'id': int, 'author': str,
-                         'time_ts': pandas.Timestamp, 'dead': bool}
-        for index, row in df.iterrows():
-            for col in column_names:
-                # all the schema fields are nullable, so None is acceptable
-                if not row[col] is None:
-                    self.assertIsInstance(row[col], exp_datatypes[col])
-
     def test_query_table_def(self):
         gs_url = self._write_csv_to_storage(
             'bq_external_test' + unique_resource_id(), 'person_ages.csv',
@@ -1445,56 +1419,6 @@ def test_create_table_rows_fetch_nested_schema(self):
         e_favtime = datetime.datetime(*parts[0:6])
         self.assertEqual(found[7], e_favtime)
 
-    def _fetch_dataframe(self, query):
-        return Config.CLIENT.query(query).result().to_dataframe()
-
-    @unittest.skipIf(pandas is None, 'Requires `pandas`')
-    def test_nested_table_to_dataframe(self):
-        SF = bigquery.SchemaField
-        schema = [
-            SF('string_col', 'STRING', mode='NULLABLE'),
-            SF('record_col', 'RECORD', mode='NULLABLE', fields=[
-                SF('nested_string', 'STRING', mode='NULLABLE'),
-                SF('nested_repeated', 'INTEGER', mode='REPEATED'),
-                SF('nested_record', 'RECORD', mode='NULLABLE', fields=[
-                    SF('nested_nested_string', 'STRING', mode='NULLABLE'),
-                ]),
-            ]),
-        ]
-        record = {
-            'nested_string': 'another string value',
-            'nested_repeated': [0, 1, 2],
-            'nested_record': {'nested_nested_string': 'some deep insight'},
-        }
-        to_insert = [
-            ('Some value', record)
-        ]
-        table_id = 'test_table'
-        dataset = self.temp_dataset(_make_dataset_id('nested_df'))
-        table_arg = Table(dataset.table(table_id), schema=schema)
-        table = retry_403(Config.CLIENT.create_table)(table_arg)
-        self.to_delete.insert(0, table)
-        Config.CLIENT.create_rows(table, to_insert)
-        QUERY = 'SELECT * from `{}.{}.{}`'.format(
-            Config.CLIENT.project, dataset.dataset_id, table_id)
-
-        retry = RetryResult(_has_rows, max_tries=8)
-        df = retry(self._fetch_dataframe)(QUERY)
-
-        self.assertIsInstance(df, pandas.DataFrame)
-        self.assertEqual(len(df), 1)  # verify the number of rows
-        exp_columns = ['string_col', 'record_col']
-        self.assertEqual(list(df), exp_columns)  # verify the column names
-        row = df.iloc[0]
-        # verify the row content
-        self.assertEqual(row['string_col'], 'Some value')
-        self.assertEqual(row['record_col'], record)
-        # verify that nested data can be accessed with indices/keys
-        self.assertEqual(row['record_col']['nested_repeated'][0], 0)
-        self.assertEqual(
-            row['record_col']['nested_record']['nested_nested_string'],
-            'some deep insight')
-
     def temp_dataset(self, dataset_id):
         dataset = retry_403(Config.CLIENT.create_dataset)(
             Dataset(Config.CLIENT.dataset(dataset_id)))

bigquery/tests/unit/test_job.py (−39)

@@ -16,10 +16,6 @@
 
 from six.moves import http_client
 import unittest
-try:
-    import pandas
-except ImportError:  # pragma: NO COVER
-    pandas = None
 
 from google.cloud.bigquery.job import ExtractJobConfig, CopyJobConfig
 from google.cloud.bigquery.job import LoadJobConfig
@@ -2724,41 +2720,6 @@ def test_reload_w_alternate_client(self):
         self.assertEqual(req['path'], PATH)
         self._verifyResourceProperties(job, RESOURCE)
 
-    @unittest.skipIf(pandas is None, 'Requires `pandas`')
-    def test_to_dataframe(self):
-        begun_resource = self._make_resource()
-        query_resource = {
-            'jobComplete': True,
-            'jobReference': {
-                'projectId': self.PROJECT,
-                'jobId': self.JOB_ID,
-            },
-            'schema': {
-                'fields': [
-                    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
-                    {'name': 'age', 'type': 'INTEGER', 'mode': 'NULLABLE'},
-                ],
-            },
-            'rows': [
-                {'f': [{'v': 'Phred Phlyntstone'}, {'v': '32'}]},
-                {'f': [{'v': 'Bharney Rhubble'}, {'v': '33'}]},
-                {'f': [{'v': 'Wylma Phlyntstone'}, {'v': '29'}]},
-                {'f': [{'v': 'Bhettye Rhubble'}, {'v': '27'}]},
-            ],
-        }
-        done_resource = copy.deepcopy(begun_resource)
-        done_resource['status'] = {'state': 'DONE'}
-        connection = _Connection(
-            begun_resource, query_resource, done_resource, query_resource)
-        client = _make_client(project=self.PROJECT, connection=connection)
-        job = self._make_one(self.JOB_ID, self.QUERY, client)
-
-        df = job.to_dataframe()
-
-        self.assertIsInstance(df, pandas.DataFrame)
-        self.assertEqual(len(df), 4)  # verify the number of rows
-        self.assertEqual(list(df), ['name', 'age'])  # verify the column names
-
     def test_iter(self):
         import types
