feat: enable read_csv() to process other files #940

Merged · 7 commits · Sep 5, 2024
6 changes: 4 additions & 2 deletions bigframes/session/__init__.py
@@ -1008,10 +1008,12 @@ def _check_file_size(self, filepath: str):
             blob = bucket.blob(blob_name)
             blob.reload()
             file_size = blob.size
-        else:  # local file path
+        elif os.path.exists(filepath):  # local file path
             file_size = os.path.getsize(filepath)
+        else:
+            file_size = None
 
-        if file_size > max_size:
+        if file_size is not None and file_size > max_size:
             # Convert to GB
             file_size = round(file_size / (1024**3), 1)
             max_size = int(max_size / 1024**3)
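After this change, a path that is neither a GCS URI nor an existing local file (for example an HTTP URL) leaves `file_size` as `None`, and the size guard is skipped; previously the `else` branch assumed a local path, so `os.path.getsize` would fail on anything else. A minimal standalone sketch of the resulting control flow; the function name, the 1 GB limit, and the `ValueError` are illustrative stand-ins for the real method, limit, and error handling:

```python
import os


def check_file_size(filepath: str, max_size: int = 1024**3) -> None:
    """Sketch: enforce a size limit only when the size is knowable."""
    if filepath.startswith("gs://"):
        file_size = 0  # the real method looks this up via the GCS client
    elif os.path.exists(filepath):  # local file path
        file_size = os.path.getsize(filepath)
    else:
        file_size = None  # e.g. an HTTP(S) URL: size unknown, guard skipped

    if file_size is not None and file_size > max_size:
        raise ValueError(  # stand-in for the library's actual error
            f"File size {file_size / 1024**3:.1f} GB exceeds "
            f"{max_size // 1024**3} GB limit."
        )
```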
8 changes: 7 additions & 1 deletion bigframes/session/loader.py
@@ -18,6 +18,7 @@
 import dataclasses
 import datetime
 import itertools
+import os
 import typing
 from typing import Dict, Hashable, IO, Iterable, List, Optional, Sequence, Tuple, Union

@@ -421,11 +422,16 @@ def _read_bigquery_load_job(
                 load_job = self._bqclient.load_table_from_uri(
                     filepath_or_buffer, table, job_config=job_config
                 )
-            else:
+            elif os.path.exists(filepath_or_buffer):  # local file path
                 with open(filepath_or_buffer, "rb") as source_file:
                     load_job = self._bqclient.load_table_from_file(
                         source_file, table, job_config=job_config
                     )
+            else:
+                raise NotImplementedError(
+                    f"BigQuery engine only supports a local file path or GCS path. "
+                    f"{constants.FEEDBACK_LINK}"
+                )
         else:
             load_job = self._bqclient.load_table_from_file(
                 filepath_or_buffer, table, job_config=job_config
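With this guard in place, the BigQuery load path accepts only local files and `gs://` URIs and fails fast on anything else, while the default engine can still fetch other sources such as HTTP URLs. A hedged usage sketch (the URL is the one used by the new test below; the `bigframes.pandas` entry point is assumed):

```python
import bigframes.pandas as bpd

uri = (
    "https://raw.githubusercontent.com/googleapis/"
    "python-bigquery-dataframes/main/tests/data/people.csv"
)

# Default engine: the file is fetched and loaded into a temporary BQ table.
df = bpd.read_csv(uri)

# BigQuery engine: only local paths and gs:// URIs are supported, so this
# is expected to raise NotImplementedError (per the new test's xfail mark).
try:
    bpd.read_csv(uri, engine="bigquery")
except NotImplementedError as exc:
    print(exc)
```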
19 changes: 19 additions & 0 deletions tests/system/small/test_session.py
@@ -1036,6 +1036,25 @@ def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine):
     assert len(df.columns) == 1
 
 
+@pytest.mark.parametrize(
+    "engine",
+    [
+        pytest.param(
+            "bigquery",
+            id="bq_engine",
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,

Review comment (Contributor): So it's only available in the default engine? We should update the docs to be explicit. We should also update the docs to say that we support global paths (they currently say local only).

Reply (Contributor Author): Good catch! Done.

+            ),
+        ),
+        pytest.param(None, id="default_engine"),
+    ],
+)
+def test_read_csv_others(session, engine):
+    uri = "https://raw.githubusercontent.com/googleapis/python-bigquery-dataframes/main/tests/data/people.csv"
+    df = session.read_csv(uri, engine=engine)
+    assert len(df.columns) == 3
+
+
 @pytest.mark.parametrize(
     "engine",
     [
6 changes: 3 additions & 3 deletions third_party/bigframes_vendored/pandas/io/parsers/readers.py
@@ -51,16 +51,16 @@ def read_csv(
     encoding: Optional[str] = None,
     **kwargs,
 ):
-    """Loads DataFrame from comma-separated values (csv) file locally or from
-    Cloud Storage.
+    """Loads data from a comma-separated values (csv) file into a DataFrame.
 
     The CSV file data will be persisted as a temporary BigQuery table, which can be
     automatically recycled after the Session is closed.
 
     .. note::
         using `engine="bigquery"` will not guarantee the same ordering as the
         file. Instead, set a serialized index column as the index and sort by
-        that in the resulting DataFrame.
+        that in the resulting DataFrame. Only files stored on your local machine
+        or in Google Cloud Storage are supported.
 
     .. note::
         For non-bigquery engine, data is inlined in the query SQL if it is
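The ordering note above suggests a pattern like the following when `engine="bigquery"` is used; the bucket path and the `row_id` column are hypothetical:

```python
import bigframes.pandas as bpd

# Row order is not guaranteed with engine="bigquery", so use a serialized
# index column from the file as the index and sort on it afterwards.
df = bpd.read_csv(
    "gs://my-bucket/data.csv",  # hypothetical GCS path
    engine="bigquery",
    index_col="row_id",  # hypothetical monotonically increasing column
)
df = df.sort_index()
```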