Patch: access struct fields of a bigframes Series as attributes.

What the hunks below do:
  * bigframes/series.py
      - import `List` and `pyarrow as pa`;
      - add the `_struct_fields` property, which returns the field names of a
        struct-typed Series and an empty list for any other dtype;
      - add `__dir__`, which appends those field names to `dir(type(self))`;
      - extend `__getattr__`: raise AttributeError immediately for "_block"
        (recursion guard for uninitialized objects), keep the existing
        "not yet implemented" error for names that exist on pandas.Series,
        and otherwise resolve struct field names via `self.struct.field(key)`.
  * tests/data/nested_structs.jsonl, tests/data/nested_structs_schema.json
      - new two-row test table with a nested `person` record.
  * tests/system/conftest.py
      - load the new table and add the `nested_structs_*` fixtures.
  * tests/system/small/test_series.py
      - add tests for attribute access, `dir()` contents, and class attributes
        taking precedence over struct fields of the same name.

NOTE(review): this file had been flattened onto four physical lines. Line
breaks, indentation and blank lines were reconstructed from the hunk headers
and the payload syntax; the `index` blob hashes are carried over unchanged and
have not been re-verified against the reconstructed whitespace.

diff --git a/bigframes/series.py b/bigframes/series.py
index 193eea7ee3..e0413b1b61 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -22,7 +22,7 @@
 import numbers
 import textwrap
 import typing
-from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union
+from typing import Any, cast, List, Literal, Mapping, Optional, Sequence, Tuple, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -30,6 +30,7 @@
 import numpy
 import pandas
 import pandas.core.dtypes.common
+import pyarrow as pa
 import typing_extensions
 
 import bigframes.core
@@ -181,6 +182,14 @@ def _info_axis(self) -> indexes.Index:
     def _session(self) -> bigframes.Session:
         return self._get_block().expr.session
 
+    @property
+    def _struct_fields(self) -> List[str]:
+        if not bigframes.dtypes.is_struct_like(self._dtype):
+            return []
+
+        struct_type = typing.cast(pa.StructType, self._dtype.pyarrow_dtype)
+        return [struct_type.field(i).name for i in range(struct_type.num_fields)]
+
     @validations.requires_ordering()
     def transpose(self) -> Series:
         return self
@@ -1096,6 +1105,9 @@ def __pos__(self) -> Series:
     def __neg__(self) -> Series:
         return self._apply_unary_op(ops.neg_op)
 
+    def __dir__(self) -> List[str]:
+        return dir(type(self)) + self._struct_fields
+
     def eq(self, other: object) -> Series:
         # TODO: enforce stricter alignment
         return self._apply_binary_op(other, ops.eq_op)
@@ -1240,7 +1252,15 @@ def __getitem__(self, indexer):
     __getitem__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__getitem__)
 
     def __getattr__(self, key: str):
-        if hasattr(pandas.Series, key):
+        # Protect against recursion errors with uninitialized Series objects.
+        # We use "_block" attribute to check whether the instance is initialized.
+        # See:
+        # https://github.com/googleapis/python-bigquery-dataframes/issues/728
+        # and
+        # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html
+        if key == "_block":
+            raise AttributeError(key)
+        elif hasattr(pandas.Series, key):
             raise AttributeError(
                 textwrap.dedent(
                     f"""
@@ -1249,6 +1269,8 @@ def __getattr__(self, key: str):
                     """
                 )
             )
+        elif key in self._struct_fields:
+            return self.struct.field(key)
         else:
             raise AttributeError(key)
 
diff --git a/tests/data/nested_structs.jsonl b/tests/data/nested_structs.jsonl
new file mode 100644
index 0000000000..f57214b0b3
--- /dev/null
+++ b/tests/data/nested_structs.jsonl
@@ -0,0 +1,2 @@
+{"id": 1, "person": {"name": "Alice", "age":30, "address": {"city": "New York", "country": "USA"}}}
+{"id": 2, "person": {"name": "Bob", "age":25, "address": {"city": "London", "country": "UK"}}}
\ No newline at end of file
diff --git a/tests/data/nested_structs_schema.json b/tests/data/nested_structs_schema.json
new file mode 100644
index 0000000000..6692615cef
--- /dev/null
+++ b/tests/data/nested_structs_schema.json
@@ -0,0 +1,39 @@
+[
+    {
+        "name": "id",
+        "type": "INTEGER",
+        "mode": "REQUIRED"
+    },
+    {
+        "name": "person",
+        "type": "RECORD",
+        "fields": [
+            {
+                "name": "name",
+                "type": "STRING",
+                "mode": "NULLABLE"
+            },
+            {
+                "name": "age",
+                "type": "INTEGER",
+                "mode": "NULLABLE"
+            },
+            {
+                "name": "address",
+                "type": "RECORD",
+                "fields": [
+                    {
+                        "name": "city",
+                        "type": "STRING",
+                        "mode": "NULLABLE"
+                    },
+                    {
+                        "name": "country",
+                        "type": "STRING",
+                        "mode": "NULLABLE"
+                    }
+                ]
+            }
+        ]
+    }
+]
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index d9246eecfb..217cf71e0c 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -32,6 +32,7 @@
 import ibis.backends
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 import pytz
 import test_utils.prefixer
@@ -290,6 +291,7 @@ def load_test_data_tables(
         ("scalars", "scalars_schema.json", "scalars.jsonl"),
         ("scalars_too", "scalars_schema.json", "scalars.jsonl"),
         ("nested", "nested_schema.json", "nested.jsonl"),
+        ("nested_structs", "nested_structs_schema.json", "nested_structs.jsonl"),
         ("repeated", "repeated_schema.json", "repeated.jsonl"),
         ("penguins", "penguins_schema.json", "penguins.jsonl"),
         ("time_series", "time_series_schema.json", "time_series.jsonl"),
@@ -367,6 +369,11 @@ def nested_table_id(test_data_tables) -> str:
     return test_data_tables["nested"]
 
 
+@pytest.fixture(scope="session")
+def nested_structs_table_id(test_data_tables) -> str:
+    return test_data_tables["nested_structs"]
+
+
 @pytest.fixture(scope="session")
 def repeated_table_id(test_data_tables) -> str:
     return test_data_tables["repeated"]
@@ -412,6 +419,43 @@ def nested_pandas_df() -> pd.DataFrame:
     return df
 
 
+@pytest.fixture(scope="session")
+def nested_structs_df(
+    nested_structs_table_id: str, session: bigframes.Session
+) -> bigframes.dataframe.DataFrame:
+    """DataFrame pointing at test data."""
+    return session.read_gbq(nested_structs_table_id, index_col="id")
+
+
+@pytest.fixture(scope="session")
+def nested_structs_pandas_df() -> pd.DataFrame:
+    """pd.DataFrame pointing at test data."""
+
+    df = pd.read_json(
+        DATA_DIR / "nested_structs.jsonl",
+        lines=True,
+    )
+    df = df.set_index("id")
+    return df
+
+
+@pytest.fixture(scope="session")
+def nested_structs_pandas_type() -> pd.ArrowDtype:
+    address_struct_schema = pa.struct(
+        [pa.field("city", pa.string()), pa.field("country", pa.string())]
+    )
+
+    person_struct_schema = pa.struct(
+        [
+            pa.field("name", pa.string()),
+            pa.field("age", pa.int64()),
+            pa.field("address", address_struct_schema),
+        ]
+    )
+
+    return pd.ArrowDtype(person_struct_schema)
+
+
 @pytest.fixture(scope="session")
 def repeated_df(
     repeated_table_id: str, session: bigframes.Session
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 793a4062c5..aa70b7c655 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -19,6 +19,7 @@
 
 import geopandas as gpd  # type: ignore
 import numpy
+from packaging.version import Version
 import pandas as pd
 import pyarrow as pa  # type: ignore
 import pytest
@@ -3912,3 +3913,41 @@ def test_series_explode_null(data):
         s.to_pandas().explode(),
         check_dtype=False,
     )
+
+
+def test_series_struct_get_field_by_attribute(
+    nested_structs_df, nested_structs_pandas_df, nested_structs_pandas_type
+):
+    if Version(pd.__version__) < Version("2.2.0"):
+        pytest.skip("struct accessor is not supported before pandas 2.2")
+
+    bf_series = nested_structs_df["person"]
+    df_series = nested_structs_pandas_df["person"].astype(nested_structs_pandas_type)
+
+    pd.testing.assert_series_equal(
+        bf_series.address.city.to_pandas(),
+        df_series.struct.field("address").struct.field("city"),
+        check_dtype=False,
+        check_index=False,
+    )
+    pd.testing.assert_series_equal(
+        bf_series.address.country.to_pandas(),
+        df_series.struct.field("address").struct.field("country"),
+        check_dtype=False,
+        check_index=False,
+    )
+
+
+def test_series_struct_fields_in_dir(nested_structs_df):
+    series = nested_structs_df["person"]
+
+    assert "age" in dir(series)
+    assert "address" in dir(series)
+    assert "city" in dir(series.address)
+    assert "country" in dir(series.address)
+
+
+def test_series_struct_class_attributes_shadow_struct_fields(nested_structs_df):
+    series = nested_structs_df["person"]
+
+    assert series.name == "person"