Skip to content

import pl.from_arrow fails for dict type + struct  #17443

Closed
@artemru

Description

@artemru

Checks

  • I have checked that this issue has not already been reported.
  • I have confirmed this bug exists on the latest version of Polars.

Reproducible example

import pyarrow as pa
import polars as pl # polars-1.0.0
tt = pa.Table.from_pydict({"col" : ["a", "a", "b", "b"],
                           "col1": [{"x" : 1, "y" : 2}] * 4})
tt = tt.cast(pa.schema([pa.field("col", pa.dictionary(pa.int32(), pa.string())),
                        pa.field("col1", pa.struct([("x", pa.int32()), ("y", pa.int32())]))]))
pl.from_arrow(tt) # OK

tt_bad = pa.concat_tables([tt.slice(0, 2), tt.slice(2, 2)])  # same table but chunked into two pieces
pl.from_arrow(tt_bad.select(["col"])) # works
pl.from_arrow(tt_bad.select(["col1"])) # works
pl.from_arrow(tt_bad) # This fails

Log output

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File .../python3.8/site-packages/polars/_utils/construction/series.py:296, in _construct_series_with_fallbacks(constructor, name, values, dtype, strict)
    295 try:
--> 296     return constructor(name, values, strict)
    297 except TypeError:

TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
File .../python3.8/site-packages/polars/_utils/getitem.py:156, in get_df_item_by_key(df, key)
    155 try:
--> 156     return _select_rows(df, key)  # type: ignore[arg-type]
    157 except TypeError:

File .../python3.8/site-packages/polars/_utils/getitem.py:295, in _select_rows(df, key)
    294     _raise_on_boolean_mask()
--> 295 s = pl.Series("", key, dtype=Int64)
    296 indices = _convert_series_to_indices(s, df.height)

File .../python3.8/site-packages/polars/series/series.py:287, in Series.__init__(self, name, values, dtype, strict, nan_to_null)
    286 if isinstance(values, Sequence):
--> 287     self._s = sequence_to_pyseries(
    288         name,
    289         values,
    290         dtype=dtype,
    291         strict=strict,
    292         nan_to_null=nan_to_null,
    293     )
    295 elif values is None:

File .../python3.8/site-packages/polars/_utils/construction/series.py:134, in sequence_to_pyseries(name, values, dtype, strict, nan_to_null)
    133 constructor = polars_type_to_constructor(dtype)
--> 134 pyseries = _construct_series_with_fallbacks(
    135     constructor, name, values, dtype, strict=strict
    136 )
    137 if dtype in (
    138     Date,
    139     Datetime,
   (...)
    145     Decimal,
    146 ):

File .../python3.8/site-packages/polars/_utils/construction/series.py:301, in _construct_series_with_fallbacks(constructor, name, values, dtype, strict)
    300 else:
--> 301     return PySeries.new_from_any_values_and_dtype(
    302         name, values, dtype, strict=strict
    303     )

TypeError: unexpected value while building Series of type Int64; found value of type String: "col"

Hint: Try setting `strict=False` to allow passing data with mixed types.

During handling of the above exception, another exception occurred:

ColumnNotFoundError                       Traceback (most recent call last)
Cell In[54], line 12
     10 pl.from_arrow(tt_bad.select(["col"])) # works
     11 pl.from_arrow(tt_bad.select(["col1"])) # works
---> 12 pl.from_arrow(tt_bad) # This fails

File .../python3.8/site-packages/polars/convert/general.py:433, in from_arrow(data, schema, schema_overrides, rechunk)
    370 """
    371 Create a DataFrame or Series from an Arrow Table or Array.
    372 
   (...)
    429 ]
    430 """  # noqa: W505
    431 if isinstance(data, (pa.Table, pa.RecordBatch)):
    432     return wrap_df(
--> 433         arrow_to_pydf(
    434             data=data,
    435             rechunk=rechunk,
    436             schema=schema,
    437             schema_overrides=schema_overrides,
    438         )
    439     )
    440 elif isinstance(data, (pa.Array, pa.ChunkedArray)):
    441     name = getattr(data, "_name", "") or ""

File .../python3.8/site-packages/polars/_utils/construction/dataframe.py:1182, in arrow_to_pydf(data, schema, schema_overrides, strict, rechunk)
   1179     reset_order = True
   1181 if reset_order:
-> 1182     df = df[names]
   1183     pydf = df._df
   1185 if column_names != original_schema and (schema_overrides or original_schema):

File .../python3.8/site-packages/polars/dataframe/frame.py:1183, in DataFrame.__getitem__(self, key)
   1169 def __getitem__(
   1170     self,
   1171     key: (
   (...)
   1180     ),
   1181 ) -> DataFrame | Series | Any:
   1182     """Get part of the DataFrame as a new DataFrame, Series, or scalar."""
-> 1183     return get_df_item_by_key(self, key)

File .../python3.8/site-packages/polars/_utils/getitem.py:158, in get_df_item_by_key(df, key)
    156     return _select_rows(df, key)  # type: ignore[arg-type]
    157 except TypeError:
--> 158     return _select_columns(df, key)

File .../python3.8/site-packages/polars/_utils/getitem.py:206, in _select_columns(df, key)
    204     return _select_columns_by_index(df, key)  # type: ignore[arg-type]
    205 elif isinstance(first, str):
--> 206     return _select_columns_by_name(df, key)  # type: ignore[arg-type]
    207 else:
    208     msg = f"cannot select columns using Sequence with elements of type {type(first).__name__!r}"

File .../python3.8/site-packages/polars/_utils/getitem.py:254, in _select_columns_by_name(df, key)
    253 def _select_columns_by_name(df: DataFrame, key: Iterable[str]) -> DataFrame:
--> 254     return df._from_pydf(df._df.select(key))

ColumnNotFoundError: col

Issue description

Polars cannot import a pyarrow.Table that contains both a dictionary column and a struct column when the table is split into several chunks.

Expected behavior

The conversion should work and produce the same result as pl.from_arrow(tt_bad.combine_chunks()) (the current workaround).

Installed versions

--------Version info---------
Polars:               1.0.0
Index type:           UInt32
Platform:             Linux-5.15.0-1063-aws-x86_64-with-glibc2.17
Python:               3.8.18 (default, Sep 11 2023, 13:40:15) 
[GCC 11.2.0]

----Optional dependencies----
adbc_driver_manager:  <not installed>
cloudpickle:          3.0.0
connectorx:           <not installed>
deltalake:            <not installed>
fastexcel:            <not installed>
fsspec:               2023.10.0
gevent:               <not installed>
great_tables:         <not installed>
hvplot:               <not installed>
matplotlib:           3.4.3
nest_asyncio:         1.6.0
numpy:                1.24.4
openpyxl:             <not installed>
pandas:               2.0.3
pyarrow:              16.1.0
pydantic:             1.10.13
pyiceberg:            <not installed>
sqlalchemy:           2.0.23
torch:                2.2.1+cu121
xlsx2csv:             <not installed>
xlsxwriter:           <not installed>

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-interop-arrow — Area: interoperability with other Arrow implementations (such as pyarrow); bug — Something isn't working; needs triage — Awaiting prioritization by a maintainer; python — Related to Python Polars

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions