Closed
Description
Checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of Polars.
Reproducible example
import pyarrow as pa
import polars as pl # polars-1.0.0
tt = pa.Table.from_pydict({"col" : ["a", "a", "b", "b"],
"col1": [{"x" : 1, "y" : 2}] * 4})
tt = tt.cast(pa.schema([pa.field("col", pa.dictionary(pa.int32(), pa.string())),
pa.field("col1", pa.struct([("x", pa.int32()), ("y", pa.int32())]))]))
pl.from_arrow(tt) # OK
tt_bad = pa.concat_tables([tt.slice(0, 2), tt.slice(2, 2)]) # same table but chunked into two pieces
pl.from_arrow(tt_bad.select(["col"])) # works
pl.from_arrow(tt_bad.select(["col1"])) # works
pl.from_arrow(tt_bad) # This fails
Log output
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File .../python3.8/site-packages/polars/_utils/construction/series.py:296, in _construct_series_with_fallbacks(constructor, name, values, dtype, strict)
295 try:
--> 296 return constructor(name, values, strict)
297 except TypeError:
TypeError: 'str' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
File .../python3.8/site-packages/polars/_utils/getitem.py:156, in get_df_item_by_key(df, key)
155 try:
--> 156 return _select_rows(df, key) # type: ignore[arg-type]
157 except TypeError:
File .../python3.8/site-packages/polars/_utils/getitem.py:295, in _select_rows(df, key)
294 _raise_on_boolean_mask()
--> 295 s = pl.Series("", key, dtype=Int64)
296 indices = _convert_series_to_indices(s, df.height)
File .../python3.8/site-packages/polars/series/series.py:287, in Series.__init__(self, name, values, dtype, strict, nan_to_null)
286 if isinstance(values, Sequence):
--> 287 self._s = sequence_to_pyseries(
288 name,
289 values,
290 dtype=dtype,
291 strict=strict,
292 nan_to_null=nan_to_null,
293 )
295 elif values is None:
File .../python3.8/site-packages/polars/_utils/construction/series.py:134, in sequence_to_pyseries(name, values, dtype, strict, nan_to_null)
133 constructor = polars_type_to_constructor(dtype)
--> 134 pyseries = _construct_series_with_fallbacks(
135 constructor, name, values, dtype, strict=strict
136 )
137 if dtype in (
138 Date,
139 Datetime,
(...)
145 Decimal,
146 ):
File .../python3.8/site-packages/polars/_utils/construction/series.py:301, in _construct_series_with_fallbacks(constructor, name, values, dtype, strict)
300 else:
--> 301 return PySeries.new_from_any_values_and_dtype(
302 name, values, dtype, strict=strict
303 )
TypeError: unexpected value while building Series of type Int64; found value of type String: "col"
Hint: Try setting `strict=False` to allow passing data with mixed types.
During handling of the above exception, another exception occurred:
ColumnNotFoundError Traceback (most recent call last)
Cell In[54], line 12
10 pl.from_arrow(tt_bad.select(["col"])) # works
11 pl.from_arrow(tt_bad.select(["col1"])) # works
---> 12 pl.from_arrow(tt_bad) # This fails
File .../python3.8/site-packages/polars/convert/general.py:433, in from_arrow(data, schema, schema_overrides, rechunk)
370 """
371 Create a DataFrame or Series from an Arrow Table or Array.
372
(...)
429 ]
430 """ # noqa: W505
431 if isinstance(data, (pa.Table, pa.RecordBatch)):
432 return wrap_df(
--> 433 arrow_to_pydf(
434 data=data,
435 rechunk=rechunk,
436 schema=schema,
437 schema_overrides=schema_overrides,
438 )
439 )
440 elif isinstance(data, (pa.Array, pa.ChunkedArray)):
441 name = getattr(data, "_name", "") or ""
File .../python3.8/site-packages/polars/_utils/construction/dataframe.py:1182, in arrow_to_pydf(data, schema, schema_overrides, strict, rechunk)
1179 reset_order = True
1181 if reset_order:
-> 1182 df = df[names]
1183 pydf = df._df
1185 if column_names != original_schema and (schema_overrides or original_schema):
File .../python3.8/site-packages/polars/dataframe/frame.py:1183, in DataFrame.__getitem__(self, key)
1169 def __getitem__(
1170 self,
1171 key: (
(...)
1180 ),
1181 ) -> DataFrame | Series | Any:
1182 """Get part of the DataFrame as a new DataFrame, Series, or scalar."""
-> 1183 return get_df_item_by_key(self, key)
File .../python3.8/site-packages/polars/_utils/getitem.py:158, in get_df_item_by_key(df, key)
156 return _select_rows(df, key) # type: ignore[arg-type]
157 except TypeError:
--> 158 return _select_columns(df, key)
File .../python3.8/site-packages/polars/_utils/getitem.py:206, in _select_columns(df, key)
204 return _select_columns_by_index(df, key) # type: ignore[arg-type]
205 elif isinstance(first, str):
--> 206 return _select_columns_by_name(df, key) # type: ignore[arg-type]
207 else:
208 msg = f"cannot select columns using Sequence with elements of type {type(first).__name__!r}"
File .../python3.8/site-packages/polars/_utils/getitem.py:254, in _select_columns_by_name(df, key)
253 def _select_columns_by_name(df: DataFrame, key: Iterable[str]) -> DataFrame:
--> 254 return df._from_pydf(df._df.select(key))
ColumnNotFoundError: col
Issue description
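Polars cannot convert a pyarrow.Table that contains both a dictionary column and a struct column when the table is split into several chunks. Each column converts fine on its own, and the unchunked table converts fine as well.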
Expected behavior
The conversion should succeed and produce the same result as pl.from_arrow(tt_bad.combine_chunks())
(which is the current workaround).
Installed versions
--------Version info---------
Polars: 1.0.0
Index type: UInt32
Platform: Linux-5.15.0-1063-aws-x86_64-with-glibc2.17
Python: 3.8.18 (default, Sep 11 2023, 13:40:15)
[GCC 11.2.0]
----Optional dependencies----
adbc_driver_manager: <not installed>
cloudpickle: 3.0.0
connectorx: <not installed>
deltalake: <not installed>
fastexcel: <not installed>
fsspec: 2023.10.0
gevent: <not installed>
great_tables: <not installed>
hvplot: <not installed>
matplotlib: 3.4.3
nest_asyncio: 1.6.0
numpy: 1.24.4
openpyxl: <not installed>
pandas: 2.0.3
pyarrow: 16.1.0
pydantic: 1.10.13
pyiceberg: <not installed>
sqlalchemy: 2.0.23
torch: 2.2.1+cu121
xlsx2csv: <not installed>
xlsxwriter: <not installed>