Skip to content

Commit 3e64135

Browse files
authored
fix: Reading of reencoded categorical in Parquet (#22436)
1 parent 180d740 commit 3e64135

File tree

3 files changed

+31
-9
lines changed

3 files changed

+31
-9
lines changed

crates/polars-parquet/src/arrow/read/deserialize/simple.rs

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -387,9 +387,13 @@ pub fn page_iter_to_array(
387387
// - Int -> String which can be turned into categoricals
388388
assert_eq!(value_type.as_ref(), &ArrowDataType::Utf8View);
389389

390-
if field.metadata.is_none_or(|md| {
391-
!md.contains_key(DTYPE_ENUM_VALUES) && !md.contains_key(DTYPE_CATEGORICAL)
392-
}) {
390+
if field.metadata.is_some_and(|md| {
391+
md.contains_key(DTYPE_ENUM_VALUES) || md.contains_key(DTYPE_CATEGORICAL)
392+
}) && matches!(key_type, IntegerType::UInt32)
393+
{
394+
PageDecoder::new(pages, dtype, CategoricalDecoder::new(), init_nested)?
395+
.collect_boxed(filter)?
396+
} else {
393397
let (nested, array, ptm) = PageDecoder::new(
394398
pages,
395399
ArrowDataType::Utf8View,
@@ -404,10 +408,6 @@ pub fn page_iter_to_array(
404408
.unwrap(),
405409
ptm,
406410
)
407-
} else {
408-
assert_eq!(key_type, &IntegerType::UInt32);
409-
PageDecoder::new(pages, dtype, CategoricalDecoder::new(), init_nested)?
410-
.collect_boxed(filter)?
411411
}
412412
},
413413
(from, to) => {

crates/polars-parquet/src/arrow/read/schema/metadata.rs

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
use arrow::datatypes::{
2-
ArrowDataType, ArrowSchema, DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES, Field, Metadata,
2+
ArrowDataType, ArrowSchema, DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES, Field, IntegerType, Metadata,
33
};
44
use arrow::io::ipc::read::deserialize_schema;
55
use base64::Engine as _;
@@ -27,7 +27,8 @@ fn convert_field(field: &mut Field) {
2727
ArrowDataType::Dictionary(key_type, value_type, sorted) => {
2828
let is_pl_enum_or_categorical = field.metadata.as_ref().is_some_and(|md| {
2929
md.contains_key(DTYPE_ENUM_VALUES) || md.contains_key(DTYPE_CATEGORICAL)
30-
});
30+
}) && matches!(key_type, IntegerType::UInt32)
31+
&& matches!(value_type.as_ref(), ArrowDataType::Utf8View);
3132
let is_int_to_str = matches!(
3233
value_type.as_ref(),
3334
ArrowDataType::Utf8View | ArrowDataType::Utf8 | ArrowDataType::LargeUtf8

py-polars/tests/unit/io/test_parquet.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3180,3 +3180,24 @@ def test_filter_nan_22289() -> None:
31803180
lf.collect().filter(pl.col.a.is_nan()),
31813181
lf.filter(pl.col.a.is_nan()).collect(),
31823182
)
3183+
3184+
3185+
def test_reencode_categoricals_22385() -> None:
3186+
tbl = pl.Series("a", ["abc"], pl.Categorical()).to_frame().to_arrow()
3187+
tbl = tbl.cast(
3188+
pa.schema(
3189+
[
3190+
pa.field(
3191+
"a",
3192+
pa.dictionary(pa.int32(), pa.large_string()),
3193+
metadata=tbl.schema[0].metadata,
3194+
),
3195+
]
3196+
)
3197+
)
3198+
3199+
f = io.BytesIO()
3200+
pq.write_table(tbl, f)
3201+
3202+
f.seek(0)
3203+
pl.scan_parquet(f).collect()

0 commit comments

Comments
 (0)