Skip to content

chore: Purge old parquet and scan code #22226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Apr 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ streaming-iterator = "0.1.9"
strength_reduce = "0.2"
strum = "0.26"
strum_macros = "0.26"
tokio = "1.44"
tokio = { version = "1.44", default-features = false }
tokio-util = "0.7.8"
unicode-normalization = "0.1.24"
unicode-reverse = "1.0.8"
Expand Down
155 changes: 0 additions & 155 deletions crates/polars-expr/src/expressions/apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@ use polars_core::POOL;
use polars_core::chunked_array::builder::get_list_builder;
use polars_core::chunked_array::from_iterator_par::try_list_from_par_iter;
use polars_core::prelude::*;
#[cfg(feature = "parquet")]
use polars_io::predicates::{BatchStats, StatsEvaluator};
#[cfg(feature = "is_between")]
use polars_ops::prelude::ClosedInterval;
use rayon::prelude::*;

use super::*;
Expand Down Expand Up @@ -425,23 +421,6 @@ impl PhysicalExpr for ApplyExpr {
/// Resolve the output field (name + dtype) this expression produces for
/// the given input schema, by delegating to the wrapped expression in the
/// default context.
fn to_field(&self, input_schema: &Schema) -> PolarsResult<Field> {
    let ctx = Context::Default;
    self.expr.to_field(input_schema, ctx)
}
#[cfg(feature = "parquet")]
fn as_stats_evaluator(&self) -> Option<&dyn StatsEvaluator> {
    // Only plain function expressions can be evaluated against statistics.
    let Expr::Function { function, .. } = &self.expr else {
        return None;
    };

    // Whitelist of boolean functions for which min/max/null-count
    // statistics allow a conclusive skip decision.
    let supported = match function {
        FunctionExpr::Boolean(BooleanFunction::IsNull)
        | FunctionExpr::Boolean(BooleanFunction::IsNotNull) => true,
        #[cfg(feature = "is_in")]
        FunctionExpr::Boolean(BooleanFunction::IsIn { .. }) => true,
        #[cfg(feature = "is_between")]
        FunctionExpr::Boolean(BooleanFunction::IsBetween { closed: _ }) => true,
        _ => false,
    };

    if supported { Some(self) } else { None }
}
fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> {
if self.inputs.len() == 1 && matches!(self.collect_groups, ApplyOptions::ElementWise) {
Some(self)
Expand Down Expand Up @@ -518,140 +497,6 @@ fn apply_multiple_elementwise<'a>(
}
}

#[cfg(feature = "parquet")]
impl StatsEvaluator for ApplyExpr {
    /// Decide from batch statistics whether the data must be read.
    /// Thin delegation to `should_read_impl`.
    fn should_read(&self, stats: &BatchStats) -> PolarsResult<bool> {
        self.should_read_impl(stats)
    }
}

#[cfg(feature = "parquet")]
impl ApplyExpr {
// Statistics-based pruning: returns `Ok(false)` only when the batch
// statistics prove the predicate cannot match any row, otherwise
// `Ok(true)` (conservative default: read the data).
fn should_read_impl(&self, stats: &BatchStats) -> PolarsResult<bool> {
// Only plain function expressions are analyzable; anything else: read.
let (function, input) = match &self.expr {
Expr::Function {
function, input, ..
} => (function, input),
_ => return Ok(true),
};
// Ensure the input of the function is only a `col(..)`.
// If it does any arithmetic the code below is flawed.
if !matches!(input[0], Expr::Column(_)) {
return Ok(true);
}

match function {
// is_null(): skip only when the null count is known to be exactly 0.
FunctionExpr::Boolean(BooleanFunction::IsNull) => {
let root = expr_to_leaf_column_name(&self.expr)?;

match stats.get_stats(&root).ok() {
Some(st) => match st.null_count() {
Some(0) => Ok(false),
_ => Ok(true),
},
None => Ok(true),
}
},
// is_not_null(): skip only when every row is known to be null
// (null_count == num_rows).
FunctionExpr::Boolean(BooleanFunction::IsNotNull) => {
let root = expr_to_leaf_column_name(&self.expr)?;

match stats.get_stats(&root).ok() {
Some(st) => match st.null_count() {
Some(null_count)
if stats
.num_rows()
.is_some_and(|num_rows| num_rows == null_count) =>
{
Ok(false)
},
_ => Ok(true),
},
None => Ok(true),
}
},
#[cfg(feature = "is_in")]
// is_in(values): skip when every candidate value lies strictly
// outside the column's [min, max] range. Any `None` inside the
// closure (missing stats, failed comparison) falls back to "read".
FunctionExpr::Boolean(BooleanFunction::IsIn { .. }) => {
let should_read = || -> Option<bool> {
let root = expr_to_leaf_column_name(&input[0]).ok()?;

let input = self.inputs[1].evaluate_inline()?;
let input = input.as_materialized_series();

let st = stats.get_stats(&root).ok()?;
let min = st.to_min()?;
let max = st.to_max()?;

// min == max: the column is constant over this batch, so a
// single equality test against any candidate is conclusive.
if max.get(0).unwrap() == min.get(0).unwrap() {
let one_equals =
|value: &Series| Some(ChunkCompareEq::equal(input, value).ok()?.any());
return one_equals(min);
}

// Skip only when ALL candidates are below min or above max.
let smaller = ChunkCompareIneq::lt(input, min).ok()?;
let bigger = ChunkCompareIneq::gt(input, max).ok()?;

Some(!(smaller | bigger).all())
};

Ok(should_read().unwrap_or(true))
},
#[cfg(feature = "is_between")]
// is_between(left, right, closed): skip when the interval is empty
// or provably disjoint from the column's [min, max] range.
FunctionExpr::Boolean(BooleanFunction::IsBetween { closed }) => {
let should_read = || -> Option<bool> {
let root: PlSmallStr = expr_to_leaf_column_name(&input[0]).ok()?;

let left = self.inputs[1]
.evaluate_inline()?
.as_materialized_series()
.clone();
let right = self.inputs[2]
.evaluate_inline()?
.as_materialized_series()
.clone();

let st = stats.get_stats(&root).ok()?;
let min = st.to_min()?;
let max = st.to_max()?;

// don't read the row_group anyways as
// the condition will evaluate to false.
// e.g. in_between(10, 5)
if ChunkCompareIneq::gt(&left, &right).ok()?.all() {
return Some(false);
}

// Which endpoints are excluded from the interval.
let (left_open, right_open) = match closed {
ClosedInterval::None => (true, true),
ClosedInterval::Both => (false, false),
ClosedInterval::Left => (false, true),
ClosedInterval::Right => (true, false),
};
// check the right limit of the interval.
// if the end is open, we should be stricter (lt_eq instead of lt).
if right_open && ChunkCompareIneq::lt_eq(&right, min).ok()?.all()
|| !right_open && ChunkCompareIneq::lt(&right, min).ok()?.all()
{
return Some(false);
}
// we couldn't conclude anything using the right limit,
// check the left limit of the interval
if left_open && ChunkCompareIneq::gt_eq(&left, max).ok()?.all()
|| !left_open && ChunkCompareIneq::gt(&left, max).ok()?.all()
{
return Some(false);
}
// read the row_group
Some(true)
};

Ok(should_read().unwrap_or(true))
},
// Any other function: no statistics-based conclusion possible.
_ => Ok(true),
}
}
}

impl PartitionedAggregation for ApplyExpr {
fn evaluate_partitioned(
&self,
Expand Down
Loading
Loading