delta-io · zachschuermann · Mar 14, 2024 · Mar 5, 2024 · Mar 8, 2024 · Mar 8, 2024
diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml
@@ -9,8 +9,6 @@ readme.workspace = true
 version.workspace = true
 
 [dependencies]
-arrow-array = { version = "^49.0" }
-arrow-select = { version = "^49.0" }
 bytes = "1.4"
 chrono = { version = "0.4" }
 either = "1.8"
@@ -33,6 +31,8 @@ z85 = "3.0.5"
 visibility = "0.1.0"
 
 # Used in default client
+arrow-array = { version = "^49.0", optional = true }
+arrow-select = { version = "^49.0", optional = true }
 arrow-arith = { version = "^49.0", optional = true }
 arrow-json = { version = "^49.0", optional = true }
 arrow-ord = { version = "^49.0", optional = true }
@@ -49,11 +49,13 @@ tokio = { version = "1", optional = true, features = ["rt-multi-thread"] }
 arrow-conversion = ["arrow-schema"]
 default = ["simple-client"]
 default-client = [
+  "arrow-array",
   "arrow-conversion",
   "arrow-arith",
   "arrow-json",
   "arrow-ord",
   "arrow-schema",
+  "arrow-select",
   "futures",
   "object_store",
   "parquet/async",
@@ -63,8 +65,10 @@ default-client = [
 
 developer-visibility = []
 simple-client = [
+  "arrow-array",
   "arrow-conversion",
   "arrow-json",
+  "arrow-select",
   "parquet"
 ]
 

diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs
@@ -62,6 +62,21 @@ impl DataVisitor for MetadataVisitor {
     }
 }
 
+#[derive(Default)]
+pub(crate) struct SelectionVectorVisitor {
+    pub(crate) selection_vector: Vec<bool>,
+}
+
+impl DataVisitor for SelectionVectorVisitor {
+    fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> {
+        for i in 0..row_count {
+            self.selection_vector
+                .push(getters[0].get(i, "selectionvector.output")?)
+        }
+        Ok(())
+    }
+}
+
 #[derive(Default)]
 pub(crate) struct ProtocolVisitor {
     pub(crate) protocol: Option<Protocol>,

diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs
@@ -95,7 +95,7 @@ pub struct FileMeta {
 /// Connectors can implement this interface to optimize the evaluation using the
 /// connector specific capabilities.
 pub trait ExpressionEvaluator {
-    /// Evaluate the expression on given ColumnarBatch data.
+    /// Evaluate the expression on a given EngineData.
     ///
     /// Contains one value for each row of the input.
     /// The data type of the output is same as the type output of the expression this evaluator is using.

diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs
@@ -1,14 +1,12 @@
 use std::collections::HashSet;
 use std::sync::Arc;
 
-use arrow_array::{Array, BooleanArray};
-use arrow_select::filter::filter_record_batch;
 use tracing::debug;
 
-use crate::error::{DeltaResult, Error};
+use crate::actions::visitors::SelectionVectorVisitor;
+use crate::error::DeltaResult;
 use crate::expressions::{BinaryOperator, Expression as Expr, VariadicOperator};
 use crate::schema::{DataType, SchemaRef, StructField, StructType};
-use crate::simple_client::data::SimpleData;
 use crate::{EngineData, EngineInterface, ExpressionEvaluator, JsonHandler};
 
 /// Returns <op2> (if any) such that B <op2> A is equivalent to A <op> B.
@@ -116,8 +114,8 @@ impl DataSkippingFilter {
             static ref PREDICATE_SCHEMA: DataType = StructType::new(vec![
                 StructField::new("predicate", DataType::BOOLEAN, true),
             ]).into();
-            static ref FILTER_EXPR: Expr = Expr::column("predicate").distinct(Expr::literal(false));
             static ref STATS_EXPR: Expr = Expr::column("add.stats");
+            static ref FILTER_EXPR: Expr = Expr::column("predicate").distinct(Expr::literal(false));
         );
 
         let predicate = match predicate {
@@ -147,17 +145,21 @@ impl DataSkippingFilter {
 
         // Skipping happens in several steps:
         //
-        // 1. The predicate produces false for any file whose stats prove we can safely skip it. A
-        //    value of true means the stats say we must keep the file, and null means we could not
-        //    determine whether the file is safe to skip, because its stats were missing/null.
-        //
-        // 2. The nullif(skip, skip) converts true (= keep) to null, producing a result
-        //    that contains only false (= skip) and null (= keep) values.
+        // 1. The stats selector fetches add.stats from the metadata.
         //
-        // 3. The is_null converts null to true, producing a result that contains only true (=
-        //    keep) and false (= skip) values.
+        // 2. The predicate (skipping evaluator) produces false for any file whose stats prove we
+        //    can safely skip it. A value of true means the stats say we must keep the file, and
+        //    null means we could not determine whether the file is safe to skip, because its stats
+        //    were missing/null.
         //
-        // 4. The filter discards every file whose selection vector entry is false.
+        // 3. The filter evaluator computes `predicate IS DISTINCT FROM false`: true (= keep) when
+        //    the predicate is true/null and false (= skip) when the predicate is false.
+        let select_stats_evaluator = table_client.get_expression_handler().get_evaluator(
+            stats_schema.clone(),
+            STATS_EXPR.clone(),
+            DataType::STRING,
+        );
+
         let skipping_evaluator = table_client.get_expression_handler().get_evaluator(
             stats_schema.clone(),
             Expr::struct_expr([as_data_skipping_predicate(predicate)?]),
@@ -170,12 +172,6 @@ impl DataSkippingFilter {
             DataType::BOOLEAN,
         );
 
-        let select_stats_evaluator = table_client.get_expression_handler().get_evaluator(
-            stats_schema.clone(),
-            STATS_EXPR.clone(),
-            DataType::STRING,
-        );
-
         Some(Self {
             stats_schema,
             select_stats_evaluator,
@@ -185,44 +181,35 @@ impl DataSkippingFilter {
         })
     }
 
-    // TODO(nick): This should not be expressed in terms of SimpleData, but should use only the
-    // expression API
-    pub(crate) fn apply(&self, actions: &dyn EngineData) -> DeltaResult<Box<dyn EngineData>> {
+    /// Apply the DataSkippingFilter to an EngineData batch of actions. Returns a selection vector
+    /// which can be applied to the actions to find those that passed data skipping.
+    pub(crate) fn apply(&self, actions: &dyn EngineData) -> DeltaResult<Vec<bool>> {
+        // retrieve and parse stats from actions data
         let stats = self.select_stats_evaluator.evaluate(actions)?;
         let parsed_stats = self
             .json_handler
             .parse_json(stats, self.stats_schema.clone())?;
 
+        // evaluate the predicate on the parsed stats, then convert to selection vector
         let skipping_predicate = self.skipping_evaluator.evaluate(&*parsed_stats)?;
-
-        let skipping_vector = self
+        let selection_vector = self
             .filter_evaluator
             .evaluate(skipping_predicate.as_ref())?;
-        let skipping_vector = skipping_vector
-            .as_any()
-            .downcast_ref::<SimpleData>()
-            .ok_or(Error::engine_data_type("SimpleData"))?
-            .record_batch()
-            .column(0);
-        let skipping_vector = skipping_vector
-            .as_any()
-            .downcast_ref::<BooleanArray>()
-            .ok_or(Error::unexpected_column_type(
-                "Expected type 'BooleanArray'.",
-            ))?;
 
-        let before_count = actions.length();
-        let actions = actions
-            .as_any()
-            .downcast_ref::<SimpleData>()
-            .ok_or(Error::engine_data_type("SimpleData"))?
-            .record_batch();
-        let after = filter_record_batch(actions, skipping_vector)?;
-        debug!(
-            "number of actions before/after data skipping: {before_count} / {}",
-            after.num_rows()
-        );
-        Ok(Box::new(SimpleData::new(after)))
+        // visit the engine's selection vector to produce a Vec<bool>
+        let mut visitor = SelectionVectorVisitor::default();
+        let schema = StructType::new(vec![StructField::new("output", DataType::BOOLEAN, false)]);
+        selection_vector
+            .as_ref()
+            .extract(Arc::new(schema), &mut visitor)?;
+        Ok(visitor.selection_vector)
+
+        // TODO(zach): add some debug info about data skipping that occurred
+        // let before_count = actions.length();
+        // debug!(
+        //     "number of actions before/after data skipping: {before_count} / {}",
+        //     filtered_actions.num_rows()
+        // );
     }
 }
 

diff --git a/kernel/src/scan/file_stream.rs b/kernel/src/scan/file_stream.rs
@@ -24,21 +24,39 @@ struct LogReplayScanner {
 struct AddRemoveVisitor {
     adds: Vec<Add>,
     removes: Vec<Remove>,
+    selection_vector: Option<Vec<bool>>,
 }
 
 const ADD_FIELD_COUNT: usize = 15;
 
+impl AddRemoveVisitor {
+    fn new(selection_vector: Option<Vec<bool>>) -> Self {
+        AddRemoveVisitor {
+            selection_vector,
+            ..Default::default()
+        }
+    }
+}
+
 impl DataVisitor for AddRemoveVisitor {
     fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> {
         for i in 0..row_count {
             // Add will have a path at index 0 if it is valid
             if let Some(path) = getters[0].get_opt(i, "add.path")? {
-                self.adds
-                    .push(AddVisitor::visit_add(i, path, &getters[..ADD_FIELD_COUNT])?);
+                // Keep the file unless the selection vector is present and is false for this row
+                if !self
+                    .selection_vector
+                    .as_ref()
+                    .is_some_and(|selection| !selection[i])
+                {
+                    self.adds
+                        .push(AddVisitor::visit_add(i, path, &getters[..ADD_FIELD_COUNT])?)
+                }
             }
             // Remove will have a path at index 15 if it is valid
             // TODO(nick): Should count the fields in Add to ensure we don't get this wrong if more
             // are added
+            // TODO(zach): add a check that the selection vector never skips a remove
             else if let Some(path) = getters[ADD_FIELD_COUNT].get_opt(i, "remove.path")? {
                 let remove_getters = &getters[ADD_FIELD_COUNT..];
                 self.removes
@@ -70,15 +88,13 @@ impl LogReplayScanner {
         actions: &dyn EngineData,
         is_log_batch: bool,
     ) -> DeltaResult<Vec<Add>> {
-        let filtered_actions = self
+        // apply data skipping to get back a selection vector for actions that passed skipping
+        // note: None means no skipping filter is set, so every file is kept.
+        let selection_vector = self
             .filter
             .as_ref()
             .map(|filter| filter.apply(actions))
             .transpose()?;
-        let actions = match filtered_actions {
-            Some(ref filtered_actions) => filtered_actions.as_ref(),
-            None => actions,
-        };
 
         let schema_to_use = StructType::new(if is_log_batch {
             vec![
@@ -90,12 +106,12 @@ impl LogReplayScanner {
             // only serve as tombstones for vacuum jobs. So no need to load them here.
             vec![crate::actions::schemas::ADD_FIELD.clone()]
         });
-        let mut visitor = AddRemoveVisitor::default();
+        let mut visitor = AddRemoveVisitor::new(selection_vector);
         actions.extract(Arc::new(schema_to_use), &mut visitor)?;
 
         for remove in visitor.removes.into_iter() {
-            self.seen
-                .insert((remove.path.clone(), remove.dv_unique_id()));
+            let dv_id = remove.dv_unique_id();
+            self.seen.insert((remove.path, dv_id));
         }
 
         visitor