add order-preserving probes

orlp · orlp · commit 530965ef50e4 · 2025-03-06T13:27:01.000+01:00
diff --git a/crates/polars-expr/src/chunked_idx_table/row_encoded.rs b/crates/polars-expr/src/chunked_idx_table/row_encoded.rs
@@ -70,9 +70,6 @@ impl RowEncodedChunkedIdxTable {
         probe_match: &mut Vec<IdxSize>,
         limit: IdxSize,
     ) -> IdxSize {
-        table_match.clear();
-        probe_match.clear();
-
         let mut keys_processed = 0;
         for (key_idx, hash, key) in hash_keys {
             let found_match = if let Some(key) = key {
diff --git a/crates/polars-expr/src/hash_keys.rs b/crates/polars-expr/src/hash_keys.rs
@@ -74,10 +74,25 @@ impl HashKeys {
         self.len() == 0
     }
 
+    /// Extends `partitions` with the partition of each hash, in key order. Nulls
+    /// get a fixed partition if `partition_nulls` is true, else `IdxSize::MAX`.
+    /// Panics for `Self::Single`: `SingleKeys::gen_partitions` is still `todo!()`.
+    pub fn gen_partitions(
+        &self,
+        partitioner: &HashPartitioner,
+        partitions: &mut Vec<IdxSize>,
+        partition_nulls: bool,
+    ) {
+        match self {
+            Self::RowEncoded(s) => s.gen_partitions(partitioner, partitions, partition_nulls),
+            Self::Single(s) => s.gen_partitions(partitioner, partitions, partition_nulls),
+        }
+    }
+
     /// After this call partition_idxs[p] will be extended with the indices of
     /// hashes that belong to partition p, and the cardinality sketches are
     /// updated accordingly.
-    pub fn gen_partition_idxs(
+    pub fn gen_idxs_per_partition(
         &self,
         partitioner: &HashPartitioner,
         partition_idxs: &mut [Vec<IdxSize>],
@@ -86,13 +101,13 @@ impl HashKeys {
     ) {
         if sketches.is_empty() {
             match self {
-                Self::RowEncoded(s) => s.gen_partition_idxs::<false>(
+                Self::RowEncoded(s) => s.gen_idxs_per_partition::<false>(
                     partitioner,
                     partition_idxs,
                     sketches,
                     partition_nulls,
                 ),
-                Self::Single(s) => s.gen_partition_idxs::<false>(
+                Self::Single(s) => s.gen_idxs_per_partition::<false>(
                     partitioner,
                     partition_idxs,
                     sketches,
@@ -101,13 +116,13 @@ impl HashKeys {
             }
         } else {
             match self {
-                Self::RowEncoded(s) => s.gen_partition_idxs::<true>(
+                Self::RowEncoded(s) => s.gen_idxs_per_partition::<true>(
                     partitioner,
                     partition_idxs,
                     sketches,
                     partition_nulls,
                 ),
-                Self::Single(s) => s.gen_partition_idxs::<true>(
+                Self::Single(s) => s.gen_idxs_per_partition::<true>(
                     partitioner,
                     partition_idxs,
                     sketches,
@@ -159,7 +174,33 @@ pub struct RowEncodedKeys {
 }
 
 impl RowEncodedKeys {
-    pub fn gen_partition_idxs<const BUILD_SKETCHES: bool>(
+    pub fn gen_partitions(
+        &self,
+        partitioner: &HashPartitioner,
+        partitions: &mut Vec<IdxSize>,
+        partition_nulls: bool,
+    ) {
+        partitions.reserve(self.hashes.len());
+        if let Some(validity) = self.keys.validity() {
+            // Null keys: partition 0 (arbitrary choice) when partitioned, else the IdxSize::MAX marker.
+            let null_p = if partition_nulls { 0 } else { IdxSize::MAX };
+            partitions.extend(self.hashes.values_iter().zip(validity).map(|(h, is_v)| {
+                if is_v {
+                    partitioner.hash_to_partition(*h) as IdxSize
+                } else {
+                    null_p
+                }
+            }))
+        } else {
+            partitions.extend(
+                self.hashes
+                    .values_iter()
+                    .map(|h| partitioner.hash_to_partition(*h) as IdxSize),
+            )
+        }
+    }
+
+    pub fn gen_idxs_per_partition<const BUILD_SKETCHES: bool>(
         &self,
         partitioner: &HashPartitioner,
         partition_idxs: &mut [Vec<IdxSize>],
@@ -261,7 +302,16 @@ pub struct SingleKeys {
 }
 
 impl SingleKeys {
-    pub fn gen_partition_idxs<const BUILD_SKETCHES: bool>(
+    pub fn gen_partitions(
+        &self,
+        _partitioner: &HashPartitioner,
+        _partitions: &mut Vec<IdxSize>,
+        _partition_nulls: bool,
+    ) {
+        todo!() // Unimplemented: HashKeys::gen_partitions panics here for the Single variant.
+    }
+
+    pub fn gen_idxs_per_partition<const BUILD_SKETCHES: bool>(
         &self,
         partitioner: &HashPartitioner,
         partition_idxs: &mut [Vec<IdxSize>],
diff --git a/crates/polars-expr/src/idx_table/row_encoded.rs b/crates/polars-expr/src/idx_table/row_encoded.rs
@@ -68,9 +68,6 @@ impl RowEncodedIdxTable {
         probe_match: &mut Vec<IdxSize>,
         limit: IdxSize,
     ) -> IdxSize {
-        table_match.clear();
-        probe_match.clear();
-
         let mut keys_processed = 0;
         for (key_idx, hash, key) in hash_keys {
             let found_match = if let Some(key) = key {
diff --git a/crates/polars-stream/src/nodes/joins/equi_join.rs b/crates/polars-stream/src/nodes/joins/equi_join.rs
@@ -503,7 +503,7 @@ impl BuildState {
                 for p in partition_idxs.iter_mut() {
                     p.clear();
                 }
-                hash_keys.gen_partition_idxs(
+                hash_keys.gen_idxs_per_partition(
                     &partitioner,
                     &mut partition_idxs,
                     &mut sketches,
@@ -678,7 +678,7 @@ impl ProbeState {
                 for p in partition_idxs.iter_mut() {
                     p.clear();
                 }
-                hash_keys.gen_partition_idxs(
+                hash_keys.gen_idxs_per_partition(
                     &partitioner,
                     &mut partition_idxs,
                     &mut [],
@@ -690,6 +690,8 @@ impl ProbeState {
                     let mut out_per_partition = Vec::with_capacity(partitioner.num_partitions());
                     let name = PlSmallStr::from_static("__POLARS_PROBE_PRESERVE_ORDER_IDX");
                     for (p, idxs_in_p) in partitions.iter().zip(&partition_idxs) {
+                        table_match.clear();
+                        probe_match.clear();
                         p.hash_table.probe_subset(
                             &hash_keys,
                             idxs_in_p,
@@ -759,6 +761,8 @@ impl ProbeState {
                     for (p, idxs_in_p) in partitions.iter().zip(&partition_idxs) {
                         let mut offset = 0;
                         while offset < idxs_in_p.len() {
+                            table_match.clear();
+                            probe_match.clear();
                             offset += p.hash_table.probe_subset(
                                 &hash_keys,
                                 &idxs_in_p[offset..],
diff --git a/crates/polars-stream/src/nodes/joins/new_equi_join.rs b/crates/polars-stream/src/nodes/joins/new_equi_join.rs
@@ -516,7 +516,7 @@ impl BuildState {
             let mut payload = select_payload(morsel.df().clone(), payload_selector);
             payload.rechunk_mut();
 
-            hash_keys.gen_partition_idxs(
+            hash_keys.gen_idxs_per_partition(
                 &partitioner,
                 &mut local.morsel_idxs_values_per_p,
                 &mut local.sketch_per_p,
@@ -644,6 +644,8 @@ impl ProbeState {
     ) -> PolarsResult<MorselSeq> {
         // TODO: shuffle after partitioning and keep probe tables thread-local.
         let mut partition_idxs = vec![Vec::new(); partitioner.num_partitions()];
+        let mut probe_partitions = Vec::new();
+        let mut materialized_idxsize_range = Vec::new();
         let mut table_match = Vec::new();
         let mut probe_match = Vec::new();
         let mut max_seq = MorselSeq::default();
@@ -690,79 +692,142 @@ impl ProbeState {
             let max_match_per_key_est = selectivity_estimate as usize + 16;
             let out_est_size = ((selectivity_estimate * 1.2 * df_height as f64) as usize).min(probe_limit as usize);
             build_out.reserve(out_est_size + max_match_per_key_est);
-            probe_out.reserve(out_est_size + max_match_per_key_est);
 
             unsafe {
-                // Partition and probe the tables.
-                for p in partition_idxs.iter_mut() {
-                    p.clear();
-                }
-                hash_keys.gen_partition_idxs(
-                    &partitioner,
-                    &mut partition_idxs,
-                    &mut [],
-                    emit_unmatched,
-                );
+                let new_morsel = |build: &mut DataFrameBuilder, probe: &mut DataFrameBuilder| {
+                    let mut build_df = build.freeze_reset();
+                    let mut probe_df = probe.freeze_reset();
+                    let out_df = if params.left_is_build.unwrap() {
+                        build_df.hstack_mut_unchecked(probe_df.get_columns());
+                        build_df
+                    } else {
+                        probe_df.hstack_mut_unchecked(build_df.get_columns());
+                        probe_df
+                    };
+                    let out_df = postprocess_join(out_df, params);
+                    Morsel::new(out_df, seq, src_token.clone())
+                };
+
                 if params.preserve_order_probe {
-                    todo!()
-                } else {
-                    let new_morsel = |mut build_df: DataFrame, mut probe_df: DataFrame| {
-                        let out_df = if params.left_is_build.unwrap() {
-                            build_df.hstack_mut_unchecked(probe_df.get_columns());
-                            build_df
-                        } else {
-                            probe_df.hstack_mut_unchecked(build_df.get_columns());
-                            probe_df
+                    // To preserve order, probe in morsel order (not in bulk per partition), grouping
+                    // consecutive keys of one partition. gen_partitions appends, so reset the buffer.
+                    probe_partitions.clear();
+                    hash_keys.gen_partitions(&partitioner, &mut probe_partitions, emit_unmatched);
+                    // Drop trailing unpartitioned keys: they emit nothing but would make us skip the end-of-morsel flush.
+                    probe_partitions.truncate(probe_partitions.iter().rposition(|p| partitions.get(*p as usize).is_some()).map_or(0, |i| i + 1));
+                    let mut probe_group_start = 0;
+                    while probe_group_start < probe_partitions.len() {
+                        let p_idx = probe_partitions[probe_group_start];
+                        let mut probe_group_end = probe_group_start + 1;
+                        while probe_partitions.get(probe_group_end) == Some(&p_idx) {
+                            probe_group_end += 1;
+                        }
+                        let Some(p) = partitions.get(p_idx as usize) else {
+                            probe_group_start = probe_group_end;
+                            continue;
                         };
-                        let out_df = postprocess_join(out_df, params);
-                        Morsel::new(out_df, seq, src_token.clone())
-                    };
+                        materialized_idxsize_range.extend(materialized_idxsize_range.len() as IdxSize..probe_group_end as IdxSize);
+                        while probe_group_start < probe_group_end {
+                            let matches_before_limit = probe_limit - probe_match.len() as IdxSize;
+                            table_match.clear();
+                            probe_group_start += p.hash_table.probe_subset(
+                                &hash_keys,
+                                &materialized_idxsize_range[probe_group_start..probe_group_end],
+                                &mut table_match,
+                                &mut probe_match,
+                                mark_matches,
+                                emit_unmatched,
+                                matches_before_limit,
+                            ) as usize;
+                            
+                            if emit_unmatched {
+                                build_out.opt_gather_extend(&p.payload, &table_match, ShareStrategy::Always);
+                            } else {
+                                build_out.gather_extend(&p.payload, &table_match, ShareStrategy::Always);
+                            };
+
+                            if probe_match.len() >= probe_limit as usize || probe_group_start == probe_partitions.len() {
+                                if !payload_rechunked {
+                                    payload.rechunk_mut();
+                                    payload_rechunked = true;
+                                }
+                                probe_out.gather_extend(&payload, &probe_match, ShareStrategy::Always);
+                                probe_match.clear();
+                                let out_morsel = new_morsel(&mut build_out, &mut probe_out);
+                                if send.send(out_morsel).await.is_err() {
+                                    return Ok(max_seq);
+                                }
+                                if probe_group_end != probe_partitions.len() {
+                                    // We had enough matches to need a mid-partition flush, let's assume there are a lot of
+                                    // matches and just do a large reserve.
+                                    build_out.reserve(probe_limit as usize + max_match_per_key_est);
+                                }
+                            }
+                        }
+                    }
+                } else {
+                    // Partition and probe the tables.
+                    for p in partition_idxs.iter_mut() {
+                        p.clear();
+                    }
+                    hash_keys.gen_idxs_per_partition(
+                        &partitioner,
+                        &mut partition_idxs,
+                        &mut [],
+                        emit_unmatched,
+                    );
 
                     for (p, idxs_in_p) in partitions.iter().zip(&partition_idxs) {
                         let mut offset = 0;
                         while offset < idxs_in_p.len() {
+                            let matches_before_limit = probe_limit - probe_match.len() as IdxSize;
+                            table_match.clear();
                             offset += p.hash_table.probe_subset(
                                 &hash_keys,
                                 &idxs_in_p[offset..],
                                 &mut table_match,
                                 &mut probe_match,
                                 mark_matches,
                                 emit_unmatched,
-                                probe_limit - probe_out.len() as IdxSize,
+                                matches_before_limit,
                             ) as usize;
                             
-                            if probe_match.is_empty() {
+                            if table_match.is_empty() {
                                 continue;
                             }
-                            total_matches += probe_match.len();
+                            total_matches += table_match.len();
 
-                            // Gather output and send.
                             if emit_unmatched {
                                 build_out.opt_gather_extend(&p.payload, &table_match, ShareStrategy::Always);
                             } else {
                                 build_out.gather_extend(&p.payload, &table_match, ShareStrategy::Always);
                             };
-                            if !payload_rechunked {
-                                payload.rechunk_mut();
-                                payload_rechunked = true;
-                            }
-                            probe_out.gather_extend(&payload, &probe_match, ShareStrategy::Always);
                             
-                            if probe_out.len() >= probe_limit as usize {
-                                let out_morsel = new_morsel(build_out.freeze_reset(), probe_out.freeze_reset());
+                            if probe_match.len() >= probe_limit as usize {
+                                if !payload_rechunked {
+                                    payload.rechunk_mut();
+                                    payload_rechunked = true;
+                                }
+                                probe_out.gather_extend(&payload, &probe_match, ShareStrategy::Always);
+                                probe_match.clear();
+                                let out_morsel = new_morsel(&mut build_out, &mut probe_out);
                                 if send.send(out_morsel).await.is_err() {
                                     return Ok(max_seq);
                                 }
                                 // We had enough matches to need a mid-partition flush, let's assume there are a lot of
                                 // matches and just do a large reserve.
                                 build_out.reserve(probe_limit as usize + max_match_per_key_est);
-                                probe_out.reserve(probe_limit as usize + max_match_per_key_est);
                             }
                         }
                     }
 
-                    if !probe_out.is_empty() {
-                        let out_morsel = new_morsel(build_out.freeze_reset(), probe_out.freeze_reset());
+                    if !probe_match.is_empty() {
+                        if !payload_rechunked {
+                            payload.rechunk_mut();
+                        }
+                        probe_out.gather_extend(&payload, &probe_match, ShareStrategy::Always);
+                        probe_match.clear();
+                        let out_morsel = new_morsel(&mut build_out, &mut probe_out);
                         if send.send(out_morsel).await.is_err() {
                             return Ok(max_seq);
                         }
diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs
@@ -809,8 +809,12 @@ fn to_graph_rec<'a>(
                 .map(|e| create_stream_expr(e, ctx, &right_input_schema))
                 .try_collect_vec()?;
 
-            // TODO: implement order-maintaining join in new join impl.
-            if args.maintain_order == MaintainOrderJoin::None {
+            // TODO: implement build-side order-maintaining join in new join impl.
+            let preserve_order_build = matches!(
+                args.maintain_order,
+                MaintainOrderJoin::LeftRight | MaintainOrderJoin::RightLeft
+            );
+            if !preserve_order_build {
                 ctx.graph.add_node(
                     nodes::joins::new_equi_join::EquiJoinNode::new(
                         left_input_schema,