Skip to content

Commit f867c78

Browse files
committed
fix the deletion of items to be ISO with main, discovered a bug in the process
1 parent 714d474 commit f867c78

File tree

2 files changed

+204
-52
lines changed

2 files changed

+204
-52
lines changed

src/tests/writer.rs

+82-25
Original file line numberDiff line numberDiff line change
@@ -609,8 +609,7 @@ fn delete_one_item() {
609609
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 4, 5]>, roots: [0], distance: "euclidean" }
610610
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
611611
Tree 1: Descendants(Descendants { descendants: [1, 5] })
612-
Tree 2: Descendants(Descendants { descendants: [4] })
613-
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(2), right: Item(2), normal: [0.0000, 0.0000] })
612+
Tree 3: Descendants(Descendants { descendants: [2, 4] })
614613
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
615614
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
616615
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
@@ -634,8 +633,7 @@ fn delete_one_item() {
634633
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 2, 4, 5]>, roots: [0], distance: "euclidean" }
635634
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
636635
Tree 1: Descendants(Descendants { descendants: [5] })
637-
Tree 2: Descendants(Descendants { descendants: [4] })
638-
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(2), right: Item(2), normal: [0.0000, 0.0000] })
636+
Tree 3: Descendants(Descendants { descendants: [2, 4] })
639637
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
640638
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
641639
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })
@@ -909,8 +907,10 @@ fn delete_extraneous_tree() {
909907
"###);
910908
}
911909

910+
// If we have multiple split node and empty all the branch of the top split node (root node)
911+
// See https://github.com/meilisearch/arroy/issues/117
912912
#[test]
913-
fn reuse_node_id() {
913+
fn create_root_split_node_with_empty_child() {
914914
let handle = create_database::<Euclidean>();
915915
let mut rng = rng();
916916
let mut wtxn = handle.env.write_txn().unwrap();
@@ -942,32 +942,38 @@ fn reuse_node_id() {
942942
let mut wtxn = handle.env.write_txn().unwrap();
943943
let writer = Writer::new(handle.database, 0, 2);
944944

945-
// if we delete the 1 it should free the node id 0
945+
// if we delete the 1 and 5 the tree node 4 should remove itself and be replaced by the 3rd one
946946
writer.del_item(&mut wtxn, 1).unwrap();
947+
writer.del_item(&mut wtxn, 5).unwrap();
947948
writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap();
948949
wtxn.commit().unwrap();
949950

950951
insta::assert_snapshot!(handle, @r###"
951952
==================
952953
Dumping index 0
953-
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 2, 3, 4, 5]>, roots: [0], distance: "euclidean" }
954+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 2, 3, 4]>, roots: [0], distance: "euclidean" }
954955
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
955-
Tree 1: Descendants(Descendants { descendants: [5] })
956+
Tree 1: Descendants(Descendants { descendants: [] })
956957
Tree 2: Descendants(Descendants { descendants: [3, 4] })
957958
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(2), right: Item(2), normal: [0.0000, 0.0000] })
958959
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
959960
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
960961
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })
961962
Item 3: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [3.0000, 0.0000] })
962963
Item 4: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [4.0000, 0.0000] })
963-
Item 5: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [5.0000, 0.0000] })
964964
"###);
965+
}
965966

967+
#[test]
968+
fn reuse_node_id() {
969+
let handle = create_database::<Euclidean>();
970+
let mut rng = rng();
966971
let mut wtxn = handle.env.write_txn().unwrap();
967972
let writer = Writer::new(handle.database, 0, 2);
968973

969-
// if we re-insert the 1 the node id 0 should be re-used
970-
writer.add_item(&mut wtxn, 1, &[1., 0.]).unwrap();
974+
for i in 0..6 {
975+
writer.add_item(&mut wtxn, i, &[i as f32, 0.]).unwrap();
976+
}
971977
writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap();
972978
wtxn.commit().unwrap();
973979

@@ -976,11 +982,59 @@ fn reuse_node_id() {
976982
Dumping index 0
977983
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4, 5]>, roots: [0], distance: "euclidean" }
978984
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
979-
Tree 1: Descendants(Descendants { descendants: [5] })
980-
Tree 2: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(5), right: Item(1), normal: [0.0000, 0.0000] })
985+
Tree 1: Descendants(Descendants { descendants: [1, 5] })
986+
Tree 2: Descendants(Descendants { descendants: [3, 4] })
981987
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(2), right: Item(2), normal: [0.0000, 0.0000] })
982988
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
983-
Tree 5: Descendants(Descendants { descendants: [3, 4] })
989+
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
990+
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
991+
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })
992+
Item 3: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [3.0000, 0.0000] })
993+
Item 4: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [4.0000, 0.0000] })
994+
Item 5: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [5.0000, 0.0000] })
995+
"###);
996+
997+
let mut wtxn = handle.env.write_txn().unwrap();
998+
let writer = Writer::new(handle.database, 0, 2);
999+
1000+
// if we delete the 3 and 4 it should free the tree node 2
1001+
writer.del_item(&mut wtxn, 3).unwrap();
1002+
writer.del_item(&mut wtxn, 4).unwrap();
1003+
writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap();
1004+
wtxn.commit().unwrap();
1005+
1006+
insta::assert_snapshot!(handle, @r###"
1007+
==================
1008+
Dumping index 0
1009+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 5]>, roots: [0], distance: "euclidean" }
1010+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
1011+
Tree 1: Descendants(Descendants { descendants: [1, 5] })
1012+
Tree 3: Descendants(Descendants { descendants: [2] })
1013+
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
1014+
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
1015+
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
1016+
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })
1017+
Item 5: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [5.0000, 0.0000] })
1018+
"###);
1019+
1020+
let mut wtxn = handle.env.write_txn().unwrap();
1021+
let writer = Writer::new(handle.database, 0, 2);
1022+
1023+
// if we re-insert both nodes, the id 2 should be re-used
1024+
writer.add_item(&mut wtxn, 3, &[3., 0.]).unwrap();
1025+
writer.add_item(&mut wtxn, 4, &[4., 0.]).unwrap();
1026+
writer.builder(&mut rng).n_trees(1).build(&mut wtxn).unwrap();
1027+
wtxn.commit().unwrap();
1028+
1029+
insta::assert_snapshot!(handle, @r###"
1030+
==================
1031+
Dumping index 0
1032+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4, 5]>, roots: [0], distance: "euclidean" }
1033+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
1034+
Tree 1: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(2), right: Item(1), normal: [0.0000, 0.0000] })
1035+
Tree 2: Descendants(Descendants { descendants: [4, 5] })
1036+
Tree 3: Descendants(Descendants { descendants: [2, 3] })
1037+
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
9841038
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
9851039
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
9861040
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })
@@ -999,17 +1053,20 @@ fn reuse_node_id() {
9991053
insta::assert_snapshot!(handle, @r###"
10001054
==================
10011055
Dumping index 0
1002-
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4, 5]>, roots: [4, 9], distance: "euclidean" }
1003-
Tree 0: Descendants(Descendants { descendants: [2, 3] })
1004-
Tree 1: Descendants(Descendants { descendants: [1, 3] })
1005-
Tree 2: Descendants(Descendants { descendants: [4, 5] })
1006-
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(5), right: Tree(2), normal: [0.0000, 0.0000] })
1007-
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(3), normal: [1.0000, 0.0000] })
1008-
Tree 5: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(0), right: Item(1), normal: [0.0000, 0.0000] })
1009-
Tree 6: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Item(5), normal: [0.0000, 0.0000] })
1010-
Tree 7: Descendants(Descendants { descendants: [2, 4] })
1011-
Tree 8: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(6), right: Tree(7), normal: [0.0000, 0.0000] })
1012-
Tree 9: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(8), right: Item(0), normal: [-1.0000, 0.0000] })
1056+
Root: Metadata { dimensions: 2, items: RoaringBitmap<[0, 1, 2, 3, 4, 5]>, roots: [0, 6], distance: "euclidean" }
1057+
Tree 0: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(0), right: Tree(4), normal: [1.0000, 0.0000] })
1058+
Tree 1: Descendants(Descendants { descendants: [5] })
1059+
Tree 2: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(5), right: Item(1), normal: [0.0000, 0.0000] })
1060+
Tree 3: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(2), right: Item(2), normal: [0.0000, 0.0000] })
1061+
Tree 4: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(1), right: Tree(3), normal: [0.0000, 0.0000] })
1062+
Tree 5: Descendants(Descendants { descendants: [3, 4] })
1063+
Tree 6: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(12), right: Item(0), normal: [-1.0000, 0.0000] })
1064+
Tree 7: Descendants(Descendants { descendants: [4, 5] })
1065+
Tree 8: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Item(2), right: Tree(7), normal: [0.0000, 0.0000] })
1066+
Tree 9: Descendants(Descendants { descendants: [] })
1067+
Tree 10: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(8), right: Tree(9), normal: [0.0000, 0.0000] })
1068+
Tree 11: Descendants(Descendants { descendants: [1, 3] })
1069+
Tree 12: SplitPlaneNormal(SplitPlaneNormal<euclidean> { left: Tree(10), right: Tree(11), normal: [0.0000, 0.0000] })
10131070
Item 0: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [0.0000, 0.0000] })
10141071
Item 1: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [1.0000, 0.0000] })
10151072
Item 2: Leaf(Leaf { header: NodeHeaderEuclidean { bias: 0.0 }, vector: [2.0000, 0.0000] })

src/writer.rs

+122-27
Original file line numberDiff line numberDiff line change
@@ -491,13 +491,13 @@ impl<D: Distance> Writer<D> {
491491
return Err(Error::BuildCancelled);
492492
}
493493

494-
// Before taking any references on the DB, remove all the items we must remove.
495-
self.scan_tree_nodes_and_delete_items(wtxn, options, &to_delete)?;
496-
497494
let used_node_ids = self.used_tree_node(wtxn)?;
498495
let nb_tree_nodes = used_node_ids.len();
499496
let concurrent_node_ids = ConcurrentNodeIds::new(used_node_ids);
500497

498+
// Before taking any references on the DB, remove all the items we must remove.
499+
self.scan_trees_and_delete_items(wtxn, options, &roots, &to_delete)?;
500+
501501
let mut descendants_too_big = self.insert_items_in_current_trees(
502502
wtxn,
503503
rng,
@@ -696,11 +696,6 @@ impl<D: Distance> Writer<D> {
696696
let tmp_descendant_to_write =
697697
self.insert_items_in_tree(options, rng, roots, &to_insert, &frozzen_reader)?;
698698

699-
// TODO: The only way to avoid this synchronization point is to forbid the use of item_id directly in the split nodes.
700-
// That would ensure we only ever overwrite descendant nodes and we could do the synchronization to the DB only
701-
// once after the whole loop.
702-
// Doing the synchronization after the loop means we would not need to retrieve the item_id for every iteration.
703-
// This requires making a new version of arroy.
704699
for (tmp_node, descendants) in tmp_descendant_to_write.iter() {
705700
descendants_too_big |= descendants;
706701

@@ -813,40 +808,140 @@ impl<D: Distance> Writer<D> {
813808

814809
// TODO: Should we return the list of empty descendants to double check at the very end of the indexing process if
815810
// they're still empty and should be removed?
816-
fn scan_tree_nodes_and_delete_items(
811+
fn scan_trees_and_delete_items(
817812
&self,
818813
wtxn: &mut RwTxn,
819814
options: &BuildOption,
815+
roots: &[u32],
820816
to_delete: &RoaringBitmap,
821817
) -> Result<()> {
822818
(options.progress)(WriterProgress { main: MainStep::RemoveItems, sub: None });
823-
let mut iter = self
824-
.database
825-
.remap_key_type::<PrefixCodec>()
826-
.prefix_iter_mut(wtxn, &Prefix::tree(self.index))?
827-
.remap_key_type::<KeyCodec>();
828819

829-
while let Some(entry) = iter.next() {
830-
let (key, value) = entry?;
831-
// TODO: If the item is located in a split node we must handle it as well
832-
if let Node::Descendants(Descendants { descendants }) = value {
820+
let mut tmp_nodes: TmpNodes<NodeCodec<D>> = match self.tmpdir.as_ref() {
821+
Some(path) => TmpNodes::new_in(path)?,
822+
None => TmpNodes::new()?,
823+
};
824+
825+
// TODO: Parallelize
826+
for root in roots.iter().copied() {
827+
self.delete_items_in_file(options, wtxn, root, &mut tmp_nodes, to_delete)?;
828+
}
829+
830+
let tmp_nodes = tmp_nodes.into_bytes_reader()?;
831+
for item_id in tmp_nodes.to_delete() {
832+
let key = Key::tree(self.index, item_id);
833+
self.database.remap_data_type::<Bytes>().delete(wtxn, &key)?;
834+
}
835+
for (item_id, item_bytes) in tmp_nodes.to_insert() {
836+
let key = Key::tree(self.index, item_id);
837+
self.database.remap_data_type::<Bytes>().put(wtxn, &key, item_bytes)?;
838+
}
839+
Ok(())
840+
}
841+
842+
/// Remove items in O(n). We must explore the whole list of items.
843+
/// That could be reduced to O(log(n)) if we had a `RoTxn` of the previous state of the database.
844+
fn delete_items_in_file(
845+
&self,
846+
opt: &BuildOption,
847+
rtxn: &RoTxn,
848+
current_node: ItemId,
849+
tmp_nodes: &mut TmpNodes<NodeCodec<D>>,
850+
to_delete: &RoaringBitmap,
851+
) -> Result<(ItemId, RoaringBitmap)> {
852+
if (opt.cancel)() {
853+
return Err(Error::BuildCancelled);
854+
}
855+
match self.database.get(rtxn, &Key::tree(self.index, current_node))?.unwrap() {
856+
Node::Leaf(_) => unreachable!(),
857+
Node::Descendants(Descendants { descendants }) => {
833858
let len = descendants.len();
834-
let new_descendants = descendants.into_owned() - to_delete;
859+
let mut new_descendants = descendants.into_owned();
860+
new_descendants -= to_delete;
861+
835862
if len != new_descendants.len() {
836-
// Safety: The key is copied and the roaring bitmap lives in RAM.
837-
unsafe {
838-
iter.put_current(
839-
&key,
840-
&Node::Descendants(Descendants {
841-
descendants: Cow::Owned(new_descendants),
863+
// update the descendants
864+
tmp_nodes.put(
865+
current_node,
866+
&Node::Descendants(Descendants {
867+
descendants: Cow::Borrowed(&new_descendants),
868+
}),
869+
)?;
870+
}
871+
Ok((current_node, new_descendants))
872+
}
873+
Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => {
874+
let (new_left, left_items) = match left.mode {
875+
NodeMode::Tree => {
876+
let (id, items) =
877+
self.delete_items_in_file(opt, rtxn, left.item, tmp_nodes, to_delete)?;
878+
(NodeId::tree(id), items)
879+
}
880+
NodeMode::Item => {
881+
if to_delete.contains(left.item) {
882+
(left, RoaringBitmap::new())
883+
} else {
884+
(left, RoaringBitmap::from_sorted_iter(Some(left.item)).unwrap())
885+
}
886+
}
887+
NodeMode::Metadata | NodeMode::Updated => unreachable!(),
888+
};
889+
let (new_right, right_items) = match right.mode {
890+
NodeMode::Tree => {
891+
let (id, items) =
892+
self.delete_items_in_file(opt, rtxn, right.item, tmp_nodes, to_delete)?;
893+
(NodeId::tree(id), items)
894+
}
895+
NodeMode::Item => {
896+
if to_delete.contains(right.item) {
897+
(right, RoaringBitmap::new())
898+
} else {
899+
(right, RoaringBitmap::from_sorted_iter(Some(right.item)).unwrap())
900+
}
901+
}
902+
NodeMode::Metadata | NodeMode::Updated => unreachable!(),
903+
};
904+
905+
let total_items = left_items | right_items;
906+
907+
if self.fit_in_descendant(opt, total_items.len()) {
908+
// Since we're shrinking we KNOW that new_left and new_right are descendants
909+
// thus we can delete them directly knowing there is no sub-tree to look at.
910+
if new_left.mode == NodeMode::Tree {
911+
tmp_nodes.remove(new_left.item);
912+
}
913+
if new_right.mode == NodeMode::Tree {
914+
tmp_nodes.remove(new_right.item);
915+
}
916+
917+
tmp_nodes.put(
918+
current_node,
919+
&Node::Descendants(Descendants {
920+
descendants: Cow::Owned(total_items.clone()),
921+
}),
922+
)?;
923+
924+
// we should merge both branch and update ourselves to be a single descendant node
925+
Ok((current_node, total_items))
926+
} else {
927+
// if either the left or the right changed we must update ourselves inplace
928+
if new_left != left || new_right != right {
929+
tmp_nodes.put(
930+
current_node,
931+
&Node::SplitPlaneNormal(SplitPlaneNormal {
932+
normal,
933+
left: new_left,
934+
right: new_right,
842935
}),
843936
)?;
844937
}
938+
939+
// TODO: Should we update the normals if something changed?
940+
941+
Ok((current_node, total_items))
845942
}
846943
}
847944
}
848-
849-
Ok(())
850945
}
851946

852947
/// Insert items in the specified trees without creating new tree.

0 commit comments

Comments
 (0)