Skip to content

LLP: Failure to serialize labels causes a segfault #95

Open
@progval

Description

@progval

For some reason, when we early-return from this line (e.g. because we ran out of disk space in the temp dir):

.context("Could not serialize labels")?;

then dropping the LabelStore segfaults.

For example, with this patch:

diff --git a/src/algo/llp/mod.rs b/src/algo/llp/mod.rs
index 2b5f514..6f14043 100644
--- a/src/algo/llp/mod.rs
+++ b/src/algo/llp/mod.rs
@@ -43,6 +43,7 @@ use rand::SeedableRng;
 use rayon::prelude::*;
 use std::collections::HashMap;
 use std::env::temp_dir;
+use std::mem::ManuallyDrop;
 use std::path::PathBuf;
 use std::sync::atomic::Ordering;
 use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize};
@@ -152,6 +153,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
             .iter()
             .for_each(|x| x.store(true, Ordering::Relaxed));
 
+        /*
         for update in 0.. {
             update_pl.start(format!("Starting update {}...", update));
 
@@ -270,6 +272,7 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
                 break;
             }
         }
+        */
 
         iter_pl.done();
 
@@ -295,11 +298,16 @@ pub fn layered_label_propagation<R: RandomAccessGraph + Sync>(
         costs.push(cost);
 
         // storing the perms
+        let path = labels_path(gamma_index);
+        info!("Creating {}", path.display());
         let mut file =
-            std::fs::File::create(labels_path(gamma_index)).context("Could not write labels")?;
-        labels
-            .serialize(&mut file)
-            .context("Could not serialize labels")?;
+            std::fs::File::create(&path).context("Could not write labels")?;
+        info!("Writing {}", path.display());
+        let res = labels
+            .serialize(&mut file);
+        info!("Res {:?}", res);
+        res.context("Could not serialize labels")?;
+        info!("Done writing {}", path.display());
 
         gamma_pl.update_and_display();
     }
diff --git a/src/cli/llp.rs b/src/cli/llp.rs
index 99c0151..d77a809 100644
--- a/src/cli/llp.rs
+++ b/src/cli/llp.rs
@@ -85,7 +85,7 @@ pub fn cli(command: Command) -> Command {
 pub fn main(submatches: &ArgMatches) -> Result<()> {
     let args = CliArgs::from_arg_matches(submatches)?;
 
-    match get_endianness(&args.basename)?.as_str() {
+    let main_res = match get_endianness(&args.basename)?.as_str() {
         #[cfg(any(
             feature = "be_bins",
             not(any(feature = "be_bins", feature = "le_bins"))
@@ -97,7 +97,10 @@ pub fn main(submatches: &ArgMatches) -> Result<()> {
         ))]
         LE::NAME => llp_impl::<LE>(args),
         e => panic!("Unknown endianness: {}", e),
-    }
+    };
+
+    log::info!("main res {:?}", main_res);
+    main_res
 }
 
 fn llp_impl<E: Endianness + 'static + Send + Sync>(args: CliArgs) -> Result<()>
@@ -157,7 +160,7 @@ where
     }
 
     // compute the LLP
-    let labels = llp::layered_label_propagation(
+    let res2 = llp::layered_label_propagation(
         &graph,
         &*deg_cumul,
         gammas,
@@ -166,8 +169,10 @@ where
         args.granularity,
         args.seed,
         predicate,
-    )
-    .context("Could not compute the LLP")?;
+    );
+    log::info!("res2 {:?}", res2);
+    let labels = res2.context("Could not compute the LLP")?;
+    log::info!("labels ok");
 
     let mut llp_perm = (0..graph.num_nodes()).collect::<Vec<_>>();
     llp_perm.par_sort_by(|&a, &b| labels[a].cmp(&labels[b]));

llp prints:

[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Log-gap cost: 68596432338
[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Creating /tmp/labels_0.bin
[2024-03-23T11:33:45Z INFO  webgraph::algo::llp] Writing /tmp/labels_0.bin
[2024-03-23T11:33:48Z INFO  webgraph::algo::llp] Res Err(WriteError)
Segmentation fault

and here is the traceback:

Thread 1 "webgraph" received signal SIGSEGV, Segmentation fault.                                                                        
__GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102                                                                                  
3102    malloc.c: No such file or directory.                                                                                            
(gdb) bt                                                                                                                                
#0  __GI___libc_free (mem=0x7ffb32dfd010) at malloc.c:3102                                                                              
#1  0x00005555556459fa in alloc::alloc::dealloc (ptr=<optimized out>, layout=...) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:117                                                                                                              
#2  alloc::alloc::{impl#1}::deallocate (ptr=..., layout=..., self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/alloc.rs:254                                                                                                             
#3  alloc::boxed::{impl#8}::drop<[core::sync::atomic::AtomicUsize], alloc::alloc::Global> (self=<optimized out>) at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/alloc/src/boxed.rs:1243                                                                              
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
#5  core::ptr::drop_in_place<webgraph::algo::llp::label_store::LabelStore> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                                                 
#6  webgraph::algo::llp::layered_label_propagation<webgraph::graphs::bvgraph::random_access::BVGraph<webgraph::graphs::bvgraph::codecs::dec_dyn::DynCodesDecoderFactory<dsi_bitstream::traits::endianness::BigEndian, webgraph::graphs::bvgraph::codecs::factories::MemoryFactory
<dsi_bitstream::traits::endianness::BigEndian, webgraph::utils::mmap_helper::MmapHelper<u32, mmap_rs::mmap::Mmap>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_fixed2::SelectFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_ve
c::BitFieldVec<usize, &[usize]>>>>, sux::dict::elias_fano::EliasFano<sux::rank_sel::select_zero_fixed2::SelectZeroFixed2<sux::bits::bit_vec::CountBitVec<&[usize]>, &[u64], 8, 2>, sux::bits::bit_field_vec::BitFieldVec<usize, &[usize]>>, predicates::boxed::BoxPredicate<webgr
aph::algo::llp::preds::PredParams>> (sym_graph=<optimized out>, deg_cumul=<optimized out>, gammas=..., num_threads=..., chunk_size=..., granularity=..., seed=0, predicate=...) at src/algo/llp/mod.rs:357                                                                       
#7  0x00005555556c6b4d in webgraph::cli::llp::llp_impl<dsi_bitstream::traits::endianness::BigEndian> (args=...) at src/cli/llp.rs:163   
#8  webgraph::cli::llp::main (submatches=<optimized out>) at src/cli/llp.rs:93                                                          
#9  0x00005555555e222c in webgraph::main () at src/main.rs:70                                                                           
(gdb) f 4                                                                                                                               
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
507     /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs: No such file or directory.                         
(gdb) f                                                                                                                                 
#4  core::ptr::drop_in_place<alloc::boxed::Box<[core::sync::atomic::AtomicUsize], alloc::alloc::Global>> () at /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs:507                                                                                   
507     in /rustc/07dca489ac2d933c78d3c5158e3f43beefeb02ce/library/core/src/ptr/mod.rs                                                  

I assume this is due to the transmuted label_store.labels, but I don't see why the compiler would drop the transmuted value before the original, let alone drop it at all. Wrapping it in ManuallyDrop doesn't help.

This happens in both release and debug mode (I commented out the worker loop so that it terminates within a reasonable time in debug mode).

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions