@@ -12,7 +12,7 @@ use dsi_progress_logger::prelude::*;
 use lender::prelude::*;
 use std::fs::File;
 use std::io::{BufReader, BufWriter};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
 /// A queue that pulls jobs with ids in a contiguous initial segment of the
 /// natural numbers from an iterator out of order and implement an iterator in
@@ -42,15 +42,17 @@ impl<I: Iterator> TaskQueue<I> {
 
 impl<I: Iterator> Iterator for TaskQueue<I>
 where
-    I::Item: JobId + Copy,
+    I::Item: JobId,
 {
     type Item = I::Item;
 
     fn next(&mut self) -> Option<Self::Item> {
         loop {
-            if let Some(Some(item)) = self.jobs.get(self.next_id) {
-                self.next_id += 1;
-                return Some(*item);
+            if let Some(item) = self.jobs.get_mut(self.next_id) {
+                if item.is_some() {
+                    self.next_id += 1;
+                    return item.take();
+                }
             }
             if let Some(item) = self.iter.next() {
                 let id = item.id();
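The two hunks above contain the whole reordering trick: worker threads send finished compression jobs over a channel in completion order, and `TaskQueue` parks each job in a `Vec<Option<_>>` slot indexed by its id, handing jobs out strictly in id order. The `Copy` bound is dropped because `Job` now owns `PathBuf`s, so items are moved out with `take()` instead of copied. Below is a minimal, self-contained sketch of that idea, assuming only the `id()` method of the `JobId` trait visible in the diff; the `Chunk` type, the slot-growing strategy, and the `main` driver are invented for illustration and are not the crate's code.

// Standalone sketch (illustrative only): yield out-of-order jobs in id order.

trait JobId {
    fn id(&self) -> usize;
}

// Hypothetical job type standing in for the crate's `Job` struct.
struct Chunk {
    job_id: usize,
}

impl JobId for Chunk {
    fn id(&self) -> usize {
        self.job_id
    }
}

struct TaskQueue<I: Iterator> {
    iter: I,
    jobs: Vec<Option<I::Item>>,
    next_id: usize,
}

impl<I: Iterator> Iterator for TaskQueue<I>
where
    I::Item: JobId,
{
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // If the job with the next expected id is already buffered, move it out.
            if let Some(slot) = self.jobs.get_mut(self.next_id) {
                if slot.is_some() {
                    self.next_id += 1;
                    return slot.take();
                }
            }
            // Otherwise pull another (possibly out-of-order) job and park it in the
            // slot matching its id. The growth strategy is an assumption: the hunk
            // above stops right after `let id = item.id();`.
            match self.iter.next() {
                Some(item) => {
                    let id = item.id();
                    if id >= self.jobs.len() {
                        self.jobs.resize_with(id + 1, || None);
                    }
                    self.jobs[id] = Some(item);
                }
                None => return None,
            }
        }
    }
}

fn main() {
    // Jobs finish as 2, 0, 1 but are consumed as 0, 1, 2.
    let out_of_order = [2, 0, 1].map(|job_id| Chunk { job_id });
    let queue = TaskQueue {
        iter: out_of_order.into_iter(),
        jobs: vec![],
        next_id: 0,
    };
    for job in queue {
        println!("job {}", job.job_id);
    }
}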
@@ -66,12 +68,15 @@ where
 }
 
 /// A compression job.
-#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy)]
+#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Clone)]
 struct Job {
     job_id: usize,
     first_node: usize,
     last_node: usize,
+    chunk_graph_path: PathBuf,
     written_bits: u64,
+    chunk_offsets_path: PathBuf,
+    offsets_written_bits: u64,
     num_arcs: u64,
 }
 
@@ -284,6 +289,7 @@ impl BVComp<()> {
         let basename = basename.as_ref();
 
         let graph_path = basename.with_extension(GRAPH_EXTENSION);
+        let offsets_path = basename.with_extension(OFFSETS_EXTENSION);
 
         let (tx, rx) = std::sync::mpsc::channel();
 
@@ -293,51 +299,70 @@ impl BVComp<()> {
             let cp_flags = &compression_flags;
 
             for (thread_id, mut thread_lender) in iter.enumerate() {
-                let file_path = thread_path(thread_id);
+                let tmp_path = thread_path(thread_id);
+                let chunk_graph_path = tmp_path.with_extension(GRAPH_EXTENSION);
+                let chunk_offsets_path = tmp_path.with_extension(OFFSETS_EXTENSION);
                 let tx = tx.clone();
                 // Spawn the thread
                 s.spawn(move |_| {
                     log::info!("Thread {} started", thread_id);
                     let first_node;
-
-                    let (mut bvcomp, mut written_bits) =
-                        if let Some((node_id, successors)) = thread_lender.next() {
+                    let mut bvcomp;
+                    let mut offsets_writer;
+                    let mut written_bits;
+                    let mut offsets_written_bits;
+
+                    match thread_lender.next() {
+                        None => return,
+                        Some((node_id, successors)) => {
                             first_node = node_id;
+
+                            offsets_writer = <BufBitWriter<BigEndian, _>>::new(<WordAdapter<usize, _>>::new(
+                                BufWriter::new(File::create(&chunk_offsets_path).unwrap()),
+                            ));
+
                             let writer = <BufBitWriter<E, _>>::new(<WordAdapter<usize, _>>::new(
-                                BufWriter::new(File::create(&file_path).unwrap()),
+                                BufWriter::new(File::create(&chunk_graph_path).unwrap()),
                             ));
                             let codes_encoder = <DynCodesEncoder<E, _>>::new(writer, cp_flags);
 
-                            let mut bvcomp = BVComp::new(
+                            bvcomp = BVComp::new(
                                 codes_encoder,
                                 cp_flags.compression_window,
                                 cp_flags.max_ref_count,
                                 cp_flags.min_interval_length,
                                 node_id,
                             );
-                            let written_bits = bvcomp.push(successors).unwrap();
-                            (bvcomp, written_bits)
-                        } else {
-                            return;
-                        };
+                            written_bits = bvcomp.push(successors).unwrap();
+                            offsets_written_bits = offsets_writer.write_gamma(written_bits).unwrap() as u64;
+                        }
+                    };
 
                     let mut last_node = first_node;
-                    written_bits += bvcomp
-                        .extend(thread_lender.inspect(|(x, _)| last_node = *x))
-                        .unwrap();
+                    let iter_nodes = thread_lender.inspect(|(x, _)| last_node = *x);
+                    for_!((_, succ) in iter_nodes {
+                        let node_bits = bvcomp.push(succ.into_iter()).unwrap();
+                        written_bits += node_bits;
+                        offsets_written_bits += offsets_writer.write_gamma(node_bits).unwrap() as u64;
+                    });
+
                     let num_arcs = bvcomp.arcs;
                     bvcomp.flush().unwrap();
-                    // TODO written_bits += bvcomp.flush().unwrap();
+
                     log::info!(
-                        "Finished Compression thread {} and wrote {} bits",
+                        "Finished Compression thread {} and wrote {} bits for the graph and {} bits for the offsets",
                         thread_id,
-                        written_bits
+                        written_bits,
+                        offsets_written_bits,
                     );
                     tx.send(Job {
                         job_id: thread_id,
                         first_node,
                         last_node,
+                        chunk_graph_path,
                         written_bits,
+                        chunk_offsets_path,
+                        offsets_written_bits,
                         num_arcs,
                     })
                     .unwrap()
@@ -346,15 +371,19 @@ impl BVComp<()> {
 
             drop(tx);
 
-            // setup the final bitstream from the end, because the first thread
-            // already wrote the first chunk
             let file = File::create(&graph_path)
                 .with_context(|| format!("Could not create graph {}", graph_path.display()))?;
-
-            let mut result_writer =
+            let mut graph_writer =
                 <BufBitWriter<E, _>>::new(<WordAdapter<usize, _>>::new(BufWriter::new(file)));
 
+            let file = File::create(&offsets_path)
+                .with_context(|| format!("Could not create offsets {}", offsets_path.display()))?;
+            let mut offsets_writer =
+                <BufBitWriter<BigEndian, _>>::new(<WordAdapter<usize, _>>::new(BufWriter::new(file)));
+            offsets_writer.write_gamma(0)?;
+
             let mut total_written_bits: u64 = 0;
+            let mut total_offsets_written_bits: u64 = 0;
             let mut total_arcs: u64 = 0;
 
             let mut next_node = 0;
@@ -364,7 +393,10 @@ impl BVComp<()> {
                 job_id,
                 first_node,
                 last_node,
+                chunk_graph_path,
                 written_bits,
+                chunk_offsets_path,
+                offsets_written_bits,
                 num_arcs,
             } in TaskQueue::new(rx.iter())
             {
@@ -378,36 +410,60 @@ impl BVComp<()> {
 
                 next_node = last_node + 1;
                 total_arcs += num_arcs;
-                // compute the path of the bitstream created by this thread
-                let file_path = thread_path(job_id);
                 log::info!(
                     "Copying {} [{}..{}) bits from {} to {}",
                     written_bits,
                     total_written_bits,
                     total_written_bits + written_bits,
-                    file_path.display(),
-                    basename.display()
+                    chunk_graph_path.display(),
+                    graph_path.display()
                 );
                 total_written_bits += written_bits;
 
                 let mut reader =
                     <BufBitReader<E, _>>::new(<WordAdapter<u32, _>>::new(BufReader::new(
-                        File::open(&file_path)
-                            .with_context(|| format!("Could not open {}", file_path.display()))?,
+                        File::open(&chunk_graph_path)
+                            .with_context(|| format!("Could not open {}", chunk_graph_path.display()))?,
                     )));
-                result_writer
+                graph_writer
                     .copy_from(&mut reader, written_bits)
                     .with_context(|| {
                         format!(
                             "Could not copy from {} to {}",
-                            file_path.display(),
+                            chunk_graph_path.display(),
                             graph_path.display()
                         )
                     })?;
+
+                log::info!(
+                    "Copying offsets {} [{}..{}) bits from {} to {}",
+                    offsets_written_bits,
+                    total_offsets_written_bits,
+                    total_offsets_written_bits + offsets_written_bits,
+                    chunk_offsets_path.display(),
+                    offsets_path.display()
+                );
+                total_offsets_written_bits += offsets_written_bits;
+
+                let mut reader =
+                    <BufBitReader<BigEndian, _>>::new(<WordAdapter<u32, _>>::new(BufReader::new(
+                        File::open(&chunk_offsets_path)
+                            .with_context(|| format!("Could not open {}", chunk_offsets_path.display()))?,
+                    )));
+                offsets_writer
+                    .copy_from(&mut reader, offsets_written_bits)
+                    .with_context(|| {
+                        format!(
+                            "Could not copy from {} to {}",
+                            chunk_offsets_path.display(),
+                            offsets_path.display()
+                        )
+                    })?;
             }
 
-            log::info!("Flushing the merged Compression bitstream");
-            result_writer.flush()?;
+            log::info!("Flushing the merged bitstreams");
+            graph_writer.flush()?;
+            offsets_writer.flush()?;
 
             log::info!("Writing the .properties file");
             let properties = compression_flags
@@ -427,6 +483,11 @@ impl BVComp<()> {
                 total_written_bits,
                 total_written_bits as f64 / total_arcs as f64
             );
+            log::info!(
+                "Created offsets file with {} bits for {:.4} bits/node",
+                total_offsets_written_bits,
+                total_offsets_written_bits as f64 / num_nodes as f64
+            );
 
             // cleanup the temp files
             std::fs::remove_dir_all(tmp_dir).with_context(|| {