Skip to content

Commit a06f576

Browse files
authored
Merge branch 'main' into vec_graph-copy
2 parents 2501fb5 + 28203fe commit a06f576

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

112 files changed

+5782
-3003
lines changed

.github/workflows/pipeline.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
name: LLP Pipeline
2+
3+
on:
4+
push:
5+
branches: ["main"]
6+
pull_request:
7+
branches: ["main"]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
12+
jobs:
13+
build:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/checkout@v3
17+
- name: Adding baremetal arm target
18+
run: rustup target add aarch64-unknown-none
19+
- name: Copy Cnr-2000 to a new directory
20+
run: mkdir llp && cp tests/data/cnr-2000.graph ./llp && cp tests/data/cnr-2000.properties ./llp
21+
- name: Run test pipeline on cnr-2000
22+
run: ./pipeline.sh ./llp/cnr-2000

.mailmap

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Matteo Dell'Acqua <[email protected]>
2+
Tommaso Fontana <[email protected]>
3+
4+
5+
Valentin Lorentz <[email protected]>

Cargo.toml

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ categories = ["compression", "data-structures"]
1111

1212

1313
[features]
14-
default = []
14+
default = ["cli"]
15+
cli = ["dep:clap", "dep:clap_complete", "dep:env_logger"] # Enable the compilation of the webgraph binary
1516
slow_tests = [] # Test feature that enables long running tests
1617
be_bins = [] # Enable read / write of only BE bvgraphs (to reduce code size)
1718
le_bins = [] # Enable read / write of only LE bvgraphs (to reduce code size)
@@ -21,24 +22,16 @@ fuzz = ["dep:arbitrary", "dep:zip", "dsi-bitstream/fuzz"] # Expose the fuzzing h
2122
anyhow = { version = "1.0.79", features=["backtrace"]}
2223
java-properties = "2.0.0"
2324
mmap-rs = "0.6.1"
24-
bitvec = { version = "1.0.1", features = ["atomic"] }
2525
num_cpus = "1.16.0"
2626
epserde = "0.4.0"
27-
#sux = "0.2.0"
28-
sux = {git = "https://github.com/vigna/sux-rs"}
29-
dsi-bitstream = "0.3.0"
30-
#dsi-bitstream = {git = "https://github.com/vigna/dsi-bitstream-rs"}
31-
clap = { version = "4.4.18", features = ["derive", "string"] }
32-
clap_complete = "4.4.10"
33-
dsi-progress-logger = "0.2.2"
27+
sux = "0.3.1"
28+
dsi-bitstream = "0.4.0"
29+
dsi-progress-logger = "0.2.4"
3430
log = "0.4.20"
35-
stderrlog = "0.5.4"
3631
rand = { version = "0.8.5", features = ["small_rng"] }
3732
rayon = "1.8.1"
3833
tempfile = "3.5.0"
3934
bytemuck = "1.14.0"
40-
arbitrary = { version = "1.3.2", features = ["derive"], optional = true }
41-
zip = {version="0.6.6", optional=true}
4235
libc = "0.2.147"
4336
itertools = "0.12.0"
4437
lender = "0.2.9"
@@ -49,17 +42,27 @@ dary_heap = "0.3.6"
4942
rdst = { version ="0.20.12", features = ["multi-threaded"] }
5043
sealed = "0.5.0"
5144

52-
[dev-dependencies]
53-
rand = { version = "0.8.5", features = ["small_rng"] }
54-
env_logger = "0.11.0"
45+
# Cli
46+
clap = { version = "4.4.18", features = ["derive", "string"], optional = true }
47+
clap_complete = {version = "4.4.10", optional = true}
48+
env_logger = {version = "0.11.0", optional = true}
49+
50+
# Fuzzing deps
51+
arbitrary = { version = "1.3.2", features = ["derive"], optional = true }
52+
zip = {version="0.6.6", optional=true}
53+
predicates = "3.1.0"
54+
sysinfo = "0.30.8"
55+
56+
[build-dependencies]
57+
built = { version = "0.7", features= ["chrono", "git2"] }
5558

5659
[profile.release] # Used for the examples
5760
opt-level = 3 # like --release
58-
lto = "fat" # Full LTO
61+
#lto = "fat" # Full LTO
5962
overflow-checks = false # Disable integer overflow checks.
6063
debug = true # Include debug info.
6164
debug-assertions = false # Enables debug assertions.
62-
codegen-units=1 # slower compile times, but maybe better perf
65+
#codegen-units=1 # slower compile times, but maybe better perf
6366

6467
[lib]
6568
name = "webgraph"

README.md

Lines changed: 112 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,53 +4,138 @@
44
[![dependents](https://img.shields.io/librariesio/dependents/cargo/webgraph)](https://crates.io/crates/webgraph/reverse_dependencies)
55
![GitHub CI](https://github.com/vigna/webgraph-rs/actions/workflows/rust.yml/badge.svg)
66
![license](https://img.shields.io/crates/l/webgraph)
7-
[![](https://tokei.rs/b1/github/vigna/webgraph-rs)](https://github.com/vigna/webgraph-rs).
7+
[![](https://tokei.rs/b1/github/vigna/webgraph-rs)](https://github.com/vigna/webgraph-rs)
8+
[![Latest version](https://img.shields.io/crates/v/webgraph.svg)](https://crates.io/crates/webgraph)
9+
[![Documentation](https://docs.rs/webgraph/badge.svg)](https://docs.rs/webgraph)
810

9-
A Rust implementation of the [WebGraph framework](https://webgraph.di.unimi.it/)
10-
for graph compression.
11+
A Rust implementation of the [WebGraph framework] for graph compression.
1112

13+
WebGraph is a framework for graph compression aimed at studying web graphs, but
14+
currently being applied to several other types of graphs. It
15+
provides simple ways to manage very large graphs, exploiting modern compression
16+
techniques. More precisely, it is currently made of:
17+
18+
- A set of simple codes, called ζ _codes_, which are particularly suitable for
19+
storing web graphs (or, in general, integers with a power-law distribution in a
20+
certain exponent range).
21+
22+
- Algorithms for compressing web graphs that exploit gap compression and
23+
differential compression (à la
24+
[LINK](http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-175.html)),
25+
intervalisation, and ζ codes to provide a high compression ratio (see [our
26+
datasets](http://law.di.unimi.it/datasets.php)). The algorithms are controlled
27+
by several parameters, which provide different tradeoffs between access speed
28+
and compression ratio.
29+
30+
- Algorithms for accessing a compressed graph without actually decompressing
31+
it, using lazy techniques that delay the decompression until it is actually
32+
necessary.
33+
34+
- Algorithms for analysing very large graphs, such as
35+
`it.unimi.dsi.webgraph.algo.HyperBall`, which has been used to show that
36+
Facebook has just [four degrees of
37+
separation](http://vigna.di.unimi.it/papers.php#BBRFDS).
38+
39+
- A [Java implementation](http://webgraph.di.unimi.it/) of the algorithms above,
40+
now in maintenance mode.
41+
42+
- This crate, providing a complete, documented implementation of the algorithms
43+
above in Rust. It is free software distributed under either the [GNU Lesser
44+
General Public License
45+
2.1+](https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) or the [Apache
46+
Software License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
47+
48+
- [Data sets](http://law.di.unimi.it/datasets.php) for large graphs (e.g.,
49+
billions of links).
50+
51+
## Citation
52+
53+
You are welcome to use and improve WebGraph for your research work! If you find
54+
our software useful for research, please cite the following papers in your own:
55+
56+
- [“WebGraph: The Next Generation (Is in
57+
Rust)”](http://vigna.di.unimi.it/papers.php#FVZWNG), by Tommaso Fontana,
58+
Sebastiano Vigna, and Stefano Zacchiroli, in WWW '24: Companion Proceedings
59+
of the ACM on Web Conference 2024, pages 686-689. [DOI
60+
10.1145/3589335.3651581](https://dl.acm.org/doi/10.1145/3589335.3651581)
61+
62+
- [“The WebGraph Framework I: Compression
63+
Techniques”](http://vigna.di.unimi.it/papers.php#BoVWFI), by Paolo Boldi and
64+
Sebastiano Vigna, in _Proc. of the 13th international conference on World
65+
Wide Web_, WWW 2004, pages 595-602, ACM. [DOI
66+
10.1145/988672.988752](https://dl.acm.org/doi/10.1145/988672.988752)
67+
1268
## Quick Setup
1369

1470
Assuming you have built all binaries, you will first need a graph in BV format,
15-
for example downloading it from the [LAW website](http://law.di.unimi.it/). You
16-
will need the `.graph` file (the bitstream containing a compressed representation
17-
of the graph), the `.properties` file (metadata) and the `.offsets` file (a
18-
bitstream containing pointers into the graph bitstream). As a first step, if
19-
you need random access to the successors of a node, you need
20-
to build an [Elias--Fano](sux::dict::EliasFano) representation of the
21-
offsets with the command `build_ef` (this part can be skipped if you just need
22-
sequential access), which will generate an `.ef` file. Then, to load a graph
23-
with basename `BASENAME` you need to call
71+
for example downloading it from the [LAW website]. For a graph with basename
72+
BASENAME, you will need the `BASENAME.graph` file (the bitstream containing a
73+
compressed representation of the graph), the `BASENAME.properties` file
74+
(metadata) and the `BASENAME.offsets` file (a bitstream containing pointers into
75+
the graph bitstream).
76+
77+
As a first step, if you need random access to the successors of a node, you need
78+
to build an [Elias–Fano] representation of the offsets (this part can be skipped
79+
if you just need sequential access). There is a CLI command `webgraph` with many
80+
subcommands, among which `build`, and `webgraph build ef BASENAME` will build
81+
the representation for you, serializing it with [ε-serde] in a file
82+
named `BASENAME.ef`.
83+
84+
Then, to load the graph you need to call
2485

2586
```[ignore]
2687
let graph = BVGraph::with_basename("BASENAME").load()?;
2788
```
2889

29-
The [`with_basename`] method returns a [`LoadConfig`] instance that can be further
30-
customized, selecting endianness, type of memory access, etc. By default you
31-
will get big endianness, memory mapping for both the graph and the offsets, and
32-
dynamic code dispatch.
90+
The [`with_basename`] method returns a [`LoadConfig`] instance that can be
91+
further customized, selecting endianness, type of memory access, and so on. By
92+
default you will get big endianness, memory mapping for both the graph and the
93+
offsets, and dynamic code dispatch.
94+
95+
Once you load the graph, you can [retrieve the successors of a node] or
96+
[iterate on the whole graph]. In particular, using the handy [`for_`] macro,
97+
you can write an iteration on the graph as
3398

34-
Once you loaded the [graph](), you can [retrieve the successors of a node]()
35-
or [iterate on the whole graph]().
99+
```[ignore]
100+
for_!((src, succ) in graph {
101+
for dst in succ {
102+
[do something with the arc src -> dst]
103+
}
104+
});
105+
```
36106

37107
## More Options
38108

39-
- By starting from the [`BVGraphSeq`] class you can obtain an instance that
40-
does not need the `.ef` file, but provides only [iteration]().
109+
- By starting from the [`BVGraphSeq`] class you can obtain an instance that does
110+
not need the `BASENAME.ef` file, but provides only [iteration].
41111

42-
- Graphs can be labeled by [zipping]() then together with a [labeling](). In fact,
112+
- Graphs can be labeled by [zipping] them together with a [labeling]. In fact,
43113
graphs are just labelings with `usize` labels.
44114

45115
## Operating on Graphs
46116

47-
There are many operations available on graphs, such as [`transpose`] or [`simplify`].
117+
There are many operations available on graphs, such as [`transpose`] and
118+
[`simplify`]. You can [permute] a graph.
48119

49120
## Acknowledgments
50121

51-
This software has been partially supported by project SERICS (PE00000014) under the NRRP MUR program funded by the EU - NGEU,
52-
and by project ANR COREGRAPHIE, grant ANR-20-CE23-0002 of the French Agence Nationale de la Recherche.
122+
This software has been partially supported by project SERICS (PE00000014) under
123+
the NRRP MUR program funded by the EU - NGEU, and by project ANR COREGRAPHIE,
124+
grant ANR-20-CE23-0002 of the French Agence Nationale de la Recherche.
53125

54-
[`LoadConfig`]:
55-
[`with_basename`]:
56-
[`transpose`]
126+
[`transpose`]: <https://docs.rs/webgraph/latest/webgraph/transform/transpose/index.html>
127+
[`simplify`]: <https://docs.rs/webgraph/latest/webgraph/transform/simplify/index.html>
128+
[`with_basename`]: <https://docs.rs/webgraph/latest/webgraph/struct.BVGraph.html#method.with_basename>
129+
[`BVGraphSeq`]: <https://docs.rs/webgraph/latest/webgraph/struct.BVGraphSeq.html>
130+
[`LoadConfig`]: <https://docs.rs/webgraph/latest/webgraph/struct.LoadConfig.html>
131+
[iterate on the whole graph]: <https://docs.rs/webgraph/latest/webgraph/trait/SequentialLabeling.html#method.iter>
132+
[zipping]: <https://docs.rs/webgraph/latest/webgraph/struct/Zip.html>
133+
[labeling]: <https://docs.rs/webgraph/latest/webgraph/trait/SequentialLabeling.html>
134+
[iteration]: <https://docs.rs/webgraph/latest/webgraph/trait/SequentialLabeling.html#method.iter>
135+
[retrieve the successors of a node]: <https://docs.rs/webgraph/latest/webgraph/trait/RandomAccessGraph.html#method.successors>
136+
[LAW website]: <http://law.di.unimi.it/>
137+
[Elias–Fano]: <sux::dict::EliasFano>
138+
[WebGraph framework]: <https://webgraph.di.unimi.it/>
139+
[permute]: <https://docs.rs/webgraph/latest/webgraph/transform/permute/index.html>
140+
[ε-serde]: <https://crates.io/crates/epserde/>
141+
[`for_`]: <https://docs.rs/lender/latest/lender/macro.for_.html>

build.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
fn main() {
2+
built::write_built_file().expect("Failed to acquire build-time information")
3+
}

examples/bench_sort_pairs.rs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@ use clap::Parser;
1212
use dsi_bitstream::traits::BitRead;
1313
use dsi_bitstream::traits::BitWrite;
1414
use dsi_bitstream::traits::Endianness;
15-
use dsi_progress_logger::*;
15+
use dsi_progress_logger::prelude::{ProgressLog, ProgressLogger};
1616
use rand::rngs::SmallRng;
1717
use rand::RngCore;
1818
use rand::SeedableRng;
1919
use tempfile::Builder;
2020
use webgraph::prelude::*;
21+
2122
#[derive(Parser, Debug)]
2223
#[command(about = "Tests the merge speed of SortPairs", long_about = None)]
2324
struct Args {
@@ -50,14 +51,13 @@ impl<E: Endianness, W: BitRead<E>> BitDeserializer<E, W> for Mock {
5051
}
5152
}
5253

54+
#[allow(dead_code)] // I have no idea why this happens https://github.com/rust-lang/rust/issues/12327
5355
pub fn main() -> Result<()> {
5456
let args = Args::parse();
5557

56-
stderrlog::new()
57-
.verbosity(2)
58-
.timestamp(stderrlog::Timestamp::Second)
59-
.init()
60-
.unwrap();
58+
env_logger::builder()
59+
.filter_level(log::LevelFilter::Info)
60+
.try_init()?;
6161

6262
let dir = Builder::new().prefix("bench_sort_pairs").tempdir()?;
6363

@@ -83,7 +83,6 @@ pub fn main() -> Result<()> {
8383
pl.light_update();
8484
}
8585
pl.done();
86-
return Ok(());
8786
} else {
8887
let mut sp = SortPairs::new(args.batch, dir.path())?;
8988

examples/bench_swh_labels.rs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77

88
use anyhow::Result;
99
use clap::Parser;
10-
use dsi_progress_logger::*;
10+
use dsi_progress_logger::prelude::*;
1111
use lender::*;
1212
use std::hint::black_box;
1313
use std::path::PathBuf;
14-
use webgraph::labels::swh_labels::SwhLabels;
14+
use webgraph::prelude::swh_labels::SwhLabels;
1515
use webgraph::prelude::*;
1616

1717
#[derive(Parser, Debug)]
@@ -24,11 +24,9 @@ struct Args {
2424
pub fn main() -> Result<()> {
2525
let args = Args::parse();
2626

27-
stderrlog::new()
28-
.verbosity(2)
29-
.timestamp(stderrlog::Timestamp::Second)
30-
.init()
31-
.unwrap();
27+
env_logger::builder()
28+
.filter_level(log::LevelFilter::Info)
29+
.try_init()?;
3230

3331
let labels = SwhLabels::load_from_file(7, &args.basename)?;
3432

examples/bench_unit_graph.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ mod bench_sort_pairs;
1010
use anyhow::Result;
1111
use clap::Parser;
1212
use dsi_bitstream::prelude::*;
13-
use dsi_progress_logger::*;
13+
use dsi_progress_logger::prelude::*;
1414
use lender::*;
1515
use std::hint::black_box;
1616
use std::path::PathBuf;
@@ -96,10 +96,9 @@ where
9696
pub fn main() -> Result<()> {
9797
let args = Args::parse();
9898

99-
stderrlog::new()
100-
.verbosity(2)
101-
.timestamp(stderrlog::Timestamp::Second)
102-
.init()?;
99+
env_logger::builder()
100+
.filter_level(log::LevelFilter::Info)
101+
.try_init()?;
103102

104103
match get_endianness(&args.basename)?.as_str() {
105104
#[cfg(any(

0 commit comments

Comments
 (0)