Skip to content

Commit f21020e

Browse files
Update BC computation to address normalization edge conditions (#5105)
networkx/networkx#7949 implements an update to normalization that was originally described in #4941. We pushed an update in #4974 that addressed the specific examples that the user identified in the original cugraph issue. However, while @eriknw was exploring updates to networkx to address this, he identified a few more edge conditions that needed to be satisfied. This PR addresses those remaining edge conditions. Note that the python tests comparing results to networkx are still disabled. These can't be re-enabled until networkx/networkx#7949 is included in a networkx release. Closes #5006 Closes #5107 Authors: - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Joseph Nke (https://github.com/jnke2016) - Rick Ratzel (https://github.com/rlratzel) URL: #5105
1 parent 3068f2b commit f21020e

File tree

7 files changed

+147
-73
lines changed

7 files changed

+147
-73
lines changed

cpp/src/centrality/betweenness_centrality_impl.cuh

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -540,37 +540,57 @@ rmm::device_uvector<weight_t> betweenness_centrality(
540540
do_expensive_check);
541541
}
542542

543-
std::optional<weight_t> scale_factor{std::nullopt};
543+
std::optional<weight_t> scale_nonsource{std::nullopt};
544+
std::optional<weight_t> scale_source{std::nullopt};
545+
546+
weight_t num_vertices = static_cast<weight_t>(graph_view.number_of_vertices());
547+
if (!include_endpoints) num_vertices = num_vertices - 1;
548+
549+
if ((static_cast<edge_t>(num_sources) == num_vertices) || include_endpoints) {
550+
if (normalized) {
551+
scale_nonsource = static_cast<weight_t>(num_sources * (num_vertices - 1));
552+
} else if (graph_view.is_symmetric()) {
553+
scale_nonsource =
554+
static_cast<weight_t>(num_sources * 2) / static_cast<weight_t>(num_vertices);
555+
} else {
556+
scale_nonsource = static_cast<weight_t>(num_sources) / static_cast<weight_t>(num_vertices);
557+
}
544558

545-
if (normalized) {
546-
if (include_endpoints) {
547-
if (graph_view.number_of_vertices() >= 2) {
548-
scale_factor = static_cast<weight_t>(
549-
std::min(static_cast<vertex_t>(num_sources), graph_view.number_of_vertices()) *
550-
(graph_view.number_of_vertices() - 1));
551-
}
552-
} else if (graph_view.number_of_vertices() > 2) {
553-
scale_factor = static_cast<weight_t>(
554-
std::min(static_cast<vertex_t>(num_sources), graph_view.number_of_vertices() - 1) *
555-
(graph_view.number_of_vertices() - 2));
559+
scale_source = scale_nonsource;
560+
} else if (normalized) {
561+
scale_nonsource = static_cast<weight_t>(num_sources) * (num_vertices - 1);
562+
scale_source = static_cast<weight_t>(num_sources - 1) * (num_vertices - 1);
563+
} else {
564+
scale_nonsource = static_cast<weight_t>(num_sources) / num_vertices;
565+
scale_source = static_cast<weight_t>(num_sources - 1) / num_vertices;
566+
567+
if (graph_view.is_symmetric()) {
568+
*scale_nonsource *= 2;
569+
*scale_source *= 2;
556570
}
557-
} else if (num_sources < static_cast<size_t>(graph_view.number_of_vertices())) {
558-
if ((graph_view.number_of_vertices() > 1) && (num_sources > 0))
559-
scale_factor =
560-
(graph_view.is_symmetric() ? weight_t{2} : weight_t{1}) *
561-
static_cast<weight_t>(num_sources) /
562-
(include_endpoints ? static_cast<weight_t>(graph_view.number_of_vertices())
563-
: static_cast<weight_t>(graph_view.number_of_vertices() - 1));
564-
} else if (graph_view.is_symmetric()) {
565-
scale_factor = weight_t{2};
566571
}
567572

568-
if (scale_factor) {
569-
thrust::transform(handle.get_thrust_policy(),
570-
centralities.begin(),
571-
centralities.end(),
572-
centralities.begin(),
573-
[sf = *scale_factor] __device__(auto centrality) { return centrality / sf; });
573+
if (scale_nonsource) {
574+
auto iter = thrust::make_zip_iterator(
575+
thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()),
576+
centralities.begin());
577+
578+
thrust::transform(
579+
handle.get_thrust_policy(),
580+
iter,
581+
iter + centralities.size(),
582+
centralities.begin(),
583+
[nonsource = *scale_nonsource,
584+
source = *scale_source,
585+
vertices_begin,
586+
vertices_end] __device__(auto t) {
587+
vertex_t v = thrust::get<0>(t);
588+
weight_t centrality = thrust::get<1>(t);
589+
590+
return (thrust::find(thrust::seq, vertices_begin, vertices_end, v) == vertices_end)
591+
? centrality / nonsource
592+
: centrality / source;
593+
});
574594
}
575595

576596
return centralities;

cpp/tests/c_api/betweenness_centrality_test.c

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,16 @@ int generic_betweenness_centrality_test(vertex_t* h_src,
113113
TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed.");
114114

115115
for (int i = 0; (i < num_vertices) && (test_ret_value == 0); ++i) {
116-
TEST_ASSERT(test_ret_value,
117-
nearlyEqual(h_result[h_vertices[i]], h_centralities[i], 0.0001),
118-
"centralities results don't match");
116+
if (isnan(h_result[h_vertices[i]])) {
117+
TEST_ASSERT(test_ret_value, isnan(h_centralities[i]), "expected NaN, got a non-NaN value");
118+
} else {
119+
if (!nearlyEqual(h_result[h_vertices[i]], h_centralities[i], 0.0001))
120+
printf(" expected: %g, got %g\n", h_result[h_vertices[i]], h_centralities[i]);
121+
122+
TEST_ASSERT(test_ret_value,
123+
nearlyEqual(h_result[h_vertices[i]], h_centralities[i], 0.0001),
124+
"centralities results don't match");
125+
}
119126
}
120127

121128
cugraph_centrality_result_free(p_result);
@@ -169,7 +176,7 @@ int test_betweenness_centrality_specific_normalized()
169176
weight_t h_wgt[] = {
170177
0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f, 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
171178
vertex_t h_seeds[] = {0, 3};
172-
weight_t h_result[] = {0, 0.395833, 0.16667, 0.0833333, 0.0416667, 0.0625};
179+
weight_t h_result[] = {0, 0.395833, 0.166667, 0.166667, 0.0416667, 0.0625};
173180

174181
return generic_betweenness_centrality_test(h_src,
175182
h_dst,
@@ -197,7 +204,7 @@ int test_betweenness_centrality_specific_unnormalized()
197204
weight_t h_wgt[] = {
198205
0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f, 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
199206
vertex_t h_seeds[] = {0, 3};
200-
weight_t h_result[] = {0, 7.91667, 3.33333, 1.666667, 0.833333, 1.25};
207+
weight_t h_result[] = {0, 7.91667, 3.33333, 3.33333, 0.833333, 1.25};
201208

202209
return generic_betweenness_centrality_test(h_src,
203210
h_dst,
@@ -312,17 +319,17 @@ int test_issue_4941()
312319
{TRUE, TRUE, FALSE, 0, {1.0, 0.4, 0.4, 0.4, 0.4}},
313320
{TRUE, TRUE, FALSE, 1, {1.0, 1.0, 0.25, 0.25, 0.25}},
314321
{TRUE, FALSE, TRUE, 0, {1.0, 0.0, 0.0, 0.0, 0.0}},
315-
{TRUE, FALSE, TRUE, 1, {1.0, 0.0, 0.0, 0.0, 0.0}},
322+
{TRUE, FALSE, TRUE, 1, {1.0, NAN, 0.0, 0.0, 0.0}},
316323
{TRUE, FALSE, FALSE, 0, {1.0, 0.0, 0.0, 0.0, 0.0}},
317-
{TRUE, FALSE, FALSE, 1, {1.0, 0.0, 0.0, 0.0, 0.0}},
324+
{TRUE, FALSE, FALSE, 1, {1.0, NAN, 0.0, 0.0, 0.0}},
318325
{FALSE, TRUE, TRUE, 0, {20.0, 8.0, 8.0, 8.0, 8.0}},
319326
{FALSE, TRUE, TRUE, 1, {20.0, 20.0, 5.0, 5.0, 5.0}},
320327
{FALSE, TRUE, FALSE, 0, {10.0, 4.0, 4.0, 4.0, 4.0}},
321328
{FALSE, TRUE, FALSE, 1, {10.0, 10.0, 2.5, 2.5, 2.5}},
322329
{FALSE, FALSE, TRUE, 0, {12.0, 0.0, 0.0, 0.0, 0.0}},
323-
{FALSE, FALSE, TRUE, 1, {12.0, 0.0, 0.0, 0.0, 0.0}},
330+
{FALSE, FALSE, TRUE, 1, {12, NAN, 0.0, 0.0, 0.0}},
324331
{FALSE, FALSE, FALSE, 0, {6.0, 0.0, 0.0, 0.0, 0.0}},
325-
{FALSE, FALSE, FALSE, 1, {6.0, 0.0, 0.0, 0.0, 0.0}},
332+
{FALSE, FALSE, FALSE, 1, {6.0, NAN, 0.0, 0.0, 0.0}},
326333
};
327334

328335
int test_result = 0;

cpp/tests/centrality/betweenness_centrality_reference.hpp

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -130,41 +130,54 @@ void ref_edge_accumulation(std::vector<weight_t>& result,
130130
}
131131
}
132132

133-
template <typename result_t>
133+
template <typename vertex_t, typename result_t>
134134
void reference_rescale(result_t* result,
135+
vertex_t const* sources,
135136
bool directed,
136137
bool normalize,
137138
bool endpoints,
138139
size_t const number_of_vertices,
139140
size_t const number_of_sources)
140141
{
141-
result_t rescale_factor = static_cast<result_t>(1);
142142
result_t casted_number_of_sources = static_cast<result_t>(number_of_sources);
143143
result_t casted_number_of_vertices = static_cast<result_t>(number_of_vertices);
144+
if (!endpoints) casted_number_of_vertices = casted_number_of_vertices - 1;
144145

145-
if (normalize) {
146-
if (number_of_vertices > 2) {
147-
if (endpoints) {
148-
rescale_factor /=
149-
(number_of_sources > 0 ? casted_number_of_sources
150-
: casted_number_of_vertices * (casted_number_of_vertices - 1));
151-
} else {
152-
rescale_factor /= (number_of_sources > 0
153-
? casted_number_of_sources
154-
: (casted_number_of_vertices - 1) * (casted_number_of_vertices - 2));
155-
}
146+
if ((number_of_sources == number_of_vertices) || endpoints) {
147+
result_t rescale_factor = static_cast<result_t>(1);
148+
149+
if (normalize) {
150+
rescale_factor = result_t{1} / (casted_number_of_sources * (casted_number_of_vertices - 1));
151+
} else if (!directed) {
152+
rescale_factor = casted_number_of_vertices / (2 * casted_number_of_sources);
153+
} else {
154+
rescale_factor = casted_number_of_vertices / casted_number_of_sources;
156155
}
157-
} else if (number_of_sources < number_of_vertices) {
158-
rescale_factor = (endpoints ? casted_number_of_vertices : casted_number_of_vertices - 1) /
159-
(directed ? casted_number_of_sources : 2 * casted_number_of_sources);
160-
} else if (!directed) {
161-
rescale_factor = 2;
162-
}
163156

164-
if (rescale_factor != result_t{1}) {
165-
for (auto idx = 0; idx < number_of_vertices; ++idx) {
157+
for (vertex_t idx = 0; idx < number_of_vertices; ++idx) {
166158
result[idx] *= rescale_factor;
167159
}
160+
} else {
161+
result_t rescale_source = static_cast<result_t>(1);
162+
result_t rescale_non_source = static_cast<result_t>(1);
163+
164+
if (normalize) {
165+
rescale_source = 1 / ((casted_number_of_sources - 1) * (casted_number_of_vertices - 1));
166+
rescale_non_source = 1 / (casted_number_of_sources * (casted_number_of_vertices - 1));
167+
} else if (directed) {
168+
rescale_source = casted_number_of_vertices / (casted_number_of_sources - 1);
169+
rescale_non_source = casted_number_of_vertices / casted_number_of_sources;
170+
} else {
171+
rescale_source = casted_number_of_vertices / (2 * (casted_number_of_sources - 1));
172+
rescale_non_source = casted_number_of_vertices / (2 * casted_number_of_sources);
173+
}
174+
175+
for (vertex_t idx = 0; idx < number_of_vertices; ++idx) {
176+
if (std::find(sources, sources + number_of_sources, idx) == (sources + number_of_sources))
177+
result[idx] *= rescale_non_source;
178+
else
179+
result[idx] *= rescale_source;
180+
}
168181
}
169182
}
170183

@@ -235,8 +248,13 @@ std::vector<weight_t> betweenness_centrality_reference(
235248
}
236249
}
237250

238-
reference_rescale(
239-
result.data(), directed, normalize, include_endpoints, offsets.size() - 1, seeds.size());
251+
reference_rescale(result.data(),
252+
seeds.data(),
253+
directed,
254+
normalize,
255+
include_endpoints,
256+
offsets.size() - 1,
257+
seeds.size());
240258

241259
return result;
242260
}

python/cugraph/cugraph/tests/centrality/test_batch_betweenness_centrality_mg.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import pytest
1717
import numpy as np
18+
import networkx as nx
1819

1920
from cugraph.dask.common.mg_utils import is_single_gpu
2021
from cugraph.datasets import karate
@@ -55,7 +56,10 @@ def setup_function():
5556
# =============================================================================
5657

5758

58-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
59+
@pytest.mark.skipif(
60+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
61+
reason="Requires networkx >= 3.5",
62+
)
5963
@pytest.mark.mg
6064
@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system")
6165
@pytest.mark.parametrize("dataset", DATASETS)

python/cugraph/cugraph/tests/centrality/test_batch_edge_betweenness_centrality_mg.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import pytest
1717
import numpy as np
18+
import networkx as nx
1819

1920
from cugraph.dask.common.mg_utils import is_single_gpu
2021
from cugraph.datasets import karate, netscience
@@ -53,7 +54,10 @@ def setup_function():
5354

5455

5556
# FIXME: Fails for directed = False(bc score twice as much) and normalized = True.
56-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
57+
@pytest.mark.skipif(
58+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
59+
reason="Requires networkx >= 3.5",
60+
)
5761
@pytest.mark.mg
5862
@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system")
5963
@pytest.mark.parametrize("dataset", DATASETS)

python/cugraph/cugraph/tests/centrality/test_betweenness_centrality.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import networkx as nx
2020

2121
import cudf
22-
import cupy
2322
import cugraph
2423
from cugraph.datasets import karate_disjoint
2524
from cugraph.testing import utils, SMALL_DATASETS
@@ -102,7 +101,7 @@ def calc_betweenness_centrality(
102101
Contains 'vertex' and 'cu_bc' 'ref_bc' columns, where 'cu_bc'
103102
and 'ref_bc' are the two betweenness centrality scores to compare.
104103
The dataframe is expected to be sorted based on 'vertex', so that we
105-
can use cupy.isclose to compare the scores.
104+
can use np.isclose to compare the scores.
106105
"""
107106
G = None
108107
Gnx = None
@@ -289,8 +288,15 @@ def _calc_bc_full(G, Gnx, normalized, weight, endpoints, k, seed, result_dtype):
289288
# i.e: sorted_df[idx][first_key] should be compared to
290289
# sorted_df[idx][second_key]
291290
def compare_scores(sorted_df, first_key, second_key, epsilon=DEFAULT_EPSILON):
291+
# Compare with numpy and pandas since presence of NaNs in cudf Series
292+
# results in "ValueError: CuPy currently does not support masked arrays."
292293
errors = sorted_df[
293-
~cupy.isclose(sorted_df[first_key], sorted_df[second_key], rtol=epsilon)
294+
~np.isclose(
295+
sorted_df[first_key].to_pandas(),
296+
sorted_df[second_key].to_pandas(),
297+
rtol=epsilon,
298+
equal_nan=True,
299+
)
294300
]
295301
num_errors = len(errors)
296302
if num_errors > 0:
@@ -305,7 +311,10 @@ def compare_scores(sorted_df, first_key, second_key, epsilon=DEFAULT_EPSILON):
305311
# =============================================================================
306312
# Tests
307313
# =============================================================================
308-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
314+
@pytest.mark.skipif(
315+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
316+
reason="Requires networkx >= 3.5",
317+
)
309318
@pytest.mark.sg
310319
@pytest.mark.parametrize("graph_file", SMALL_DATASETS)
311320
@pytest.mark.parametrize("directed", [False, True])
@@ -542,17 +551,17 @@ def test_betweenness_centrality_nx(graph_file, directed, edgevals):
542551
(True, True, False, None, {0: 1.0, 1: 0.4, 2: 0.4, 3: 0.4, 4: 0.4}),
543552
(True, True, False, 1, {0: 1.0, 1: 1.0, 2: 0.25, 3: 0.25, 4: 0.25}),
544553
(True, False, True, None, {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
545-
(True, False, True, 1, {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
554+
(True, False, True, 1, {0: 1.0, 1: np.nan, 2: 0.0, 3: 0.0, 4: 0.0}),
546555
(True, False, False, None, {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
547-
(True, False, False, 1, {0: 1.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
556+
(True, False, False, 1, {0: 1.0, 1: np.nan, 2: 0.0, 3: 0.0, 4: 0.0}),
548557
(False, True, True, None, {0: 20.0, 1: 8.0, 2: 8.0, 3: 8.0, 4: 8.0}),
549558
(False, True, True, 1, {0: 20.0, 1: 20.0, 2: 5.0, 3: 5.0, 4: 5.0}),
550559
(False, True, False, None, {0: 10.0, 1: 4.0, 2: 4.0, 3: 4.0, 4: 4.0}),
551560
(False, True, False, 1, {0: 10.0, 1: 10.0, 2: 2.5, 3: 2.5, 4: 2.5}),
552561
(False, False, True, None, {0: 12.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
553-
(False, False, True, 1, {0: 12.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
562+
(False, False, True, 1, {0: 12.0, 1: np.nan, 2: 0.0, 3: 0.0, 4: 0.0}),
554563
(False, False, False, None, {0: 6.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
555-
(False, False, False, 1, {0: 6.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0}),
564+
(False, False, False, 1, {0: 6.0, 1: np.nan, 2: 0.0, 3: 0.0, 4: 0.0}),
556565
],
557566
)
558567
def test_scale_with_k_on_star_graph(normalized, endpoints, is_directed, k, expected):

python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,10 @@ def generate_upper_triangle(dataframe):
312312
return dataframe
313313

314314

315-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
315+
@pytest.mark.skipif(
316+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
317+
reason="Requires networkx >= 3.5",
318+
)
316319
@pytest.mark.sg
317320
@pytest.mark.parametrize("graph_file", SMALL_DATASETS)
318321
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
@@ -343,7 +346,10 @@ def test_edge_betweenness_centrality(
343346
compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
344347

345348

346-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
349+
@pytest.mark.skipif(
350+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
351+
reason="Requires networkx >= 3.5",
352+
)
347353
@pytest.mark.sg
348354
@pytest.mark.parametrize("graph_file", SMALL_DATASETS)
349355
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
@@ -383,7 +389,10 @@ def test_edge_betweenness_centrality_k_full(
383389
# the function operating the comparison inside is first proceeding
384390
# to a random sampling over the number of vertices (thus direct offsets)
385391
# in the graph structure instead of actual vertices identifiers
386-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
392+
@pytest.mark.skipif(
393+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
394+
reason="Requires networkx >= 3.5",
395+
)
387396
@pytest.mark.sg
388397
@pytest.mark.parametrize("graph_file", [karate_disjoint])
389398
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
@@ -487,7 +496,10 @@ def test_edge_betweenness_invalid_dtype(
487496
compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
488497

489498

490-
@pytest.mark.skip(reason="https://github.com/networkx/networkx/pull/7908")
499+
@pytest.mark.skipif(
500+
float(".".join(nx.__version__.split(".")[:2])) < 3.5,
501+
reason="Requires networkx >= 3.5",
502+
)
491503
@pytest.mark.sg
492504
@pytest.mark.parametrize("graph_file", SMALL_DATASETS)
493505
@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)

0 commit comments

Comments
 (0)