@@ -86,7 +86,7 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
86
86
local_vertices.size (),
87
87
current_graph_view.local_vertex_partition_range_first ());
88
88
89
- using flag_t = uint8_t ;
89
+ using flag_t = uint32_t ;
90
90
edge_src_property_t <graph_view_t , vertex_t > src_key_cache (handle);
91
91
cugraph::edge_src_property_t <graph_view_t , flag_t > src_match_flags (handle);
92
92
cugraph::edge_dst_property_t <graph_view_t , flag_t > dst_match_flags (handle);
@@ -101,6 +101,28 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
101
101
102
102
vertex_t loop_counter = 0 ;
103
103
while (true ) {
104
+ std::cout << " #V: " << current_graph_view.number_of_vertices ()
105
+ << " #E: " << current_graph_view.compute_number_of_edges (handle) << std::endl;
106
+ cugraph::edge_property_t <graph_view_t , bool > temp_eps (handle, current_graph_view);
107
+ auto sg = graph_view_t ::is_multi_gpu;
108
+ cugraph::transform_e (
109
+ handle,
110
+ current_graph_view,
111
+ cugraph::edge_src_dummy_property_t {}.view (),
112
+ cugraph::edge_dst_dummy_property_t {}.view (),
113
+ edge_weight_view,
114
+ [loop_counter, sg] __device__ (
115
+ auto src, auto dst, thrust::nullopt_t , thrust::nullopt_t , auto wgt) {
116
+ printf (" \n %d => %d %d %f [%d]\n " ,
117
+ static_cast <int >(loop_counter),
118
+ static_cast <int >(src),
119
+ static_cast <int >(dst),
120
+ static_cast <float >(wgt),
121
+ static_cast <int >(sg));
122
+ return false ;
123
+ },
124
+ temp_eps.mutable_view ());
125
+
104
126
if constexpr (graph_view_t ::is_multi_gpu) {
105
127
update_edge_src_property (handle, current_graph_view, local_vertices.begin (), src_key_cache);
106
128
}
@@ -150,34 +172,21 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
150
172
auto & minor_comm = handle.get_subcomm (cugraph::partition_manager::minor_comm_name ());
151
173
auto const minor_comm_size = minor_comm.get_size ();
152
174
153
- auto func = cugraph::detail::compute_gpu_id_from_int_vertex_t <vertex_t >{
175
+ auto key_func = cugraph::detail::compute_gpu_id_from_int_vertex_t <vertex_t >{
154
176
raft::device_span<vertex_t const >(d_vertex_partition_range_lasts.data (),
155
177
d_vertex_partition_range_lasts.size ()),
156
178
major_comm_size,
157
179
minor_comm_size};
158
180
159
- rmm::device_uvector<size_t > d_tx_value_counts (0 , handle.get_stream ());
160
-
161
- auto triplet_first = thrust::make_zip_iterator (
162
- candidates.begin (), offers_from_candidates.begin (), targets.begin ());
163
-
164
- d_tx_value_counts = cugraph::groupby_and_count (
165
- triplet_first,
166
- triplet_first + candidates.size (),
167
- [func] __device__ (auto val) { return func (thrust::get<2 >(val)); },
168
- handle.get_comms ().get_size (),
169
- std::numeric_limits<vertex_t >::max (),
170
- handle.get_stream ());
171
-
172
- std::vector<size_t > h_tx_value_counts (d_tx_value_counts.size ());
173
- raft::update_host (h_tx_value_counts.data (),
174
- d_tx_value_counts.data (),
175
- d_tx_value_counts.size (),
176
- handle.get_stream ());
177
- handle.sync_stream ();
178
-
179
181
std::forward_as_tuple (std::tie (candidates, offers_from_candidates, targets), std::ignore) =
180
- shuffle_values (handle.get_comms (), triplet_first, h_tx_value_counts, handle.get_stream ());
182
+ cugraph::groupby_gpu_id_and_shuffle_values (
183
+ handle.get_comms (),
184
+ thrust::make_zip_iterator (thrust::make_tuple (
185
+ candidates.begin (), offers_from_candidates.begin (), targets.begin ())),
186
+ thrust::make_zip_iterator (
187
+ thrust::make_tuple (candidates.end (), offers_from_candidates.end (), targets.end ())),
188
+ [key_func] __device__ (auto val) { return key_func (thrust::get<2 >(val)); },
189
+ handle.get_stream ());
181
190
}
182
191
183
192
auto itr_to_tuples = thrust::make_zip_iterator (
@@ -254,6 +263,7 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
254
263
candidates.begin (),
255
264
candidates.end (),
256
265
vertex_to_gpu_id_op);
266
+
257
267
} else {
258
268
candidates_of_candidates.resize (candidates.size (), handle.get_stream ());
259
269
@@ -263,6 +273,35 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
263
273
handle.get_stream ());
264
274
}
265
275
276
+ auto const comm_rank = graph_view_t ::is_multi_gpu ? handle.get_comms ().get_rank () : 0 ;
277
+
278
+ RAFT_CUDA_TRY (cudaDeviceSynchronize ());
279
+ auto targetss_title = std::string (" targets_" ).append (std::to_string (comm_rank)).append (" _" );
280
+
281
+ raft::print_device_vector (targetss_title.c_str (), targets.begin (), targets.size (), std::cout);
282
+
283
+ RAFT_CUDA_TRY (cudaDeviceSynchronize ());
284
+ auto cands_title = std::string (" cands_" ).append (std::to_string (comm_rank)).append (" _" );
285
+
286
+ raft::print_device_vector (
287
+ cands_title.c_str (), candidates.begin (), candidates.size (), std::cout);
288
+
289
+ RAFT_CUDA_TRY (cudaDeviceSynchronize ());
290
+ auto offers_title = std::string (" offers_" ).append (std::to_string (comm_rank)).append (" _" );
291
+
292
+ raft::print_device_vector (offers_title.c_str (),
293
+ offers_from_candidates.begin (),
294
+ offers_from_candidates.size (),
295
+ std::cout);
296
+
297
+ RAFT_CUDA_TRY (cudaDeviceSynchronize ());
298
+ auto ccs_title = std::string (" ccs_" ).append (std::to_string (comm_rank)).append (" _" );
299
+
300
+ raft::print_device_vector (ccs_title.c_str (),
301
+ candidates_of_candidates.begin (),
302
+ candidates_of_candidates.size (),
303
+ std::cout);
304
+
266
305
//
267
306
// Mask out neighborhood of matched vertices
268
307
//
@@ -302,6 +341,12 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
302
341
}
303
342
});
304
343
344
+ RAFT_CUDA_TRY (cudaDeviceSynchronize ());
345
+ auto ivm_title = std::string (" ivm_" ).append (std::to_string (comm_rank)).append (" _" );
346
+
347
+ raft::print_device_vector (
348
+ ivm_title.c_str (), is_vertex_matched.begin (), is_vertex_matched.size (), std::cout);
349
+
305
350
if (current_graph_view.compute_number_of_edges (handle) == 0 ) { break ; }
306
351
307
352
if constexpr (graph_view_t ::is_multi_gpu) {
@@ -324,6 +369,15 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
324
369
cugraph::edge_dummy_property_t {}.view (),
325
370
[loop_counter] __device__ (
326
371
auto src, auto dst, auto is_src_matched, auto is_dst_matched, thrust::nullopt_t ) {
372
+ bool flag = !((is_src_matched == uint8_t {true }) || (is_dst_matched == uint8_t {true }));
373
+ if (flag) {
374
+ printf (" \n ** %d => src %d dst %d sm %d dm %d\n " ,
375
+ static_cast <int >(loop_counter),
376
+ static_cast <int >(src),
377
+ static_cast <int >(dst),
378
+ static_cast <int >(is_src_matched),
379
+ static_cast <int >(is_dst_matched));
380
+ }
327
381
return !((is_src_matched == uint8_t {true }) || (is_dst_matched == uint8_t {true }));
328
382
},
329
383
edge_masks_odd.mutable_view ());
@@ -356,6 +410,14 @@ std::tuple<rmm::device_uvector<vertex_t>, weight_t> approximate_weighted_matchin
356
410
loop_counter++;
357
411
}
358
412
413
+ auto const comm_rank = graph_view_t ::is_multi_gpu ? handle.get_comms ().get_rank () : 0 ;
414
+
415
+ RAFT_CUDA_TRY (cudaDeviceSynchronize ());
416
+ auto ofp_title = std::string (" ofp_" ).append (std::to_string (comm_rank)).append (" _" );
417
+
418
+ raft::print_device_vector (
419
+ ofp_title.c_str (), offers_from_partners.begin (), offers_from_partners.size (), std::cout);
420
+
359
421
weight_t sum_matched_edge_weights = thrust::reduce (
360
422
handle.get_thrust_policy (), offers_from_partners.begin (), offers_from_partners.end ());
361
423
0 commit comments