-
Notifications
You must be signed in to change notification settings - Fork 329
[FEA] cuGraph GNN NCCL-only Setup and Distributed Sampling #4278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
46a8b0b
2d18d56
39d139d
ca732a9
2669596
8e17ee7
4e36719
2fe5084
50c5a80
9e393a0
b35ad1f
9bc944b
5e9135b
0593278
b44ea3b
e7af045
06dfdd5
4cd2f10
4dfae1f
983a684
03c27c9
b9cd529
30ab324
e1d0b10
6bf4f03
eb94b72
d0d3d4d
6814a65
92ca361
f255b61
9f985b5
414e30d
da3b0fb
d747dff
7f5abff
71ea76e
dd9ed2d
1163a81
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This example shows how to use cuGraph nccl-only comms, pylibcuGraph, | ||
# and PyTorch DDP to run a multi-GPU sampling workflow. Most users of the | ||
# GNN packages will not interact with cuGraph directly. This example | ||
# is intented for users who want to extend cuGraph within a DDP workflow. | ||
|
||
import os | ||
import re | ||
import tempfile | ||
|
||
import numpy as np | ||
import torch | ||
import torch.multiprocessing as tmp | ||
import torch.distributed as dist | ||
|
||
import cudf | ||
|
||
from cugraph.gnn import ( | ||
cugraph_comms_init, | ||
cugraph_comms_shutdown, | ||
cugraph_comms_create_unique_id, | ||
cugraph_comms_get_raft_handle, | ||
DistSampleWriter, | ||
UniformNeighborSampler, | ||
) | ||
|
||
from pylibcugraph import MGGraph, ResourceHandle, GraphProperties | ||
|
||
from ogb.nodeproppred import NodePropPredDataset | ||
|
||
|
||
def init_pytorch(rank, world_size): | ||
os.environ["MASTER_ADDR"] = "localhost" | ||
os.environ["MASTER_PORT"] = "12355" | ||
dist.init_process_group("nccl", rank=rank, world_size=world_size) | ||
|
||
|
||
def sample(rank: int, world_size: int, uid, edgelist, directory): | ||
init_pytorch(rank, world_size) | ||
|
||
device = rank | ||
cugraph_comms_init(rank, world_size, uid, device) | ||
|
||
print(f"rank {rank} initialized cugraph") | ||
|
||
src = cudf.Series(np.array_split(edgelist[0], world_size)[rank]) | ||
dst = cudf.Series(np.array_split(edgelist[1], world_size)[rank]) | ||
|
||
seeds = cudf.Series(np.arange(rank * 50, (rank + 1) * 50)) | ||
handle = ResourceHandle(cugraph_comms_get_raft_handle().getHandle()) | ||
|
||
print("constructing graph") | ||
G = MGGraph( | ||
handle, | ||
GraphProperties(is_multigraph=True, is_symmetric=False), | ||
[src], | ||
[dst], | ||
) | ||
print("graph constructed") | ||
|
||
sample_writer = DistSampleWriter(directory=directory, batches_per_partition=2) | ||
sampler = UniformNeighborSampler( | ||
G, | ||
sample_writer, | ||
fanout=[5, 5], | ||
) | ||
|
||
sampler.sample_from_nodes(seeds, batch_size=16, random_state=62) | ||
|
||
dist.barrier() | ||
cugraph_comms_shutdown() | ||
print(f"rank {rank} shut down cugraph") | ||
|
||
|
||
def main(): | ||
world_size = torch.cuda.device_count() | ||
uid = cugraph_comms_create_unique_id() | ||
|
||
dataset = NodePropPredDataset("ogbn-products") | ||
el = dataset[0][0]["edge_index"].astype("int64") | ||
|
||
with tempfile.TemporaryDirectory() as directory: | ||
tmp.spawn( | ||
sample, | ||
args=(world_size, uid, el, "."), | ||
nprocs=world_size, | ||
) | ||
|
||
print("Printing samples...") | ||
for file in os.listdir(directory): | ||
m = re.match(r"batch=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet", file) | ||
rank, start, _, end = int(m[1]), int(m[2]), int(m[3]), int(m[4]) | ||
print(f"File: {file} (batches {start} to {end} for rank {rank})") | ||
print(cudf.read_parquet(os.path.join(directory, file))) | ||
print("\n") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This example shows how to use cuGraph nccl-only comms, pylibcuGraph, | ||
# and PyTorch to run a single-GPU sampling workflow. Most users of the | ||
# GNN packages will not interact with cuGraph directly. This example | ||
# is intented for users who want to extend cuGraph within a PyTorch workflow. | ||
|
||
import os | ||
import re | ||
import tempfile | ||
|
||
import numpy as np | ||
|
||
import cudf | ||
|
||
from cugraph.gnn import ( | ||
DistSampleWriter, | ||
UniformNeighborSampler, | ||
) | ||
|
||
from pylibcugraph import SGGraph, ResourceHandle, GraphProperties | ||
|
||
from ogb.nodeproppred import NodePropPredDataset | ||
|
||
|
||
def sample(edgelist, directory): | ||
src = cudf.Series(edgelist[0]) | ||
dst = cudf.Series(edgelist[1]) | ||
|
||
seeds = cudf.Series(np.arange(0, 50)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here. What happens if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just a toy example, it's not meant to be robust, just to accept the known good input we give it (in this case the ogbn-products dataset). |
||
|
||
print("constructing graph") | ||
G = SGGraph( | ||
ResourceHandle(), | ||
GraphProperties(is_multigraph=True, is_symmetric=False), | ||
src, | ||
dst, | ||
) | ||
print("graph constructed") | ||
|
||
sample_writer = DistSampleWriter(directory=directory, batches_per_partition=2) | ||
sampler = UniformNeighborSampler( | ||
G, | ||
sample_writer, | ||
fanout=[5, 5], | ||
) | ||
|
||
sampler.sample_from_nodes(seeds, batch_size=16, random_state=62) | ||
|
||
|
||
def main(): | ||
dataset = NodePropPredDataset("ogbn-products") | ||
el = dataset[0][0]["edge_index"].astype("int64") | ||
|
||
with tempfile.TemporaryDirectory() as directory: | ||
sample(el, directory) | ||
|
||
print("Printing samples...") | ||
for file in os.listdir(directory): | ||
m = re.match(r"batch=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet", file) | ||
rank, start, _, end = int(m[1]), int(m[2]), int(m[3]), int(m[4]) | ||
print(f"File: {file} (batches {start} to {end} for rank {rank})") | ||
print(cudf.read_parquet(os.path.join(directory, file))) | ||
print("\n") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This example shows how to use cuGraph nccl-only comms, pylibcuGraph, | ||
# and PyTorch DDP to run a multi-GPU workflow. Most users of the | ||
# GNN packages will not interact with cuGraph directly. This example | ||
# is intented for users who want to extend cuGraph within a DDP workflow. | ||
|
||
import os | ||
|
||
import pandas | ||
import numpy as np | ||
import torch | ||
import torch.multiprocessing as tmp | ||
import torch.distributed as dist | ||
|
||
import cudf | ||
|
||
from cugraph.gnn import ( | ||
cugraph_comms_init, | ||
cugraph_comms_shutdown, | ||
cugraph_comms_create_unique_id, | ||
cugraph_comms_get_raft_handle, | ||
) | ||
|
||
from pylibcugraph import MGGraph, ResourceHandle, GraphProperties, degrees | ||
|
||
from ogb.nodeproppred import NodePropPredDataset | ||
|
||
|
||
def init_pytorch(rank, world_size): | ||
os.environ["MASTER_ADDR"] = "localhost" | ||
os.environ["MASTER_PORT"] = "12355" | ||
dist.init_process_group("nccl", rank=rank, world_size=world_size) | ||
|
||
|
||
def calc_degree(rank: int, world_size: int, uid, edgelist): | ||
init_pytorch(rank, world_size) | ||
|
||
device = rank | ||
cugraph_comms_init(rank, world_size, uid, device) | ||
|
||
print(f"rank {rank} initialized cugraph") | ||
|
||
src = cudf.Series(np.array_split(edgelist[0], world_size)[rank]) | ||
dst = cudf.Series(np.array_split(edgelist[1], world_size)[rank]) | ||
|
||
seeds = cudf.Series(np.arange(rank * 50, (rank + 1) * 50)) | ||
handle = ResourceHandle(cugraph_comms_get_raft_handle().getHandle()) | ||
|
||
print("constructing graph") | ||
G = MGGraph( | ||
handle, | ||
GraphProperties(is_multigraph=True, is_symmetric=False), | ||
[src], | ||
[dst], | ||
) | ||
print("graph constructed") | ||
|
||
print("calculating degrees") | ||
vertices, in_deg, out_deg = degrees(handle, G, seeds, do_expensive_check=False) | ||
print("degrees calculated") | ||
|
||
print("constructing dataframe") | ||
df = pandas.DataFrame( | ||
{"v": vertices.get(), "in": in_deg.get(), "out": out_deg.get()} | ||
) | ||
print(df) | ||
|
||
dist.barrier() | ||
cugraph_comms_shutdown() | ||
print(f"rank {rank} shut down cugraph") | ||
|
||
|
||
def main(): | ||
world_size = torch.cuda.device_count() | ||
uid = cugraph_comms_create_unique_id() | ||
|
||
dataset = NodePropPredDataset("ogbn-products") | ||
el = dataset[0][0]["edge_index"].astype("int64") | ||
|
||
tmp.spawn( | ||
calc_degree, | ||
args=(world_size, uid, el), | ||
nprocs=world_size, | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This example shows how to use cuGraph and pylibcuGraph to run a | ||
# single-GPU workflow. Most users of the GNN packages will not interact | ||
# with cuGraph directly. This example is intented for users who want | ||
# to extend cuGraph within a PyTorch workflow. | ||
|
||
import pandas | ||
import numpy as np | ||
|
||
import cudf | ||
|
||
from pylibcugraph import SGGraph, ResourceHandle, GraphProperties, degrees | ||
|
||
from ogb.nodeproppred import NodePropPredDataset | ||
|
||
|
||
def calc_degree(edgelist): | ||
src = cudf.Series(edgelist[0]) | ||
dst = cudf.Series(edgelist[1]) | ||
|
||
seeds = cudf.Series(np.arange(256)) | ||
|
||
print("constructing graph") | ||
G = SGGraph( | ||
ResourceHandle(), | ||
GraphProperties(is_multigraph=True, is_symmetric=False), | ||
src, | ||
dst, | ||
) | ||
print("graph constructed") | ||
|
||
print("calculating degrees") | ||
vertices, in_deg, out_deg = degrees( | ||
ResourceHandle(), G, seeds, do_expensive_check=False | ||
) | ||
print("degrees calculated") | ||
|
||
print("constructing dataframe") | ||
df = pandas.DataFrame( | ||
{"v": vertices.get(), "in": in_deg.get(), "out": out_deg.get()} | ||
) | ||
print(df) | ||
|
||
print("done") | ||
|
||
|
||
def main(): | ||
dataset = NodePropPredDataset("ogbn-products") | ||
el = dataset[0][0]["edge_index"].astype("int64") | ||
calc_degree(el) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just minor nitpicking suggestions.
What is 50? # seeds per rank? If this is an example,
will be more readable.
And just for completeness. Are we assuming that # ranks * # seeds per rank > # vertices in the input graph? If this code assumes anything about the input
edgelist
, we may specify that in comments or we may give a check.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you mean < # vertices? But this is really just meant to be a toy example. It's using a dataset that has far more than that number of vertices.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also I changed the variable to
seeds_per_rank
like you suggested to make it clearer what I'm doing.