Merge branch 'master' into gb_cuda_examples2

dmlc · Jan 23, 2024 · 1240697 · 1240697
2 parents 9ad218c + 2fedcdc
commit 1240697
Show file tree

Hide file tree

Showing 15 changed files with 115 additions and 79 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -318,7 +318,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v231103_1700"
+                  image "dgllib/dgl-ci-cpu:v240123_1000"
                   args "-u root"
                   alwaysPull true
                 }
@@ -337,7 +337,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
+                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
                   args "-u root"
                   alwaysPull true
                 }
@@ -392,7 +392,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v231103_1700"
+                  image "dgllib/dgl-ci-cpu:v240123_1000"
                   args "-u root"
                   alwaysPull true
                 }
@@ -411,7 +411,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-gpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
+                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
                   args "-u root --runtime nvidia"
                   alwaysPull true
                 }
@@ -464,7 +464,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-gpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
+                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
                   args "-u root --runtime nvidia"
                   alwaysPull true
                 }
@@ -489,7 +489,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v231103_1700"
+                  image "dgllib/dgl-ci-cpu:v240123_1000"
                   args "-u root --shm-size=4gb"
                   alwaysPull true
                 }
@@ -542,7 +542,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-gpu"
-                  image "dgllib/dgl-ci-gpu:cu116_v231103_1700"
+                  image "dgllib/dgl-ci-gpu:cu116_v240123_1000"
                   args "-u root --runtime nvidia --shm-size=8gb"
                   alwaysPull true
                 }
@@ -571,7 +571,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v231103_1700"
+                  image "dgllib/dgl-ci-cpu:v240123_1000"
                   args "-u root --shm-size=4gb"
                   alwaysPull true
                 }
@@ -618,7 +618,7 @@ pipeline {
               agent {
                 docker {
                   label "dgl-ci-linux-cpu"
-                  image "dgllib/dgl-ci-cpu:v231103_1700"
+                  image "dgllib/dgl-ci-cpu:v240123_1000"
                   args "-u root"
                   alwaysPull true
                 }

diff --git a/docker/install/conda_env/torch_cpu_pip.txt b/docker/install/conda_env/torch_cpu_pip.txt
@@ -21,4 +21,5 @@ torch==1.13.0+cpu
 torchdata
 torcheval
 torchmetrics
+torch_geometric
 tqdm
diff --git a/docker/install/conda_env/torch_gpu_pip.txt b/docker/install/conda_env/torch_gpu_pip.txt
@@ -19,4 +19,5 @@ torch==1.13.0+cu116
 torchdata
 torcheval
 torchmetrics
+torch_geometric
 tqdm
diff --git a/examples/core/rgcn/README.md b/examples/core/rgcn/README.md
@@ -20,15 +20,15 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9
 
 | Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
 | ------------ | ------------- | ----------- | ------------- | ------------------------ |
-| ~1.1GB       | ~5GB          | 0           |  0GB          | ~243s                    |
-| ~1.1GB       | ~3GB          | 1           |  4.4GB        | ~81s                     |
+| ~1.1GB       | ~7GB          | 0           |  0GB          | ~233s                    |
+| ~1.1GB       | ~5GB          | 1           |  4.5GB        | ~73.6s                   |
 
 ### Accuracies
 ```
-Epoch: 01, Loss: 2.3302, Valid: 47.76%, Test: 46.58%
-Epoch: 02, Loss: 1.5486, Valid: 48.31%, Test: 47.12%
-Epoch: 03, Loss: 1.1469, Valid: 46.43%, Test: 45.18%
-Test accuracy 45.1227
+Epoch: 01, Loss: 2.3386, Valid: 47.67%, Test: 46.96%
+Epoch: 02, Loss: 1.5563, Valid: 47.66%, Test: 47.02%
+Epoch: 03, Loss: 1.1557, Valid: 46.58%, Test: 45.42%
+Test accuracy 45.3850
 ```
 
 ## Run on `ogb-lsc-mag240m` dataset
@@ -54,8 +54,8 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9
 
 | Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
 | ------------ | ------------- | ----------- | ------------- | ------------------------ |
-| ~404GB       | ~60GB         | 0           |  0GB          | ~216s                    |
-| ~404GB       | ~60GB         | 1           |  7GB          | ~157s                    |
+| ~404GB       | ~72GB         | 0           |  0GB          | ~325s                    |
+| ~404GB       | ~61GB         | 1           |  14GB         | ~178s                    |
 
 ### Accuracies
 ```

diff --git a/examples/multigpu/node_classification_sage.py b/examples/multigpu/node_classification_sage.py
@@ -171,12 +171,16 @@ def train(
     use_uva,
 ):
     # Instantiate a neighbor sampler
-    sampler = NeighborSampler(
-        [10, 10, 10],
-        prefetch_node_feats=["feat"],
-        prefetch_labels=["label"],
-        fused=(args.mode != "benchmark"),
-    )
+    if args.mode == "benchmark":
+        # A work-around to prevent CUDA running error. For more details, please
+        # see https://github.com/dmlc/dgl/issues/6697.
+        sampler = NeighborSampler([10, 10, 10], fused=False)
+    else:
+        sampler = NeighborSampler(
+            [10, 10, 10],
+            prefetch_node_feats=["feat"],
+            prefetch_labels=["label"],
+        )
     train_dataloader = DataLoader(
         g,
         train_idx,

diff --git a/examples/sampling/graphbolt/node_classification.py b/examples/sampling/graphbolt/node_classification.py
@@ -287,7 +287,9 @@ def evaluate(args, model, graph, features, itemset, num_classes):
 
 
 def train(args, graph, features, train_set, valid_set, num_classes, model):
-    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+    optimizer = torch.optim.Adam(
+        model.parameters(), lr=args.lr, weight_decay=5e-4
+    )
     dataloader = create_dataloader(
         graph=graph,
         features=features,
@@ -343,7 +345,7 @@ def parse_args():
     parser.add_argument(
         "--lr",
         type=float,
-        default=0.0005,
+        default=1e-3,
         help="Learning rate for optimization.",
     )
     parser.add_argument(

diff --git a/examples/sampling/graphbolt/rgcn/README.md b/examples/sampling/graphbolt/rgcn/README.md
@@ -19,15 +19,15 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9
 
 | Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
 | ------------ | ------------- | ----------- | ------------- | ------------------------ |
-| ~1.1GB       | ~4.5GB        | 0           |  0GB          | ~235s                    |
-| ~1.1GB       | ~2GB          | 1           |  4.4GB        | ~60s                     |
+| ~1.1GB       | ~5.3GB        | 0           |  0GB          | ~230s                    |
+| ~1.1GB       | ~3GB          | 1           |  3.87GB       | ~64.6s                   |
 
 ### Accuracies
 ```
-Epoch: 01, Loss: 2.6736, Valid accuracy: 42.21%
-Epoch: 02, Loss: 2.0809, Valid accuracy: 42.51%
-Epoch: 03, Loss: 1.8143, Valid accuracy: 42.76%
-Test accuracy 41.4817
+Epoch: 01, Loss: 2.3434, Valid accuracy: 48.23%
+Epoch: 02, Loss: 1.5646, Valid accuracy: 48.49%
+Epoch: 03, Loss: 1.1633, Valid accuracy: 45.79%
+Test accuracy 44.6792
 ```
 
 ## Run on `ogb-lsc-mag240m` dataset
@@ -47,17 +47,17 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9
 
 > **note:**
 `buffer/cache` are highly used during train, it's about 300GB. If more RAM is available, more `buffer/cache` will be consumed as graph size is about 55GB and feature data is about 350GB.
-One more thing, first epoch is quite slow as `buffer/cache` is not ready yet. For GPU train, first epoch takes **34:56min, 1.93s/it**.
+One more thing: the first epoch is quite slow because `buffer/cache` is not ready yet. For GPU training, the first epoch takes **1030s**.
 Even in following epochs, time consumption varies.
 
 | Dataset Size | CPU RAM Usage | Num of GPUs | GPU RAM Usage | Time Per Epoch(Training) |
 | ------------ | ------------- | ----------- | ------------- | ------------------------ |
-| ~404GB       | ~55GB       | 0           |  0GB            | ~197s                    |
-| ~404GB       | ~55GB       | 1           |  7GB            | ~119s                    |
+| ~404GB       | ~67GB         | 0           |  0GB          | ~248s                    |
+| ~404GB       | ~60GB         | 1           |  15GB         | ~166s                    |
 
 ### Accuracies
 ```
-Epoch: 01, Loss: 2.3038, Valid accuracy: 46.33%
-Epoch: 02, Loss: 2.1160, Valid accuracy: 46.47%
-Epoch: 03, Loss: 2.0847, Valid accuracy: 48.38%
+Epoch: 01, Loss: 2.1432, Valid accuracy: 50.21%
+Epoch: 02, Loss: 1.9267, Valid accuracy: 50.77%
+Epoch: 03, Loss: 1.8797, Valid accuracy: 53.38%
 ```
diff --git a/python/dgl/graphbolt/minibatch.py b/python/dgl/graphbolt/minibatch.py
@@ -80,6 +80,19 @@ class MiniBatch:
           or other graph components depending on the specific context.
     """
 
+    indexes: Union[torch.Tensor, Dict[str, torch.Tensor]] = None
+    """
+    Indexes associated with seed nodes / node pairs in the graph, which
+    indicate to which query a seed node / node pair belongs.
+    - If `indexes` is a tensor: It indicates the graph is homogeneous. The
+      value should be the corresponding query for the given 'seed_nodes' or
+      'node_pairs'.
+    - If `indexes` is a dictionary: It indicates the graph is heterogeneous.
+      The keys should be node or edge types and the values should be the
+      corresponding queries for the given 'seed_nodes' or 'node_pairs'. For
+      each key, indexes are consecutive integers starting from zero.
+    """
+
     negative_srcs: Union[torch.Tensor, Dict[str, torch.Tensor]] = None
     """
     Representation of negative samples for the head nodes in the link

diff --git a/python/dgl/graphbolt/sampled_subgraph.py b/python/dgl/graphbolt/sampled_subgraph.py
@@ -30,16 +30,18 @@ def sampled_csc(
         self,
     ) -> Union[CSCFormatBase, Dict[str, CSCFormatBase],]:
         """Returns the node pairs representing edges in csc format.
-        - If `sampled_csc` is a CSCFormatBase: It should be in the csc format.
-          `indptr` stores the index in the data array where each column
-          starts. `indices` stores the row indices of the non-zero elements.
-        - If `sampled_csc` is a dictionary: The keys should be edge type and
-          the values should be corresponding node pairs. The ids inside is
-          heterogeneous ids.
+          - If `sampled_csc` is a CSCFormatBase: It should be in the csc
+            format. `indptr` stores the index in the data array where each
+            column starts. `indices` stores the row indices of the non-zero
+            elements.
+          - If `sampled_csc` is a dictionary: The keys should be edge type and
+            the values should be corresponding node pairs. The ids inside are
+            heterogeneous ids.
 
         Examples
         --------
         1. Homogeneous graph.
+
         >>> import dgl.graphbolt as gb
         >>> import torch
         >>> sampled_csc = gb.CSCFormatBase(
@@ -51,6 +53,7 @@ def sampled_csc(
         )
 
         2. Heterogeneous graph.
+
         >>> sampled_csc = {"A:relation:B": gb.CSCFormatBase(
         ...     indptr=torch.tensor([0, 1, 2, 3]),
         ...     indices=torch.tensor([0, 1, 2]))}
@@ -69,11 +72,11 @@ def original_column_node_ids(
         Column's reverse node ids in the original graph. A graph structure
         can be treated as a coordinated row and column pair, and this is
         the mapped ids of the column.
-        - If `original_column_node_ids` is a tensor: It represents the original
-          node ids.
-        - If `original_column_node_ids` is a dictionary: The keys should be
-          node type and the values should be corresponding original
-          heterogeneous node ids.
+          - If `original_column_node_ids` is a tensor: It represents the
+            original node ids.
+          - If `original_column_node_ids` is a dictionary: The keys should be
+            node type and the values should be corresponding original
+            heterogeneous node ids.
         If present, it means column IDs are compacted, and `sampled_csc`
         column IDs match these compacted ones.
         """
@@ -87,11 +90,11 @@ def original_row_node_ids(
         Row's reverse node ids in the original graph. A graph structure
         can be treated as a coordinated row and column pair, and this is
         the mapped ids of the row.
-        - If `original_row_node_ids` is a tensor: It represents the original
-          node ids.
-        - If `original_row_node_ids` is a dictionary: The keys should be node
-          type and the values should be corresponding original heterogeneous
-          node ids.
+          - If `original_row_node_ids` is a tensor: It represents the original
+            node ids.
+          - If `original_row_node_ids` is a dictionary: The keys should be node
+            type and the values should be corresponding original heterogeneous
+            node ids.
         If present, it means row IDs are compacted, and `sampled_csc`
         row IDs match these compacted ones."""
         return None
@@ -101,11 +104,11 @@ def original_edge_ids(self) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
         """Returns corresponding reverse edge ids in the original graph.
         Reverse edge ids in the original graph. This is useful when edge
         features are needed.
-        - If `original_edge_ids` is a tensor: It represents the original edge
-          ids.
-        - If `original_edge_ids` is a dictionary: The keys should be edge type
-          and the values should be corresponding original heterogeneous edge
-          ids.
+          - If `original_edge_ids` is a tensor: It represents the original edge
+            ids.
+          - If `original_edge_ids` is a dictionary: The keys should be edge
+            type and the values should be corresponding original heterogeneous
+            edge ids.
         """
         return None
 
@@ -119,17 +122,17 @@ def exclude_edges(
     ):
         r"""Exclude edges from the sampled subgraph.
 
-        This function can be used with sampled subgraphs, regardless of whether they
-        have compacted row/column nodes or not. If the original subgraph has
-        compacted row or column nodes, the corresponding row or column nodes in the
-        returned subgraph will also be compacted.
+        This function can be used with sampled subgraphs, regardless of
+        whether they have compacted row/column nodes or not. If the original
+        subgraph has compacted row or column nodes, the corresponding row or
+        column nodes in the returned subgraph will also be compacted.
 
         Parameters
         ----------
         self : SampledSubgraph
             The sampled subgraph.
         edges : Union[Tuple[torch.Tensor, torch.Tensor],
-        Dict[str, Tuple[torch.Tensor, torch.Tensor]]]
+                Dict[str, Tuple[torch.Tensor, torch.Tensor]]]
             Edges to exclude. If sampled subgraph is homogeneous, then `edges`
             should be a pair of tensors representing the edges to exclude. If
             sampled subgraph is heterogeneous, then `edges` should be a

diff --git a/script/dgl_dev.yml.template b/script/dgl_dev.yml.template
@@ -30,6 +30,7 @@ dependencies:
     - torchdata>=0.5.0
     - torcheval
     - torchmetrics
+    - torch_geometric
     - tqdm
     - boto3 # AWS SDK for python
     - sphinx==4.2.0

diff --git a/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_fused_csc_sampling_graph.py
@@ -1623,6 +1623,8 @@ def test_sample_neighbors_homo(labor, is_pinned):
     0   1   0   0   1
     1   0   0   0   1
     """
+    if F._default_context_str == "cpu" and is_pinned:
+        pytest.skip("Pinning is not meaningful without a GPU.")
     # Initialize data.
     total_num_edges = 12
     indptr = torch.LongTensor([0, 3, 5, 7, 9, 12])
@@ -1631,12 +1633,9 @@ def test_sample_neighbors_homo(labor, is_pinned):
     assert indptr[-1] == len(indices)
 
     # Construct FusedCSCSamplingGraph.
-    graph = gb.fused_csc_sampling_graph(indptr, indices)
-    if F._default_context_str == "gpu":
-        if is_pinned:
-            graph.pin_memory_()
-        else:
-            graph = graph.to(F.ctx())
+    graph = gb.fused_csc_sampling_graph(indptr, indices).to(
+        "pinned" if is_pinned else F.ctx()
+    )
 
     # Generate subgraph via sample neighbors.
     nodes = torch.LongTensor([1, 3, 4]).to(F.ctx())
@@ -1883,6 +1882,8 @@ def test_sample_neighbors_return_eids_homo(labor, is_pinned):
     0   1   0   0   1
     1   0   0   0   1
     """
+    if F._default_context_str == "cpu" and is_pinned:
+        pytest.skip("Pinning is not meaningful without a GPU.")
     # Initialize data.
     total_num_edges = 12
     indptr = torch.LongTensor([0, 3, 5, 7, 9, 12])
@@ -1896,12 +1897,7 @@ def test_sample_neighbors_return_eids_homo(labor, is_pinned):
     # Construct FusedCSCSamplingGraph.
     graph = gb.fused_csc_sampling_graph(
         indptr, indices, edge_attributes=edge_attributes
-    )
-    if F._default_context_str == "gpu":
-        if is_pinned:
-            graph.pin_memory_()
-        else:
-            graph = graph.to(F.ctx())
+    ).to("pinned" if is_pinned else F.ctx())
 
     # Generate subgraph via sample neighbors.
     nodes = torch.LongTensor([1, 3, 4]).to(F.ctx())