This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Commit de9165e

rename 'node_rank' to 'global_rank' in dataset reader 'DistributedInfo' (#4608)
* rename 'node_rank' to 'global_rank'
* Clarify doc comments
* fix line length
1 parent 3d11419 commit de9165e

File tree

2 files changed: +11 -10 lines changed


allennlp/data/dataset_readers/dataset_reader.py

+7 -6
@@ -38,20 +38,21 @@ class WorkerInfo:
 @dataclass
 class DistributedInfo:
     """
-    Contains information about the node rank and world size when the reader is being
+    Contains information about the global process rank and total world size when the reader is being
     used within distributed training.
 
     From a `DatasetReader` this can be accessed with the [`get_distributed_info()`](#get_distributed_info) method.
     """
 
     world_size: int
     """
-    The total number of distributed nodes.
+    The total number of processes in the distributed group.
     """
 
-    node_rank: int
+    global_rank: int
     """
-    The 0-indexed ID of the current node.
+    The 0-indexed ID of the current process within the distributed group.
+    This will be between 0 and `world_size - 1`, inclusive.
     """
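For context, `get_distributed_info()` returns one of these `DistributedInfo` objects (or `None` outside distributed training), and readers that set `manual_distributed_sharding` can use the renamed field to shard for themselves. A minimal sketch, assuming a hypothetical line-per-instance reader (the class and file format are illustrative, not part of this commit):

from allennlp.data.dataset_readers import DatasetReader


class LineReader(DatasetReader):
    def _read(self, file_path: str):
        info = self.get_distributed_info()  # None outside distributed training
        with open(file_path) as f:
            for i, line in enumerate(f):
                # Keep only the lines this process owns: every world_size-th
                # line, offset by this process's global_rank.
                if info is not None and i % info.world_size != info.global_rank:
                    continue
                yield self.text_to_instance(line.strip())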

@@ -313,7 +314,7 @@ def _multi_worker_islice(
         if max_instances is not None:
             # Need to scale down max_instances because otherwise each node would read self.max_instances,
             # but we really want self.max_instances total across all nodes.
-            if self._distributed_info.node_rank < (
+            if self._distributed_info.global_rank < (
                 max_instances % self._distributed_info.world_size
             ):
                 max_instances = max_instances // self._distributed_info.world_size + 1
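The branch above spreads the global `max_instances` cap as evenly as possible across processes: the first `max_instances % world_size` ranks take one extra instance, so the per-rank caps sum back to the total. A standalone illustration of the arithmetic (the numbers are made up):

max_instances, world_size = 10, 3
caps = [
    # Ranks below the remainder take the ceiling of the even share,
    # the rest take the floor.
    max_instances // world_size + (1 if rank < max_instances % world_size else 0)
    for rank in range(world_size)
]
assert caps == [4, 3, 3] and sum(caps) == max_instances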
@@ -323,7 +324,7 @@ def _multi_worker_islice(
         if not self.manual_distributed_sharding:
             sharded_slice = itertools.islice(
                 sharded_slice,
-                self._distributed_info.node_rank,
+                self._distributed_info.global_rank,
                 None,
                 self._distributed_info.world_size,
             )
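The `itertools.islice(iterable, start, stop, step)` call above does round-robin sharding: each process starts at its own `global_rank` and then takes every `world_size`-th instance, so every instance lands on exactly one rank. A self-contained illustration:

import itertools

instances = range(10)
world_size = 3
shards = [
    list(itertools.islice(iter(instances), global_rank, None, world_size))
    for global_rank in range(world_size)
]
# Disjoint shards that together cover all ten instances.
assert shards == [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]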

tests/data/dataset_readers/dataset_reader_test.py

+4 -4
@@ -123,9 +123,9 @@ def test_instance_slicing(
         minimum_expected_result_size //= world_size
         minimum_expected_result_size //= num_workers
         maximum_expected_result_size = minimum_expected_result_size + 1
-        for node_rank in range(world_size):
+        for global_rank in range(world_size):
             monkeypatch.setattr(common_util, "is_distributed", lambda: True)
-            monkeypatch.setattr(dist, "get_rank", lambda: node_rank)
+            monkeypatch.setattr(dist, "get_rank", lambda: global_rank)
             monkeypatch.setattr(dist, "get_world_size", lambda: world_size)
             for worker_id in range(num_workers):
                 reader = reader_class(max_instances=max_instances)
@@ -137,9 +137,9 @@ def test_instance_slicing(
     elif world_size is not None:
         minimum_expected_result_size //= world_size
         maximum_expected_result_size = minimum_expected_result_size + 1
-        for node_rank in range(world_size):
+        for global_rank in range(world_size):
             monkeypatch.setattr(common_util, "is_distributed", lambda: True)
-            monkeypatch.setattr(dist, "get_rank", lambda: node_rank)
+            monkeypatch.setattr(dist, "get_rank", lambda: global_rank)
             monkeypatch.setattr(dist, "get_world_size", lambda: world_size)
             reader = reader_class(max_instances=max_instances)
             result = set(
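The test fakes each rank inside a single process by monkeypatching `torch.distributed`'s rank and world-size queries along with AllenNLP's `is_distributed()` check. Extracted as a helper, the pattern looks roughly like this (`simulate_rank` is a hypothetical name, not in the codebase; `monkeypatch` is pytest's fixture):

import torch.distributed as dist

from allennlp.common import util as common_util


def simulate_rank(monkeypatch, global_rank: int, world_size: int) -> None:
    # Make the reader believe it is process `global_rank` of `world_size`
    # without launching real distributed workers. The lambdas close over the
    # function arguments, so each simulated rank keeps its own values.
    monkeypatch.setattr(common_util, "is_distributed", lambda: True)
    monkeypatch.setattr(dist, "get_rank", lambda: global_rank)
    monkeypatch.setattr(dist, "get_world_size", lambda: world_size)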
