Skip to content

Commit 52b3991

Browse files
committed
set all2all dtype using amp precision
1 parent bc47c6d commit 52b3991

File tree

3 files changed

+16
-0
lines changed

3 files changed

+16
-0
lines changed

megablocks/layers/common.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,15 @@ def dtype(args : Arguments):
88
elif args.bf16:
99
dtype = torch.bfloat16
1010
return dtype
11+
12+
13+
def cast_if_autocast_enabled(tensor):
    """Return ``tensor`` cast to the dtype selected by the active autocast context.

    If autocast is not enabled, the tensor is returned unchanged (same object,
    no copy). Only CUDA and CPU tensors are supported; any other device raises.

    NOTE(review): ``torch.is_autocast_enabled()`` reflects the CUDA autocast
    state on older torch releases — a CPU tensor under ``torch.autocast('cpu')``
    may therefore pass through uncast; confirm against the torch version in use.

    Args:
        tensor: a ``torch.Tensor`` on a CUDA or CPU device.

    Returns:
        The tensor converted to the autocast dtype for its device, or the
        original tensor when autocast is off.

    Raises:
        NotImplementedError: for devices other than ``cuda`` or ``cpu``.
    """
    if not torch.is_autocast_enabled():
        return tensor

    device_type = tensor.device.type
    if device_type == 'cuda':
        autocast_dtype = torch.get_autocast_gpu_dtype()
    elif device_type == 'cpu':
        autocast_dtype = torch.get_autocast_cpu_dtype()
    else:
        raise NotImplementedError()
    return tensor.to(dtype=autocast_dtype)

megablocks/layers/dmoe.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def forward_once(self, x, top_expert):
136136

137137
# Perform the expert computation.
138138
x = self.mlp(x, topo)
139+
x = common.cast_if_autocast_enabled(x)
139140

140141
# Un-route the data for the MoE output.
141142
x = ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)
@@ -162,6 +163,7 @@ def permute_and_compute(
162163

163164
# Perform the expert computation.
164165
x = self.mlp(x, topo)
166+
x = common.cast_if_autocast_enabled(x)
165167

166168
# Un-route the data for the MoE output.
167169
return ops.padded_scatter(x, indices, bin_ids, bins, padded_bins)

megablocks/layers/moe.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ def permute_and_compute(
185185
# Perform the expert computation. Note that we don't
186186
# use biases for these linear operations.
187187
x = self.mlp(x)
188+
x = common.cast_if_autocast_enabled(x)
188189

189190
# Un-route the data for the MoE output.
190191
return ops.binned_scatter(x, indices, bins)
@@ -344,6 +345,7 @@ def parallel_forward_once(self, x, top_expert):
344345
return x, tokens_per_expert.flatten()
345346

346347
def forward(self, x):
348+
x = common.cast_if_autocast_enabled(x)
347349
sl, bs, hs = x.size()
348350

349351
# Compute the top-1 expert routing.

0 commit comments

Comments
 (0)