From e198673e40013114172edb8c43b81c916657d09e Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:54:36 -0400 Subject: [PATCH] Update moe.py --- megablocks/layers/moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megablocks/layers/moe.py b/megablocks/layers/moe.py index 1dd57c23..e00d442a 100644 --- a/megablocks/layers/moe.py +++ b/megablocks/layers/moe.py @@ -119,7 +119,7 @@ def __init__(self, args : Arguments): # Note that the output bias is not parallelized with expert # model parallelism. self.bias = torch.nn.Parameter(torch.empty( - 1, 1, args.hidden_size, + args.hidden_size, device=args.device, dtype=common.dtype(args))) torch.nn.init.zeros_(self.bias)