From e198673e40013114172edb8c43b81c916657d09e Mon Sep 17 00:00:00 2001
From: Brian <23239305+b-chu@users.noreply.github.com>
Date: Mon, 28 Aug 2023 14:54:36 -0400
Subject: [PATCH] moe.py: make output bias 1-D (hidden_size) instead of (1, 1, hidden_size)

---
 megablocks/layers/moe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megablocks/layers/moe.py b/megablocks/layers/moe.py
index 1dd57c23..e00d442a 100644
--- a/megablocks/layers/moe.py
+++ b/megablocks/layers/moe.py
@@ -119,7 +119,7 @@ def __init__(self, args : Arguments):
         # Note that the output bias is not parallelized with expert
         # model parallelism.
         self.bias = torch.nn.Parameter(torch.empty(
-            1, 1, args.hidden_size,
+            args.hidden_size,
             device=args.device,
             dtype=common.dtype(args)))
         torch.nn.init.zeros_(self.bias)