
Commit e51c76e

fix attention
1 parent 4bd27e5 commit e51c76e

3 files changed (+7, −10 lines)


dlinfer/graph/dicp/vendor/AtbGraph/atb_op.py

Lines changed: 1 addition & 1 deletion
@@ -295,9 +295,9 @@ def infer_result(
         mask,
         q_head_num,
         kv_head_num,
+        scale,
         head_size,
         head_size_v,
-        scale,
     ):
         return query.new_empty((query.shape[0], q_head_num, head_size_v))
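
Why the reorder matters: these arguments are passed positionally from conversion.py, so the stub above, the codegen builder below, and the call sites must all agree on parameter order. A minimal sketch of the hazard (illustrative names and values, not the dlinfer API):

    # Minimal sketch of the positional-argument hazard this commit fixes.
    # Names and values are illustrative, not the dlinfer API.

    def infer_result_old(q_head_num, kv_head_num, head_size, head_size_v, scale):
        return {"head_size": head_size, "scale": scale}

    def infer_result_new(q_head_num, kv_head_num, scale, head_size, head_size_v):
        return {"head_size": head_size, "scale": scale}

    # A caller built against the new order: (..., scale, head_size, head_size_v)
    args = (32, 8, 0.0884, 128, 128)

    print(infer_result_old(*args))  # {'head_size': 0.0884, 'scale': 128} -- swapped
    print(infer_result_new(*args))  # {'head_size': 128, 'scale': 0.0884} -- correct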

dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_op.py

Lines changed: 1 addition & 1 deletion
@@ -355,9 +355,9 @@ def SelfAttentionPAEncoder(
         mask,
         q_head_num,
         kv_head_num,
+        scale,
         head_size,
         head_size_v,
-        scale,
     ):
         op = Operation(name, "SelfAttentionOperation")
         param = infer_param.SelfAttentionParam()
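
The same reorder has to land here and in the infer_result stub above, since both consume the same positional tuple built in conversion.py. One way to guard against the two signatures drifting apart again, sketched with stand-in definitions (the check is a suggestion, not something in the repo):

    import inspect

    def tail_param_names(fn, n):
        """Names of the last n parameters of fn."""
        return list(inspect.signature(fn).parameters)[-n:]

    # Illustrative stand-ins for the op stub and the codegen builder.
    def infer_result(query, key, value, mask, q_head_num, kv_head_num,
                     scale, head_size, head_size_v):
        pass

    def SelfAttentionPAEncoder(name, query, key, value, mask, q_head_num,
                               kv_head_num, scale, head_size, head_size_v):
        pass

    # conversion.py passes these positionally, so the tails must agree.
    assert tail_param_names(infer_result, 5) == tail_param_names(SelfAttentionPAEncoder, 5)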

dlinfer/graph/dicp/vendor/AtbGraph/conversion.py

Lines changed: 5 additions & 8 deletions
@@ -478,12 +478,11 @@ def prefill_attention(
             if softmax_scale
             else 1.0 / math.sqrt(query.node.meta["val"].shape[-1])
         )
+        _, num_q_heads, head_size = query.node.meta["val"].shape
+        _, num_kv_heads, head_size_v = value.node.meta["val"].shape
         if query.node.meta["val"].dtype != mask.node.meta["val"].dtype:
             mask = self.get_proxy(atb_op.Cast, (mask, query.node.meta["val"].dtype))
         if is_unpaged_prefill:
-            _, num_q_heads, head_size = query.node.meta["val"].shape
-            _, num_kv_heads, head_size_v = value.node.meta["val"].shape
-
             out = self.get_proxy(
                 atb_op.SelfAttentionPAEncoder,
                 (
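
The hoisted reads assume the unpadded prefill layout [num_tokens, num_heads, head_dim] for query and value; lifting them above the branch lets the paged (else) path in the next hunk reuse head_size and head_size_v instead of re-deriving head counts from q_shape/k_shape. A synthetic example of what the unpacking yields (shapes are made up; requires PyTorch):

    import torch

    # Unpadded layout assumed by the hoisted reads: [num_tokens, num_heads, head_dim]
    query = torch.empty(64, 32, 128)  # 32 query heads
    value = torch.empty(64, 8, 128)   # 8 KV heads (grouped-query attention)

    _, num_q_heads, head_size = query.shape
    _, num_kv_heads, head_size_v = value.shape

    print(num_q_heads, head_size, num_kv_heads, head_size_v)  # 32 128 8 128
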
@@ -494,18 +493,14 @@ def prefill_attention(
                     mask,
                     num_q_heads,
                     num_kv_heads,
+                    scale,
                     head_size,
                     head_size_v,
-                    scale,
                 ),
             )
         else:
-            q_shape = list(query.node.meta["val"].shape)
             k_cache_shape = list(k_cache.node.meta["val"].shape)
-            k_shape = list(key.node.meta["val"].shape)
             v_cache_shape = list(v_cache.node.meta["val"].shape)
-            num_q_heads = q_shape[-2]
-            num_kv_heads = k_shape[-2]
 
             is_kv_require_reshape = len(k_cache_shape) == 3 or len(v_cache_shape) == 3
             if is_kv_require_reshape:
@@ -529,6 +524,8 @@ def prefill_attention(
                     num_q_heads,
                     num_kv_heads,
                     scale,
+                    head_size,
+                    head_size_v,
                 ),
             )
         # graph = self.get_proxy(atb_op.Graph, (out,), {"output": [out]})
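
For reference, the scale now threaded through both call paths defaults (per the ternary at the top of the first hunk) to the standard attention scaling 1 / sqrt(head_dim) when no softmax_scale is given:

    import math

    head_size = 128                     # example head dimension
    scale = 1.0 / math.sqrt(head_size)  # default used when softmax_scale is None
    print(round(scale, 4))              # 0.0884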
