@@ -4,6 +4,7 @@
 import torch
 import torch.distributed as dist
 
+from vllm import _custom_ops as custom_ops
 from flash_attn import flash_attn_varlen_func
 from vllm.attention.ops.prefix_prefill import context_attention_fwd
 
@@ -59,7 +60,7 @@ def add_rms_norm(
     weight: Tensor,
     epsilon: float,
 ) -> Tuple[Tensor, Tensor]:
-    vllm._custom_ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon)
+    custom_ops.fused_add_rms_norm(hidden_states, residual, weight, epsilon)
     return hidden_states, residual
 
 
@@ -188,7 +189,7 @@ def fill_kv_cache(
     quant_bits: int,
 ) -> Tuple[Tensor, Tensor]:
     kv_indices = kv_indices.squeeze(-1)
-    vllm._custom_ops.reshape_and_cache_new(
+    custom_ops.reshape_and_cache_new(
         key, value, key_cache, value_cache, kv_indices, "auto", 1.0, 1.0
     )
     return key_cache, value_cache
@@ -220,7 +221,7 @@ def paged_decode_attention(
     num_kv_heads = value_cache.size(1)
     block_size = value_cache.size(2)
     output = torch.empty_like(query)
-    vllm._custom_ops.paged_attention_v1(
+    custom_ops.paged_attention_v1(
         output,
         query,
         key_cache,
@@ -304,7 +305,7 @@ def rms_norm(
     epsilon: float,
 ) -> Tensor:
     output = torch.empty_like(hidden_states)
-    vllm._custom_ops.rms_norm(output, hidden_states, weight, epsilon)
+    custom_ops.rms_norm(output, hidden_states, weight, epsilon)
     return output
 
 
@@ -322,7 +323,7 @@ def moe_gating_topk_softmax(
 
     token_expert_indicies = torch.empty_like(topk_ids)
 
-    vllm._custom_ops.topk_softmax(
+    custom_ops.topk_softmax(
         topk_weights,
         topk_ids,
         token_expert_indicies,
@@ -344,7 +345,7 @@ def silu_and_mul(x: Tensor, dim: int = -1) -> Tensor:
     d = x.shape[-1] // 2
     output_shape = x.shape[:-1] + (d,)
     out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-    vllm._custom_ops.silu_and_mul(out, x)
+    custom_ops.silu_and_mul(out, x)
     return out
 
 
@@ -398,7 +399,7 @@ def weight_quant_matmul(
     group_size: Optional[int] = 0,
 ):
     offset = None if (offset is None or offset.numel() == 0) else offset
-    output = vllm._custom_ops.awq_gemm(x, qweight, scale, offset, group_size)
+    output = custom_ops.awq_gemm(x, qweight, scale, offset, group_size)
     if bias is not None:
         output += bias
     return output
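
The hunks above mechanically replace fully qualified vllm._custom_ops.* call sites with the module alias introduced in the import hunk. A minimal sketch of the resulting call pattern, using the rms_norm signature visible in the diff; the CUDA device, tensor shapes, and epsilon value below are illustrative assumptions, not taken from this commit:

# Sketch only: assumes a vLLM build that ships the _custom_ops extension.
import torch
from vllm import _custom_ops as custom_ops

hidden_states = torch.randn(8, 4096, dtype=torch.float16, device="cuda")
weight = torch.ones(4096, dtype=torch.float16, device="cuda")

# As in the diff, rms_norm writes into a preallocated output tensor.
output = torch.empty_like(hidden_states)
custom_ops.rms_norm(output, hidden_states, weight, 1e-6)

Binding the module once under a short alias keeps every call site compact, and a later relocation of _custom_ops inside vLLM would only touch the single import line rather than seven call sites.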