[ascend] Optimize moe #203


Merged · 9 commits · Apr 10, 2025

@@ -87,7 +87,7 @@ def DeepseekV2Attention_forward(
key_states,
value_states,
past_key_value[0],
past_key_value[0][..., :nope_size],
past_key_value[1],
attn_metadata,
k_scales_zeros=None if len(past_key_value) == 2 else past_key_value[2],
v_scales_zeros=None if len(past_key_value) == 2 else past_key_value[3],
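Context for the hunk above: with MLA-style caching, the value states can either be materialized as a separate tensor (`past_key_value[1]`) or read back as the leading `nope_size` channels of the key cache; the change swaps one form for the other. A minimal sketch of the slice form, with made-up shapes:

```python
import torch

# Made-up shapes for illustration: a cache page whose last dim holds the
# nope part followed by the rope part.
block, heads, nope_size, rope_size = 16, 1, 512, 64
key_cache = torch.randn(block, heads, nope_size + rope_size)

# The value view is a zero-copy slice of the leading nope_size channels.
value_view = key_cache[..., :nope_size]
assert value_view.shape == (block, heads, nope_size)
assert value_view.data_ptr() == key_cache.data_ptr()  # shares storage
```
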
17 changes: 14 additions & 3 deletions dlinfer/graph/dicp/vendor/AtbGraph/atb_op.py
@@ -633,15 +633,26 @@ def infer_result(self, x, num_experts):
)


class MoeInitRouting(Operator):
class AclNnMoeGatingTopkSoftmax(Operator):
def __init__(self):
super().__init__("AclNnMoeGatingTopkSoftmax")

def infer_result(self, x, topk):
return (
x.new_empty((*x.shape[:-1], topk)),
x.new_empty((*x.shape[:-1], topk), dtype=torch.int32),
)
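
`infer_result` only declares output shapes and dtypes for the new gating op. For intuition, a reference of the computation the kernel is named for (softmax over the router logits, then top-k) could look like the sketch below; this is assumed semantics, not the aclnn kernel itself, and tie-breaking plus the `renorm` mode may differ:

```python
import torch

def moe_gating_topk_softmax_reference(x: torch.Tensor, topk: int):
    """Reference semantics only: softmax over router logits, then top-k."""
    probs = torch.softmax(x, dim=-1)
    weights, ids = torch.topk(probs, topk, dim=-1)
    return weights, ids.to(torch.int32)

logits = torch.randn(8, 64)           # (num_tokens, num_experts)
w, ids = moe_gating_topk_softmax_reference(logits, topk=6)
print(w.shape, ids.shape, ids.dtype)  # torch.Size([8, 6]) torch.Size([8, 6]) torch.int32
```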


class AclNnMoeInitRouting(Operator):
def __init__(self):
super().__init__("AclNnMoeInitRouting")

def infer_result(self, x, row_ids, topk_ids, active_num, num_experts):
def infer_result(self, x, topk_ids, num_experts):
return (
x.repeat_interleave(topk_ids.size(1), dim=0),
row_ids.flatten(),
topk_ids.flatten(),
topk_ids.new_empty((num_experts,)),
)
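
The new `infer_result` mirrors the V2 routing outputs: tokens expanded `topk` times, a flattened row-index mapping, and a per-expert token count of shape `(num_experts,)`. A hedged reference implementation of those semantics (the kernel's output ordering and row-index convention are assumptions):

```python
import torch

def moe_init_routing_reference(x, topk_ids, num_experts):
    # Sketch of the routing semantics implied by infer_result above; the
    # aclnn kernel's ordering and row-index convention are assumptions.
    num_tokens, topk = topk_ids.shape
    flat_experts = topk_ids.flatten()                 # (num_tokens * topk,)
    order = torch.argsort(flat_experts, stable=True)  # group copies by expert id
    expanded_x = x.repeat_interleave(topk, dim=0)[order]
    expanded_row_idx = order.to(torch.int32)          # expanded row -> source slot
    expert_token_count = torch.bincount(flat_experts, minlength=num_experts)
    return expanded_x, expanded_row_idx, expert_token_count

x = torch.randn(4, 16)                  # (num_tokens, hidden)
topk_ids = torch.randint(0, 8, (4, 2))  # (num_tokens, topk)
ex, idx, cnt = moe_init_routing_reference(x, topk_ids, num_experts=8)
print(ex.shape, idx.shape, cnt.shape)   # (8, 16) (8,) (8,)
```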


@@ -652,10 +652,17 @@ class PrepareMoeParam:
numExperts: int = 0


@dataclass
class AclNnMoeGatingTopkSoftmaxParam:
name: str = ""
topk: int = 0
renorm: int = 0
outputSoftmaxResultFlag: bool = False


@dataclass
class AclNnMoeInitRoutingParam:
name: str = ""
activeNum: int = 10240
numExperts: int = 0

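These dataclasses are serialized into the JSON that the C++ `*OperationCreate` factories parse, so the field names must match the `paramJson.contains(...)` keys on that side. A hypothetical instantiation:

```python
# Hypothetical: codegen fills the dataclass and serializes it to the JSON
# parsed by AclNnMoeGatingTopkSoftmaxOperationCreate; defaults for renorm
# and outputSoftmaxResultFlag are kept here.
param = AclNnMoeGatingTopkSoftmaxParam(name="moe_gate_0", topk=6)
```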

16 changes: 12 additions & 4 deletions dlinfer/graph/dicp/vendor/AtbGraph/codegen/atb_op.py
@@ -1014,14 +1014,22 @@ def PrepareMoe(name, x, num_experts):
op.set_output([f"{name}__0", f"{name}__1", f"{name}__2", f"{name}__3"])
return op

def AclNnMoeInitRouting(name, x, row_ids, topk_ids, active_num, num_experts):
def AclNnMoeGatingTopkSoftmax(name, x, topk):
op = Operation(name, "AclNnMoeGatingTopkSoftmaxOperation")
param = infer_param.AclNnMoeGatingTopkSoftmaxParam()
param.name = name
param.topk = topk
op.set_input([x])
op.set_param(param)
op.set_output([f"{name}__0", f"{name}__1"])
return op

def AclNnMoeInitRouting(name, x, topk_ids, num_experts):
op = Operation(name, "AclNnMoeInitRoutingOperation")
param = infer_param.AclNnMoeInitRoutingParam()
param.name = name
param.activeNum = active_num
param.numExperts = num_experts

op.set_input([x, row_ids, topk_ids])
op.set_input([x, topk_ids])
op.set_param(param)
op.set_output([f"{name}__0", f"{name}__1", f"{name}__2"])
return op
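
Together, the two builders form the MoE dispatch front-end of the lowered ATB graph: the gating op emits per-token top-k weights and int32 expert ids, and the ids feed the routing op. A sketch of the wiring, with hypothetical node and tensor names:

```python
# Hypothetical wiring; node and tensor names are illustrative only.
gate = AclNnMoeGatingTopkSoftmax("moe_gate_0", "router_logits", topk=6)
#   moe_gate_0__0: top-k routing weights, moe_gate_0__1: int32 expert ids
route = AclNnMoeInitRouting("moe_route_0", "hidden_states",
                            "moe_gate_0__1", num_experts=64)
#   moe_route_0__0: tokens expanded and grouped by expert
#   moe_route_0__1: row-index mapping, moe_route_0__2: per-expert token counts
```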
@@ -0,0 +1,97 @@
#include "moe_gating_topk_softmax.h"

#include <cstddef>

#include "aclnnop/aclnn_moe_gating_top_k_softmax_v2.h"
#include "third_party/acl/inc/acl/acl_base.h"
#include "utils/log.h"

namespace dicp {

const int NUM1 = 1;
const int NUM2 = 2;
const int NUM3 = 3;

AclNnMoeGatingTopkSoftmaxOperation::AclNnMoeGatingTopkSoftmaxOperation(const std::string& name, int64_t topk, int64_t renorm, bool outputSoftmaxResultFlag)
: AclNnOperation(name), topk_(topk), renorm_(renorm), outputSoftmaxResultFlag_(outputSoftmaxResultFlag) {}

AclNnMoeGatingTopkSoftmaxOperation::~AclNnMoeGatingTopkSoftmaxOperation() {}

atb::Status AclNnMoeGatingTopkSoftmaxOperation::InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs,
atb::SVector<atb::TensorDesc>& outTensorDescs) const {
DICP_LOG(INFO) << opName_ << " infer shape start";

outTensorDescs.at(0).format = inTensorDescs.at(0).format;
outTensorDescs.at(0).shape.dimNum = inTensorDescs.at(0).shape.dimNum;
outTensorDescs.at(0).dtype = inTensorDescs.at(0).dtype;
for (size_t i = 0; i < outTensorDescs.at(0).shape.dimNum; ++i) {
outTensorDescs.at(0).shape.dims[i] = i == outTensorDescs.at(0).shape.dimNum - 1 ? topk_ : inTensorDescs.at(0).shape.dims[i];
}

outTensorDescs.at(1).format = outTensorDescs.at(0).format;
outTensorDescs.at(1).shape.dimNum = outTensorDescs.at(0).shape.dimNum;
outTensorDescs.at(1).dtype = aclDataType::ACL_INT32;
for (size_t i = 0; i < outTensorDescs.at(1).shape.dimNum; ++i) {
outTensorDescs.at(1).shape.dims[i] = outTensorDescs.at(0).shape.dims[i];
}

DICP_LOG(INFO) << opName_ << " infer shape end";
return 0;
}

uint32_t AclNnMoeGatingTopkSoftmaxOperation::GetInputNum() const { return NUM1; }

uint32_t AclNnMoeGatingTopkSoftmaxOperation::GetOutputNum() const { return NUM2; }

int AclNnMoeGatingTopkSoftmaxOperation::SetAclNnWorkspaceExecutor(uint64_t& workspaceSize) {
DICP_LOG(INFO) << opName_ << " aclnnMoeGatingTopKSoftmaxV2GetWorkspaceSize start";

int ret = aclnnMoeGatingTopKSoftmaxV2GetWorkspaceSize(aclInTensors_.at(0).tensor,
nullptr,
topk_,
renorm_,
outputSoftmaxResultFlag_,
aclOutTensors_.at(0).tensor,
aclOutTensors_.at(1).tensor,
nullptr,
&workspaceSize,
&aclExecutor_);

DICP_LOG(INFO) << opName_ << " aclnnMoeGatingTopKSoftmaxV2GetWorkspaceSize end, ret:" << ret << ", workspaceSize:" << workspaceSize
<< ", aclExecutor:" << aclExecutor_;

return ret;
}

int AclNnMoeGatingTopkSoftmaxOperation::CallAclExecute(uint8_t* workspace, uint64_t workspaceSize, aclOpExecutor* aclExecutor, aclrtStream stream) {
DICP_LOG(INFO) << opName_ << " aclnnMoeGatingTopKSoftmaxV2 start";
int ret = aclnnMoeGatingTopKSoftmaxV2(workspace, workspaceSize, aclExecutor, stream);
DICP_LOG(INFO) << opName_ << " aclnnMoeGatingTopKSoftmaxV2 end, ret:" << ret;
return ret;
}

atb::Operation* AclNnMoeGatingTopkSoftmaxOperationCreate(const nlohmann::json& paramJson) {
std::string opName;
int64_t topk = 0, renorm = 0;
bool outputSoftmaxResultFlag = false;
if (paramJson.contains("name")) {
opName = paramJson["name"].get<std::string>();
}
if (paramJson.contains("topk")) {
topk = paramJson["topk"].get<int64_t>();
}
if (paramJson.contains("renorm")) {
renorm = paramJson["renorm"].get<int64_t>();
}
if (paramJson.contains("outputSoftmaxResultFlag")) {
outputSoftmaxResultFlag = paramJson["outputSoftmaxResultFlag"].get<bool>();
}
DICP_LOG(INFO) << "AclNnMoeGatingTopkSoftmaxOperation: name: " << opName << " topk:" << topk << " renorm:" << renorm
<< " outputSoftmaxResultFlag:" << outputSoftmaxResultFlag;
atb::Operation* op = new AclNnMoeGatingTopkSoftmaxOperation(opName, topk, renorm, outputSoftmaxResultFlag);
return op;
}

REGISTER_OPERATION(AclNnMoeGatingTopkSoftmaxOperation, AclNnMoeGatingTopkSoftmaxOperationCreate);

} // namespace dicp
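
`REGISTER_OPERATION` binds the op-type string the Python codegen emits (`AclNnMoeGatingTopkSoftmaxOperation`) to the factory above, which reads the same keys the `AclNnMoeGatingTopkSoftmaxParam` dataclass defines. A hypothetical payload it would receive:

```python
# Hypothetical JSON payload reaching AclNnMoeGatingTopkSoftmaxOperationCreate;
# keys mirror the AclNnMoeGatingTopkSoftmaxParam dataclass.
param_json = {
    "name": "moe_gate_0",
    "topk": 6,
    "renorm": 0,
    "outputSoftmaxResultFlag": False,
}
```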
@@ -0,0 +1,23 @@
#pragma once

#include "ops/aclnn_ops/acl_nn_operation.h"

namespace dicp {

class AclNnMoeGatingTopkSoftmaxOperation : public AclNnOperation {
public:
explicit AclNnMoeGatingTopkSoftmaxOperation(const std::string& name, int64_t topk, int64_t renorm, bool outputSoftmaxResultFlag);
~AclNnMoeGatingTopkSoftmaxOperation() override;
atb::Status InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs, atb::SVector<atb::TensorDesc>& outTensorDescs) const override;
uint32_t GetInputNum() const override;
uint32_t GetOutputNum() const override;

private:
int64_t topk_;
int64_t renorm_;
bool outputSoftmaxResultFlag_;
int SetAclNnWorkspaceExecutor(uint64_t& workspaceSize) override;
int CallAclExecute(uint8_t* workspace, uint64_t workspaceSize, aclOpExecutor* aclExecutor, aclrtStream stream) override;
};

} // namespace dicp
@@ -1,23 +1,23 @@
#include "moe_init_routing_operation.h"

#include "aclnnop/aclnn_moe_init_routing.h"
// #include "aclnnop/aclnn_moe_init_routing_v2.h"
#include "aclnnop/aclnn_moe_init_routing_v2.h"
#include "utils/log.h"

namespace dicp {

const int NUM1 = 1;
const int NUM2 = 2;
const int NUM3 = 3;

AclNnMoeInitRoutingOperation::AclNnMoeInitRoutingOperation(const std::string& name, int64_t activeNum, int64_t numExperts)
: AclNnOperation(name), activeNum_(activeNum), numExperts_(numExperts) {}
AclNnMoeInitRoutingOperation::AclNnMoeInitRoutingOperation(const std::string& name, int64_t numExperts) : AclNnOperation(name), numExperts_(numExperts) {}

AclNnMoeInitRoutingOperation::~AclNnMoeInitRoutingOperation() {}

atb::Status AclNnMoeInitRoutingOperation::InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs, atb::SVector<atb::TensorDesc>& outTensorDescs) const {
DICP_LOG(INFO) << opName_ << " infer shape start";
auto seqLength = inTensorDescs.at(2).shape.dims[0];
auto topk = inTensorDescs.at(2).shape.dims[1];
auto seqLength = inTensorDescs.at(0).shape.dims[0];
auto topk = inTensorDescs.at(1).shape.dims[1];
activeNum_ = seqLength * topk;

outTensorDescs.at(0).format = inTensorDescs.at(0).format;
outTensorDescs.at(0).shape.dimNum = inTensorDescs.at(0).shape.dimNum;
@@ -30,60 +30,61 @@ atb::Status AclNnMoeInitRoutingOperation::InferShape(const atb::SVector<atb::Ten
outTensorDescs.at(1).dtype = inTensorDescs.at(1).dtype;
outTensorDescs.at(1).shape.dims[0] = seqLength * topk;

outTensorDescs.at(2).format = inTensorDescs.at(2).format;
outTensorDescs.at(2).format = inTensorDescs.at(1).format;
outTensorDescs.at(2).shape.dimNum = NUM1;
outTensorDescs.at(2).dtype = inTensorDescs.at(2).dtype;
outTensorDescs.at(2).shape.dims[0] = seqLength * topk;
outTensorDescs.at(2).dtype = inTensorDescs.at(1).dtype;
outTensorDescs.at(2).shape.dims[0] = numExperts_;

DICP_LOG(INFO) << opName_ << " infer shape end";
return 0;
}

uint32_t AclNnMoeInitRoutingOperation::GetInputNum() const { return NUM3; }
uint32_t AclNnMoeInitRoutingOperation::GetInputNum() const { return NUM2; }

uint32_t AclNnMoeInitRoutingOperation::GetOutputNum() const { return NUM3; }

int AclNnMoeInitRoutingOperation::SetAclNnWorkspaceExecutor(uint64_t& workspaceSize) {
DICP_LOG(INFO) << opName_ << " aclnnMoeInitRoutingGetWorkspaceSize start";

int ret = aclnnMoeInitRoutingGetWorkspaceSize(aclInTensors_.at(0).tensor,
aclInTensors_.at(1).tensor,
aclInTensors_.at(2).tensor,
activeNum_,
aclOutTensors_.at(0).tensor,
aclOutTensors_.at(1).tensor,
aclOutTensors_.at(2).tensor,
&workspaceSize,
&aclExecutor_);

DICP_LOG(INFO) << opName_ << " aclnnMoeInitRoutingGetWorkspaceSize end, ret:" << ret << ", workspaceSize:" << workspaceSize
DICP_LOG(INFO) << opName_ << " aclnnMoeInitRoutingV2GetWorkspaceSize start";

int ret = aclnnMoeInitRoutingV2GetWorkspaceSize(aclInTensors_.at(0).tensor,
aclInTensors_.at(1).tensor,
activeNum_,
0,
numExperts_,
0,
1,
false,
aclOutTensors_.at(0).tensor,
aclOutTensors_.at(1).tensor,
aclOutTensors_.at(2).tensor,
nullptr,
&workspaceSize,
&aclExecutor_);

DICP_LOG(INFO) << opName_ << " aclnnMoeInitRoutingV2GetWorkspaceSize end, ret:" << ret << ", workspaceSize:" << workspaceSize
<< ", aclExecutor:" << aclExecutor_;

return ret;
}

int AclNnMoeInitRoutingOperation::CallAclExecute(uint8_t* workspace, uint64_t workspaceSize, aclOpExecutor* aclExecutor, aclrtStream stream) {
DICP_LOG(INFO) << opName_ << " aclnnMoeInitRouting start";
int ret = aclnnMoeInitRouting(workspace, workspaceSize, aclExecutor, stream);
DICP_LOG(INFO) << opName_ << " aclnnMoeInitRouting end, ret:" << ret;
DICP_LOG(INFO) << opName_ << " aclnnMoeInitRoutingV2 start";
int ret = aclnnMoeInitRoutingV2(workspace, workspaceSize, aclExecutor, stream);
DICP_LOG(INFO) << opName_ << " aclnnMoeInitRoutingV2 end, ret:" << ret;
return ret;
}

atb::Operation* AclNnMoeInitRoutingOperationCreate(const nlohmann::json& paramJson) {
std::string opName;
int64_t activeNum;
int64_t numExperts = 0;
if (paramJson.contains("name")) {
opName = paramJson["name"].get<std::string>();
}
if (paramJson.contains("activeNum")) {
activeNum = paramJson["activeNum"].get<int64_t>();
}
if (paramJson.contains("numExperts")) {
numExperts = paramJson["numExperts"].get<int64_t>();
}
DICP_LOG(INFO) << "AclNnMoeInitRoutingOperation: name: " << opName << " activeNum:" << activeNum << " numExperts:" << numExperts;
atb::Operation* op = new AclNnMoeInitRoutingOperation(opName, activeNum, numExperts);
DICP_LOG(INFO) << "AclNnMoeInitRoutingOperation: name: " << opName << " numExperts:" << numExperts;
atb::Operation* op = new AclNnMoeInitRoutingOperation(opName, numExperts);
return op;
}

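The V2 workspace call passes the routing configuration positionally. Below is a hedged reading of the call site; the parameter names are my assumption about the aclnn V2 signature, not something this PR states:

```python
# Assumed meaning of the positional arguments in
# aclnnMoeInitRoutingV2GetWorkspaceSize, in call-site order:
moe_init_routing_v2_args = {
    "x": "aclInTensors_[0]: token features",
    "expertIdx": "aclInTensors_[1]: top-k expert ids",
    "activeNum": "seqLength * topk, computed in InferShape",
    "expertCapacity": 0,                 # 0: no per-expert capacity limit (assumed)
    "expertNum": "numExperts_",
    "dropPadMode": 0,                    # dropless layout (assumed)
    "expertTokensCountOrCumsumFlag": 1,  # 1: emit per-expert counts (assumed)
    "expertTokensBeforeCapacityFlag": False,
}
```
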
Expand Up @@ -6,14 +6,14 @@ namespace dicp {

class AclNnMoeInitRoutingOperation : public AclNnOperation {
public:
explicit AclNnMoeInitRoutingOperation(const std::string& name, int64_t activeNum, int64_t numExperts);
explicit AclNnMoeInitRoutingOperation(const std::string& name, int64_t numExperts);
~AclNnMoeInitRoutingOperation() override;
atb::Status InferShape(const atb::SVector<atb::TensorDesc>& inTensorDescs, atb::SVector<atb::TensorDesc>& outTensorDescs) const override;
uint32_t GetInputNum() const override;
uint32_t GetOutputNum() const override;

private:
int64_t activeNum_;
mutable int64_t activeNum_;
int64_t numExperts_;
int SetAclNnWorkspaceExecutor(uint64_t& workspaceSize) override;
int CallAclExecute(uint8_t* workspace, uint64_t workspaceSize, aclOpExecutor* aclExecutor, aclrtStream stream) override;
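A note on the `mutable` qualifier above: `InferShape` is a `const` member, but after this change it derives `activeNum_` from the runtime sequence length and top-k (`activeNum_ = seqLength * topk` in the .cpp) and caches it for the later `aclnnMoeInitRoutingV2GetWorkspaceSize` call; `mutable` is what makes that assignment legal inside a `const` method.
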
Expand Up @@ -2,6 +2,7 @@

#include "aclnnop/aclnn_moe_token_permute.h"
#include "utils/log.h"
#include "utils/tensor_utils.h"

namespace dicp {

@@ -139,8 +139,7 @@ template <aclDataType T>
void copyAndPrint(const atb::Tensor tensor, int64_t tensorSize) {
using vectorT = typename aclDataTypeMap<T>::type;
std::vector<vectorT> resultData(tensorSize, 0);
auto ret =
aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), tensor.deviceData, tensorSize * sizeof(float16_t), ACL_MEMCPY_DEVICE_TO_HOST);
auto ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(vectorT), tensor.deviceData, tensorSize * sizeof(vectorT), ACL_MEMCPY_DEVICE_TO_HOST);
for (int64_t i = 0; i < tensorSize; ++i) {
DICP_LOG(INFO) << "data[" << i << "]: " << resultData[i];
}
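The fix above sizes both sides of the copy with `sizeof(vectorT)` instead of hard-coding `sizeof(float16_t)` for the source, so the byte count always matches the tensor's element type. A small illustration of the old failure mode:

```python
import torch

# With a float32 tensor, sizing the copy by a 2-byte element type
# (the old sizeof(float16_t)) moves only half of the payload.
t = torch.arange(8, dtype=torch.float32)
wrong_nbytes = t.numel() * 2                 # old: sizeof(float16_t)
right_nbytes = t.numel() * t.element_size()  # new: sizeof(vectorT)
print(wrong_nbytes, right_nbytes)            # 16 32
```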