Skip to content

Support simple mode to not use access key #17

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion cmd/agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,27 @@ func main() {
preferDriver string
allocAllDevices bool
devicepluginPreStart bool
localERIDiscovery bool
exposedLocalERIs string
)
flag.StringVar(&preferDriver, "prefer-driver", "", "prefer driver")
flag.BoolVar(&allocAllDevices, "allocate-all-devices", false,
"allocate all erdma devices for resource request, true => alloc all, false => alloc devices based on numa")
flag.BoolVar(&devicepluginPreStart, "deviceplugin-prestart-container", false,
"use device plugin prestart container to config smc-r, enable it if not use webhook to inject initContainers")
flag.BoolVar(&localERIDiscovery, "local-eri-discovery", false,
"Only manager on-node eri resources without using OpenAPI and access key")
flag.StringVar(&exposedLocalERIs, "exposed-local-eris", "",
"allocate specific ERI from existing ERI to pods for each instance")
flag.Parse()

eriAgent, err := agent.NewAgent(preferDriver, allocAllDevices, devicepluginPreStart)
eriAgent, err := agent.NewAgent(
preferDriver,
allocAllDevices,
devicepluginPreStart,
localERIDiscovery,
exposedLocalERIs,
)
if err != nil {
panic(err)
}
Expand Down
1 change: 1 addition & 0 deletions deploy/helm/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ data:
"enableWebhook": {{ .Values.config.enableWebhook }},
"smcInitImage": "{{ .Values.config.smcInitImage }}",
"enableInitContainerInject": {{ .Values.config.enableInitContainerInject }},
"localERIDiscovery": {{ .Values.config.localERIDiscovery }},
"nodeSelector": {{ .Values.nodeSelector | toJson }}
}
8 changes: 7 additions & 1 deletion deploy/helm/templates/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ spec:
spec:
hostPID: true
hostNetwork: true
{{- with .Values.imagePullSecrets }}
{{- with .Values.agent.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
Expand All @@ -44,6 +44,12 @@ spec:
{{ if .Values.agent.allocateAllDevices }}
- --allocate-all-devices
{{ end }}
{{ if .Values.config.localERIDiscovery }}
- --local-eri-discovery
{{ end }}
{{ if .Values.agent.exposedLocalERIs }}
- --exposed-local-eris={{ join "," .Values.agent.exposedLocalERIs }}
{{ end }}
{{ if not .Values.config.enableWebhook }}
- --deviceplugin-prestart-container
{{ end }}
Expand Down
3 changes: 3 additions & 0 deletions deploy/helm/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{{- if not .Values.config.localERIDiscovery }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
Expand Down Expand Up @@ -64,3 +66,4 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
8 changes: 8 additions & 0 deletions deploy/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# controller will not be deployed if localERIDiscovery is set
controller:
replicaCount: 2
image:
Expand All @@ -24,6 +25,12 @@ agent:
tag: "latest"
preferDriver: ""
allocateAllDevices: false
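# Each entry has the format "<instance_id> <eri-0>/<eri-1>/...":
# expose specific ERIs on the node whose instance ID matches: - <instance_id> <eri-0>/<eri-1>/...
# expose specific ERIs on nodes that have no matching entry: - i-* <eri-0>/<eri-1>/...
# expose all ERIs on nodes that have no matching entry: - i-* erdma_*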
exposedLocalERIs:
- i-XXX erdma_0/erdma_1
imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""
Expand Down Expand Up @@ -57,6 +64,7 @@ config:
enableWebhook: false
enableInitContainerInject: true
smcInitImage: ""
localERIDiscovery: false

credentials:
type: ""
Expand Down
47 changes: 42 additions & 5 deletions internal/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ import (
"os"
"os/signal"
"runtime"
"strings"
"syscall"

"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/deviceplugin"
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/drivers"
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/k8s"
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types"
"github.com/samber/lo"
ctrl "sigs.k8s.io/controller-runtime"

networkv1 "github.com/AliyunContainerService/alibabacloud-erdma-controller/api/v1"
)

var (
Expand All @@ -23,6 +27,8 @@ type Agent struct {
driver drivers.ERdmaDriver
allocAllDevices bool
devicepluginPreStart bool
localERIDiscovery bool
exposedLocalERIs []string
}

func stackTriger() {
Expand All @@ -48,25 +54,56 @@ func stackTriger() {
signal.Notify(sigchain, syscall.SIGUSR1)
}

func NewAgent(preferDriver string, allocAllDevice bool, devicepluginPreStart bool) (*Agent, error) {
// NewAgent builds an Agent from the command-line options.
//
// It first creates the Kubernetes client and returns its error unchanged if
// that fails. exposedLocalERIs is the raw comma-separated flag value; it is
// split on "," and stored as-is, so an empty string becomes a slice holding a
// single empty element (the form Run checks for to detect "no rules given").
func NewAgent(preferDriver string, allocAllDevice bool, devicepluginPreStart bool, localERIDiscovery bool, exposedLocalERIs string) (*Agent, error) {
	kubeClient, err := k8s.NewKubernetes()
	if err != nil {
		return nil, err
	}
	agentLog.Info("NewAgent: ", "localERIDiscovery", localERIDiscovery)

	a := new(Agent)
	a.kubernetes = kubeClient
	a.driver = drivers.GetDriver(preferDriver)
	a.allocAllDevices = allocAllDevice
	a.devicepluginPreStart = devicepluginPreStart
	a.localERIDiscovery = localERIDiscovery
	a.exposedLocalERIs = strings.Split(exposedLocalERIs, ",")
	return a, nil
}

func (a *Agent) Run() error {
go stackTriger()
// 1. wait related eri device
eriInfos, err := a.kubernetes.WaitEriInfo()
if err != nil {
return err
var err error
var eriInfos *networkv1.ERdmaDevice
var eri []*types.ERI
if !a.localERIDiscovery {
// 1. wait related eri device
eriInfos, err = a.kubernetes.WaitEriInfo()
if err != nil {
return err
}
} else {
if !(len(a.exposedLocalERIs) == 1 && a.exposedLocalERIs[0] == "") {
a.allocAllDevices = true
agentLog.Info("LocalERIDiscovery: enable expose ERIs, set allocAllDevices to true")
}
eri, err = drivers.SelectERIs(a.exposedLocalERIs)
if err != nil {
return fmt.Errorf("LocalERIDiscovery: select eri failed: %v", err)
}
eriInfos = &networkv1.ERdmaDevice{
Spec: networkv1.ERdmaDeviceSpec{
Devices: lo.Map(eri, func(item *types.ERI, index int) networkv1.DeviceInfo {
return networkv1.DeviceInfo{
InstanceID: item.InstanceID,
MAC: item.MAC,
IsPrimaryENI: item.IsPrimaryENI,
ID: item.ID,
NetworkCardIndex: item.CardIndex,
QueuePair: item.QueuePair,
}
}),
},
}
}
agentLog.Info("eri info", "eriInfo", eriInfos, "driver", a.driver.Name())
// 2. install eri driver
Expand Down
90 changes: 88 additions & 2 deletions internal/drivers/utils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,52 @@ import (
"os/exec"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/types"
"github.com/AliyunContainerService/alibabacloud-erdma-controller/internal/utils"
"github.com/samber/lo"
"github.com/vishvananda/netlink"
)

// exposeRulePattern validates a single exposure rule and captures its parts:
// group 1 is the instance selector ("i-<id>" or the wildcard "i-*") and group 2
// is the "/"-separated device list, where each item is either a device name or
// the wildcard "erdma_*". It is compiled once because checkExpose is called for
// every candidate link.
var exposeRulePattern = regexp.MustCompile(`^(i-(?:\w+|\*))\s+((?:erdma_\*|\w+)(?:/(?:erdma_\*|\w+))*)$`)

// exposedDeviceListed reports whether rdmaDevice is selected by devices, either
// by exact name or through the "erdma_*" wildcard.
func exposedDeviceListed(devices []string, rdmaDevice string) bool {
	for _, dev := range devices {
		if dev == "erdma_*" || dev == rdmaDevice {
			return true
		}
	}
	return false
}

// checkExpose reports whether rdmaDevice may be exposed on the node identified
// by instanceID according to the rules in exposedLocalERIs.
//
// Each rule has the form "<instance> <dev>[/<dev>...]", where <instance> is an
// instance ID or "i-*" and <dev> is a device name or "erdma_*". Rules are
// evaluated in order:
//   - a slice holding only the empty string means no rules were configured and
//     every device is exposed;
//   - rules whose instance equals instanceID decide the result; the "i-*" rule
//     is not consulted once any such rule exists;
//   - otherwise the device list of the last "i-*" rule is used.
//
// An error is returned as soon as a malformed rule is encountered; rules that
// follow an already-decisive match are not validated.
func checkExpose(instanceID string, exposedLocalERIs []string, rdmaDevice string) (bool, error) {
	if len(exposedLocalERIs) == 1 && exposedLocalERIs[0] == "" {
		return true, nil
	}

	var fallbackDevices []string
	matchedInstance := false
	for _, rule := range exposedLocalERIs {
		// Take both fields from the regex captures so that the separator the
		// pattern accepts (any whitespace run) is the one actually used to split.
		parts := exposeRulePattern.FindStringSubmatch(rule)
		if parts == nil {
			return false, fmt.Errorf("invalid format %s", rule)
		}
		id := parts[1]
		devices := strings.Split(parts[2], "/")

		if id == instanceID {
			matchedInstance = true
			if exposedDeviceListed(devices, rdmaDevice) {
				return true, nil
			}
		}
		if id == "i-*" {
			fallbackDevices = devices
		}
	}

	if !matchedInstance {
		driverLog.Info("no matched instanceID found, using unMatchExposeERIs", "instanceID", instanceID)
		if exposedDeviceListed(fallbackDevices, rdmaDevice) {
			return true, nil
		}
	}
	return false, nil
}
func driverExists() bool {
if isContainerOS() {
_, err := containerExec("modinfo erdma")
Expand Down Expand Up @@ -104,14 +142,16 @@ func GetERdmaFromLink(link netlink.Link) (*netlink.RdmaLink, error) {
}
linkHwAddr := link.Attrs().HardwareAddr
// erdma guid first byte is ^= 0x2
linkHwAddr[0] ^= 0x2
new_linkHwAddr := make(net.HardwareAddr, len(linkHwAddr))
copy(new_linkHwAddr, linkHwAddr)
new_linkHwAddr[0] ^= 0x2
for _, rl := range rdmaLinks {
rdmaHwAddr, err := parseERdmaLinkHwAddr(rl.Attrs.NodeGuid)
if err != nil {
return nil, err
}
driverLog.Info("check rdma link", "rdmaLink", rl.Attrs.Name, "rdmaHwAddr", rdmaHwAddr.String(), "linkHwAddr", linkHwAddr.String())
if rdmaHwAddr.String() == linkHwAddr.String() {
if rdmaHwAddr.String() == new_linkHwAddr.String() {
return rl, nil
}
}
Expand Down Expand Up @@ -203,3 +243,49 @@ func GetERDMANumaNode(info *netlink.RdmaLink) (int64, error) {
}
return int64(numa), nil
}

const (
// instanceIDAddr is the metadata URL that SelectERIs queries to obtain this
// node's instance ID.
instanceIDAddr = "http://100.100.100.200/latest/meta-data/instance-id"
)

// SelectERIs discovers the RDMA-backed network devices on this node and returns
// those that exposedLocalERIs allows to be exposed (see checkExpose for the
// rule format).
//
// Only links of type *netlink.Device that have a hardware address are
// considered; a link with no associated RDMA link is logged and skipped. Each
// selected device is returned as a types.ERI carrying the RDMA device name as
// ID, the link's MAC, the instance ID read from the metadata endpoint, and -1
// for CardIndex and QueuePair. The link named "eth0" is flagged as primary.
//
// It returns an error if the links cannot be listed or if a rule in
// exposedLocalERIs is malformed. A failed instance-ID lookup is logged but is
// not fatal: discovery continues with whatever value the lookup returned.
func SelectERIs(exposedLocalERIs []string) ([]*types.ERI, error) {
	instanceID, metaErr := utils.GetStrFromMetadata(instanceIDAddr)
	if metaErr != nil {
		// Best effort: keep going so local discovery still works when the
		// metadata endpoint is unreachable, but make the failure visible.
		driverLog.Info("LocalERIDiscovery: get instance id from metadata failed", "instanceID", instanceID, "error", metaErr)
	}

	links, err := netlink.LinkList()
	if err != nil {
		return nil, fmt.Errorf("list link failed: %w", err)
	}

	var selectEriList []*types.ERI
	for _, link := range links {
		if _, ok := link.(*netlink.Device); !ok {
			continue
		}
		if link.Attrs().HardwareAddr == nil {
			continue
		}

		rdmaLink, lookupErr := GetERdmaFromLink(link)
		if rdmaLink == nil {
			driverLog.Info("LocalERIDiscovery: link is not rdma device, skip", "link_name", link.Attrs().Name, "error", lookupErr)
			continue
		}

		rdmadevice := rdmaLink.Attrs.Name
		isExposed, err := checkExpose(instanceID, exposedLocalERIs, rdmadevice)
		if err != nil {
			return nil, err
		}
		if !isExposed {
			continue
		}

		driverLog.Info("LocalERIDiscovery: expose eri", "rdmadevice", rdmadevice, "link name", link.Attrs().Name)
		eri := &types.ERI{
			ID:           rdmadevice,
			IsPrimaryENI: link.Attrs().Name == "eth0",
			MAC:          link.Attrs().HardwareAddr.String(),
			InstanceID:   instanceID,
			CardIndex:    -1,
			QueuePair:    -1,
		}
		selectEriList = append(selectEriList, eri)
		driverLog.Info("Simple mode SelectERIs: eri", "eri", eri)
	}

	return selectEriList, nil
}