Skip to content

Commit 2d626c0

Browse files
authored
fix: check and repair pods. (#99)
1 parent 6aeeb84 commit 2d626c0

File tree

10 files changed

+67
-44
lines changed

10 files changed

+67
-44
lines changed

Makefile

+3-3
Original file line number | Diff line number | Diff line change
@@ -35,9 +35,9 @@ docker-build-xnet-ubuntu-22.04:
3535
docker-build-xnet-ubuntu-24.04:
3636
docker buildx build --builder fsm --platform=$(DOCKER_BUILDX_PLATFORM) -o $(DOCKER_BUILDX_OUTPUT) -t $(CTR_REGISTRY)/xnet:ubuntu-24.04-$(CTR_TAG) -f dockerfiles/Dockerfile.ubuntu.24.04 --build-arg LDFLAGS=$(LDFLAGS) .
3737

38-
.PHONY: docker-build-xnet-openeuler-22.03
39-
docker-build-bclinux-xnet-openeuler-22.03:
40-
docker-build-bclinux-xnet-openeuler-22.03:
38+
.PHONY: docker-build-xnet-bclinux-openeuler-22.03
39+
docker-build-xnet-bclinux-openeuler-22.03:
40+
docker-build-xnet-bclinux-openeuler-22.03:
4141
docker buildx build --builder fsm --platform=$(DOCKER_BUILDX_PLATFORM) -o $(DOCKER_BUILDX_OUTPUT) -t $(CTR_REGISTRY)/xnet:openeuler-22.03-$(CTR_TAG) -f dockerfiles/Dockerfile.openeuler.22.03 --build-arg LDFLAGS=$(LDFLAGS) .
4242

4343
TARGETS = xnet xnet-ubuntu-20.04 xnet-ubuntu-22.04 xnet-ubuntu-24.04

cmd/xctr/main.go

+15-1
Original file line number | Diff line number | Diff line change
@@ -132,7 +132,21 @@ func parseFlags() error {
132132
}
133133
if len(nodePathSysRun) > 0 {
134134
volume.SysRun.HostPath = nodePathSysRun
135-
volume.Netns.HostPath = path.Join(volume.SysRun.HostPath, `netns`)
135+
136+
netnsDir := path.Join(volume.SysRun.HostPath, `docker`, `netns`)
137+
if _, err := os.ReadDir(netnsDir); err == nil {
138+
volume.Netns = append(volume.Netns, netnsDir)
139+
}
140+
141+
netnsDir = path.Join(volume.SysRun.HostPath, `netns`)
142+
if _, err := os.ReadDir(netnsDir); err == nil {
143+
volume.Netns = append(volume.Netns, netnsDir)
144+
}
145+
146+
netnsDir = volume.SysProc.MountPath
147+
if _, err := os.ReadDir(netnsDir); err == nil {
148+
volume.Netns = append(volume.Netns, netnsDir)
149+
}
136150
}
137151

138152
return nil

pkg/k8s/informers/informers.go

+4-5
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@ package informers
33
import (
44
"errors"
55
"os"
6-
"strings"
76
"testing"
87

98
"github.com/rs/zerolog/log"
@@ -53,11 +52,11 @@ func WithKubeClient(kubeClient kubernetes.Interface) InformerCollectionOption {
5352
nsInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, DefaultKubeEventResyncInterval, monitorNamespaceOption)
5453
ic.informers[InformerKeyNamespace] = nsInformerFactory.Core().V1().Namespaces().Informer()
5554

56-
nodeName, err := os.Hostname()
57-
if err != nil {
58-
log.Fatal().Msg(err.Error())
55+
nodeName := os.Getenv("NODE_NAME")
56+
if len(nodeName) == 0 {
57+
log.Fatal().Msg("missing NODE_NAME env")
5958
}
60-
nodeSelector := fields.OneTermEqualSelector("spec.nodeName", strings.ToLower(nodeName)).String()
59+
nodeSelector := fields.OneTermEqualSelector("spec.nodeName", nodeName).String()
6160

6261
podNodeOption := informers.WithTweakListOptions(func(opt *metav1.ListOptions) {
6362
opt.FieldSelector = nodeSelector

pkg/xnet/bpf/cli/wait.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -56,8 +56,8 @@ func (a *waitCmd) run() error {
5656
break
5757
}
5858
}
59-
time.Sleep(time.Second * 2)
6059
}
60+
time.Sleep(time.Second * 2)
6161
}
6262
return nil
6363
}

pkg/xnet/cni/controller/hwaddr.go

+1-2
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,7 @@ import (
1111

1212
func (s *server) findHwAddrByPodIP(podIP string) (net.HardwareAddr, bool) {
1313
var hwAddr net.HardwareAddr
14-
netnsDirs := []string{volume.Netns.MountPath, volume.SysProc.MountPath}
15-
for _, netnsDir := range netnsDirs {
14+
for _, netnsDir := range volume.Netns {
1615
rd, err := os.ReadDir(netnsDir)
1716
if err != nil {
1817
log.Debug().Err(err).Msg(netnsDir)

pkg/xnet/cni/controller/plugin.go

+1-2
Original file line number | Diff line number | Diff line change
@@ -58,8 +58,7 @@ func (s *server) CmdAdd(args *skel.CmdArgs) (err error) {
5858
if len(args.IfName) != 0 {
5959
return tc.AttachBPFProg(maps.SysMesh, args.IfName, true, true)
6060
}
61-
ifaces, _ := net.Interfaces()
62-
for _, iface := range ifaces {
61+
if iface, ifaceErr := net.InterfaceByName(podEth0); ifaceErr == nil {
6362
if (iface.Flags&net.FlagLoopback) == 0 && (iface.Flags&net.FlagUp) != 0 {
6463
return tc.AttachBPFProg(maps.SysMesh, iface.Name, true, true)
6564
}

pkg/xnet/cni/controller/repair.go

+30-25
Original file line number | Diff line number | Diff line change
@@ -15,11 +15,11 @@ import (
1515
)
1616

1717
func (s *server) checkAndRepairPods() {
18-
var repairFailPods map[string]string
1918
for {
20-
repairFailPods = s.doCheckAndRepairPods()
19+
repairFailPods := s.doCheckAndRepairPods()
2120
if len(repairFailPods) == 0 {
22-
break
21+
time.Sleep(time.Second * 10)
22+
continue
2323
}
2424
for _, pod := range repairFailPods {
2525
log.Error().Msgf(`fail to check and repair pod: %s`, pod)
@@ -40,8 +40,13 @@ func (s *server) doCheckAndRepairPods() map[string]string {
4040
allPodsByAddr[pod.Status.PodIP] = fmt.Sprintf(`%s/%s`, pod.Namespace, pod.Name)
4141
}
4242

43-
netnsDirs := []string{volume.Netns.MountPath, volume.SysProc.MountPath}
44-
for _, netnsDir := range netnsDirs {
43+
log.Debug().Msgf("monitoredPodsByAddr Count: %d", len(monitoredPodsByAddr))
44+
log.Debug().Msgf("allPodsByAddr Count: %d", len(allPodsByAddr))
45+
46+
for _, netnsDir := range volume.Netns {
47+
if len(monitoredPodsByAddr) == 0 {
48+
break
49+
}
4550
rd, err := os.ReadDir(netnsDir)
4651
if err != nil {
4752
log.Debug().Err(err).Msg(netnsDir)
@@ -56,27 +61,27 @@ func (s *server) doCheckAndRepairPods() map[string]string {
5661
}
5762

5863
if doErr := netNS.Do(func(_ ns.NetNS) error {
59-
ifaces, ifaceErr := net.Interfaces()
60-
if ifaceErr != nil {
61-
log.Debug().Err(ifaceErr).Msg(nsName)
62-
return nil
63-
}
64-
for _, iface := range ifaces {
64+
if iface, ifaceErr := net.InterfaceByName(podEth0); ifaceErr == nil {
6565
if (iface.Flags&net.FlagLoopback) == 0 && (iface.Flags&net.FlagUp) != 0 {
6666
if addrs, addrErr := iface.Addrs(); addrErr == nil {
6767
for _, addr := range addrs {
6868
addrStr := addr.String()
6969
addrStr = addrStr[0:strings.Index(addrStr, `/`)]
70+
log.Debug().Msgf("netns Addr:%s %s", iface.Name, addrStr)
7071
if pod, exists := monitoredPodsByAddr[addrStr]; exists {
72+
log.Debug().Msgf("monitoredPodsByAddr:%s", addrStr)
7173
if attachErr := tc.AttachBPFProg(maps.SysMesh, iface.Name, true, true); attachErr != nil {
7274
return fmt.Errorf(`%s %s`, pod, attachErr.Error())
7375
}
76+
log.Debug().Msgf("monitoredPodsByAddr:%s attach success", addrStr)
7477
delete(monitoredPodsByAddr, addrStr)
7578
delete(allPodsByAddr, addrStr)
7679
} else if pod, exists := allPodsByAddr[addrStr]; exists {
80+
log.Debug().Msgf("allPodsByAddr:%s", addrStr)
7781
if detachErr := tc.DetachBPFProg(maps.SysMesh, iface.Name, true, true); detachErr != nil {
7882
return fmt.Errorf(`%s %s`, pod, detachErr.Error())
7983
}
84+
log.Debug().Msgf("allPodsByAddr:%s detach success", addrStr)
8085
delete(allPodsByAddr, addrStr)
8186
}
8287
}
@@ -89,19 +94,18 @@ func (s *server) doCheckAndRepairPods() map[string]string {
8994
}
9095
}
9196
}
97+
log.Debug().Msgf("monitoredPodsByAddr Attach Fail Count: %d", len(monitoredPodsByAddr))
9298
return monitoredPodsByAddr
9399
}
94100

95101
func (s *server) checkAndResetPods() {
96-
var resetFailPods map[string]string
102+
retries := 3
97103
for {
98-
resetFailPods = s.doCheckAndResetPods()
99-
if len(resetFailPods) == 0 {
104+
_ = s.doCheckAndResetPods()
105+
retries--
106+
if retries < 0 {
100107
break
101108
}
102-
for _, pod := range resetFailPods {
103-
log.Error().Msgf(`fail to check and reset pod: %s`, pod)
104-
}
105109
time.Sleep(time.Second * 3)
106110
}
107111
}
@@ -112,8 +116,12 @@ func (s *server) doCheckAndResetPods() map[string]string {
112116
for _, pod := range pods {
113117
allPodsByAddr[pod.Status.PodIP] = fmt.Sprintf(`%s/%s`, pod.Namespace, pod.Name)
114118
}
115-
netnsDirs := []string{volume.Netns.MountPath, volume.SysProc.MountPath}
116-
for _, netnsDir := range netnsDirs {
119+
log.Debug().Msgf("allPodsByAddr Count: %d", len(allPodsByAddr))
120+
121+
for _, netnsDir := range volume.Netns {
122+
if len(allPodsByAddr) == 0 {
123+
break
124+
}
117125
rd, err := os.ReadDir(netnsDir)
118126
if err != nil {
119127
log.Debug().Err(err).Msg(netnsDir)
@@ -128,21 +136,18 @@ func (s *server) doCheckAndResetPods() map[string]string {
128136
}
129137

130138
if nsErr = netNS.Do(func(_ ns.NetNS) error {
131-
ifaces, ifaceErr := net.Interfaces()
132-
if ifaceErr != nil {
133-
log.Debug().Err(ifaceErr).Msg(nsName)
134-
return nil
135-
}
136-
for _, iface := range ifaces {
139+
if iface, ifaceErr := net.InterfaceByName(podEth0); ifaceErr == nil {
137140
if (iface.Flags&net.FlagLoopback) == 0 && (iface.Flags&net.FlagUp) != 0 {
138141
if addrs, addrErr := iface.Addrs(); addrErr == nil {
139142
for _, addr := range addrs {
140143
addrStr := addr.String()
141144
addrStr = addrStr[0:strings.Index(addrStr, `/`)]
145+
log.Debug().Msgf("netns Addr:%s %s", iface.Name, addrStr)
142146
if pod, exists := allPodsByAddr[addrStr]; exists {
143147
if detachErr := tc.DetachBPFProg(maps.SysMesh, iface.Name, true, true); detachErr != nil {
144148
return fmt.Errorf(`%s %s`, pod, detachErr.Error())
145149
}
150+
log.Debug().Msgf("allPodsByAddr:%s detach success", addrStr)
146151
delete(allPodsByAddr, addrStr)
147152
}
148153
}

pkg/xnet/cni/controller/types.go

+2
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@ const (
1212

1313
bridgeAclId = uint16('b'<<8 | 'r')
1414
bridgeAclFlag = uint8('c')
15+
16+
podEth0 = `eth0`
1517
)
1618

1719
// Server CNI Server.

pkg/xnet/tc/tc.go

+8
Original file line number | Diff line number | Diff line change
@@ -238,8 +238,12 @@ func AttachBPFProg(sysId maps.SysID, dev string, ingress, egress bool) error {
238238
if err := addBPFFilter(rtnl, uint32(iface.Index), HandleIngress, uint32(ingressProgFD)); err != nil {
239239
log.Error().Msgf("add tc ingress filter error: %v", err)
240240
return err
241+
} else {
242+
log.Debug().Msgf("tc ingress filter add success: %s", iface.Name)
241243
}
242244
}
245+
} else {
246+
log.Debug().Msgf("tc ingress filter exists: %s", iface.Name)
243247
}
244248
}
245249

@@ -252,8 +256,12 @@ func AttachBPFProg(sysId maps.SysID, dev string, ingress, egress bool) error {
252256
if err := addBPFFilter(rtnl, uint32(iface.Index), HandleEgress, uint32(egressProgFD)); err != nil {
253257
log.Error().Msgf("add tc egress filter error: %v", err)
254258
return err
259+
} else {
260+
log.Debug().Msgf("tc egress filter add success: %s", iface.Name)
255261
}
256262
}
263+
} else {
264+
log.Debug().Msgf("tc egress filter exists: %s", iface.Name)
257265
}
258266
}
259267
}

pkg/xnet/volume/types.go

+2-5
Original file line number | Diff line number | Diff line change
@@ -21,11 +21,6 @@ var (
2121
MountPath: "/host/run",
2222
}
2323

24-
Netns = HostMount{
25-
HostPath: "/var/run/netns",
26-
MountPath: "/host/run/netns",
27-
}
28-
2924
CniBin = HostMount{
3025
HostPath: "/bin",
3126
MountPath: "/host/cni/bin",
@@ -36,4 +31,6 @@ var (
3631
HostPath: "/var/lib/rancher/k3s/agent/etc/cni/net.d", //k3s
3732
MountPath: "/host/cni/net.d",
3833
}
34+
35+
Netns = []string{}
3936
)

0 commit comments

Comments
 (0)