-
Notifications
You must be signed in to change notification settings - Fork 456
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ip资源未被回收,子网ip占用残留 #2125
Comments
IP 的crd的记录存在vm已删除未清理,存在vm ip crd重复。 |
可能和 #2087 相关,可以更新这个 patch 再看 |
对于开启 keep-vm-ip=true 参数的 vm,vm pod 是在 running状态删除的,IP crd 能在删除 vm pod的时候,同时删除。如果 vm 是在stopped 状态删除的,IP crd 需要等gc 回收,大概12分钟。因为stopped 状态的 vm,有可能会重新启动,所以在删除前一直保留着对应的 IP crd。 |
当前这个重复的ip crd 绝对不止12分钟 |
[centos@iaas-cms-ctrl-1 ~]$ grep "Starting OVN controller" -r /var/log/kube-ovn/
/var/log/kube-ovn/kube-ovn-controller.log:I1205 14:12:20.809618 7 controller.go:461] Starting OVN controller
[centos@iaas-cms-ctrl-1 ~]$
[centos@iaas-cms-ctrl-1 ~]$ ssh iaas-cms-ctrl-2
[centos@iaas-cms-ctrl-2 ~]$ grep "Starting OVN controller" -r /var/log/kube-ovn/
[centos@iaas-cms-ctrl-2 ~]$ logout
Connection to iaas-cms-ctrl-2 closed.
[centos@iaas-cms-ctrl-1 ~]$ ssh iaas-cms-ctrl-3
[centos@iaas-cms-ctrl-3 ~]$ grep "Starting OVN controller" -r /var/log/kube-ovn/
kube-ovn-controller 最近没有持续 crash 的 log,应该是没有持续崩溃重启 |
[root@iaas-cms-ctrl-1 ovn]# k get ip vm-ce3fr4q8j5gh613m5u50.yiaas.net1.yiaas.ovn -o yaml
apiVersion: kubeovn.io/v1
kind: IP
metadata:
creationTimestamp: "2022-11-30T06:53:32Z"
generation: 3
labels:
ovn.kubernetes.io/subnet: subnet-cdq57ea8j5gqg4vf8ak0
subnet-cdq57ea8j5gqg4vf8ak0: ""
name: vm-ce3fr4q8j5gh613m5u50.yiaas.net1.yiaas.ovn
resourceVersion: "17229232"
uid: 92614203-af24-4327-ae14-edbc9a41c771
spec:
attachIps: []
attachMacs: []
attachSubnets: []
containerID: ""
ipAddress: 168.50.8.2
macAddress: 00:00:00:A9:2E:06
namespace: yiaas
nodeName: iaas-cms-ctrl-1
podName: vm-ce3fr4q8j5gh613m5u50
podType: VirtualMachine
subnet: subnet-cdq57ea8j5gqg4vf8ak0 # 子网id不一致
v4IpAddress: 168.50.8.2
v6IpAddress: ""
[root@iaas-cms-ctrl-1 ovn]# k get ip vm-ce418ki8j5ggeis9ivmg.yiaas.net1.yiaas.ovn -o yaml
apiVersion: kubeovn.io/v1
kind: IP
metadata:
creationTimestamp: "2022-12-01T02:42:52Z"
generation: 4
labels:
ovn.kubernetes.io/subnet: subnet-cdq57ea8j5gqg4vf8ak0
subnet-cdq57ea8j5gqg4vf8ak0: ""
name: vm-ce418ki8j5ggeis9ivmg.yiaas.net1.yiaas.ovn
resourceVersion: "18109557"
uid: 9ea4b4f9-6317-4a8a-ba1e-3afafa77db48
spec:
attachIps: []
attachMacs: []
attachSubnets: []
containerID: ""
ipAddress: 168.50.8.2
macAddress: 00:00:00:21:DA:C3
namespace: yiaas
nodeName: iaas-cms-ctrl-2
podName: vm-ce418ki8j5ggeis9ivmg
podType: VirtualMachine
subnet: subnet-cdq57ea8j5gqg4vf8ak0 # 子网id不一致
v4IpAddress: 168.50.8.2
v6IpAddress: ""
[root@iaas-cms-ctrl-1 ovn]#
# 这个保留了很长时间了
# 这两个ip的子网不一样,所以ip才冲突了,归根结底应该是子网冲突
|
如果 k8s 开启审计日志功能可以看一下该 ip 资源的操作记录,是不是有删除后重复创建的操作 |
看起来这两个重复的ip创建,相隔了20+个小时,而且不属于同一个subnet,应该不是同一个pod触发的删除后重建的动作。 审计日志这个功能正在计划中,暂无。 |
|
是所有都会遗留么,还是批量创建删除部分没有清理? |
[root@iaas-cms-ctrl-1 ~]# k get ip | grep 168.0.0
vm-ce6s14q8j5gjlb83p58g.yiaas.net1.yiaas.ovn 168.0.0.2 00:00:00:A7:B8:D2 iaas-cms-ctrl-1 subnet-ce6rhna8j5gjlb83p4fg
vm-ce6tl1a8j5gjlb83p5e0.yiaas.net1.yiaas.ovn 168.0.0.3 00:00:00:A1:E1:70 iaas-cms-ctrl-2 subnet-ce6rhna8j5gjlb83p4fg
vm-ce7am7q8j5gjlb83p5lg.yiaas.net1.yiaas.ovn 168.0.0.4 00:00:00:AC:78:8F iaas-cms-ctrl-2 subnet-ce6rhna8j5gjlb83p4fg
vpc-nat-gw-ngw-ce7a22a8j5gjlb83p5gg-0.kube-system 168.0.0.253 00:00:00:1E:D3:9B iaas-cms-ctrl-3 subnet-ce6rhna8j5gjlb83p4fg
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]# k get po -A -o wide | grep 168.0.0
kube-system vpc-nat-gw-ngw-ce7a22a8j5gjlb83p5gg-0 1/1 Running 0 3h53m 168.0.0.253 iaas-cms-ctrl-3 <none> <none>
[root@iaas-cms-ctrl-1 ~]#
# 就现在集群的信息看,应该是删除的虚拟机的ip都遗留下来了, 目前虚拟机的测试都是单个单个创建的
|
是使用什么方式创建的 vm,我们在 1.9 版本上用 VirtualMachine 这个资源创建 vm ,删除这个资源后可以正常回收 ip |
apiVersion: kubevirt.io/v1
kind: VirtualMachine
metadata:
name: vm-ce7am7q8j5gjlb83p5lg
namespace: iaas
spec:
dataVolumeTemplates:
- apiVersion: cdi.kubevirt.io/v1beta1
kind: DataVolume
metadata:
annotations:
cdi.kubevirt.io/cloneStrategyOverride: copy
name: vol-ce7am7q8j5gjlb83p5k0
namespace: yiaas
spec:
pvc:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 25Gi
storageClassName: rbd.csi.ssd
volumeMode: Block
source:
pvc:
name: img-ce3faji8j5gh613m5tkg
namespace: yiaas
- apiVersion: cdi.kubevirt.io/v1beta1
kind: DataVolume
metadata:
name: vol-ce7am7q8j5gjlb83p5kg
namespace: yiaas
spec:
pvc:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 25Gi
storageClassName: rbd.csi.ssd
volumeMode: Block
source:
blank: {}
instancetype:
kind: VirtualMachineInstancetype
name: small
revisionName: vm-ce7am7q8j5gjlb83p5lg-small-a8dee6d1-f20d-4cf2-bf1a-2fb148bd05e5-1
running: true
template:
metadata:
annotations:
kubevirt.io/hide-pod-network: "true"
net1.virtualmachine.fields.yiaas.yealink.com/network: vpc-ce6rhj28j5gjlb83p4f0
net1.yiaas.ovn.kubernetes.io/allow_live_migration: "true"
net1.yiaas.ovn.kubernetes.io/logical_switch: subnet-ce6rhna8j5gjlb83p4fg
creationTimestamp: null
spec:
accessCredentials:
- sshPublicKey:
propagationMethod:
configDrive: {}
source:
secret:
secretName: ac-cdqot2a8j5gqg4vf8bt0
dnsConfig:
nameservers:
- 168.0.0.0
dnsPolicy: ClusterFirst
domain:
devices:
disks:
- disk: {}
name: vol-ce7am7q8j5gjlb83p5k0
- disk: {}
name: vol-ce7am7q8j5gjlb83p5kg
- disk: {}
name: cdi-ce7am7q8j5gjlb83p5l0
interfaces:
- bridge: {}
name: wk
machine:
type: q35
resources: {}
networks:
- multus:
networkName: net1
name: wk
volumes:
- dataVolume:
name: vol-ce7am7q8j5gjlb83p5k0
name: vol-ce7am7q8j5gjlb83p5k0
- dataVolume:
name: vol-ce7am7q8j5gjlb83p5kg
name: vol-ce7am7q8j5gjlb83p5kg
- cloudInitConfigDrive:
userData: |
#cloud-config
ssh_pwauth: True
groups:
- admingroup: [root,sys]
users:
- name: root
gecos: Foo B. Bar
sudo: ALL=(ALL) NOPASSWD:ALL
groups: root
expiredate: '2032-09-01'
lock_passwd: false
plain_text_passwd: 123456
name: cdi-ce7am7q8j5gjlb83p5l0 |
现在关机后立刻启动,vm现象为ip地址丢失,应该是关机后就将ip回收,分配了新的ip与原有ip不一致导致。 |
对于这个现象,需要确认下,keep-vm-ip 参数是否开启了?没有开启这个参数的时候,vm 关机才会直接删除 IP crd。 |
|
看这个配置是没有问题的,lsp的名称也没有问题。 |
root@iaas-cms-ctrl-1 ~]# k get daemonset -A -o wide | grep kube-ovn-cni
kube-system kube-ovn-cni 3 3 3 3 3 kubernetes.io/os=linux 2d1h cni-server kubeovn/kube-ovn:v1.11.0 app=kube-ovn-cni
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]#
[root@iaas-cms-ctrl-1 ~]# k get po -A -o wide | grep kube-ovn-cni
kube-system kube-ovn-cni-7gsxf 1/1 Running 0 2d 10.121.33.12 iaas-cms-ctrl-2 <none> <none>
kube-system kube-ovn-cni-nsk5n 1/1 Running 0 2d 10.121.33.13 iaas-cms-ctrl-3 <none> <none>
kube-system kube-ovn-cni-xq4hn 1/1 Running 0 2d 10.121.33.11 iaas-cms-ctrl-1 <none> <none>
[root@iaas-cms-ctrl-1 ~]# k logs -f -n kube-system kube-ovn-cni-7gsxf
setting sysctl variable "net.ipv4.neigh.default.gc_thresh1" to "1024"
net.ipv4.neigh.default.gc_thresh1 = 1024
setting sysctl variable "net.ipv4.neigh.default.gc_thresh2" to "2048"
net.ipv4.neigh.default.gc_thresh2 = 2048
setting sysctl variable "net.ipv4.neigh.default.gc_thresh3" to "4096"
net.ipv4.neigh.default.gc_thresh3 = 4096
setting sysctl variable "net.netfilter.nf_conntrack_tcp_be_liberal" to "1"
net.netfilter.nf_conntrack_tcp_be_liberal = 1
I1205 16:50:58.161898 3737885 cniserver.go:34]
-------------------------------------------------------------------------------
Kube-OVN:
Version: v1.11.0
Build: 2022-12-03_06:43:38
Commit: git-86f75c8
Go Version: go1.19.3
Arch: amd64
|
删除 running 状态的 pod 复现了问题,还需要再确认一下 |
Expected Behavior
ip资源未被回收,子网ip占用残留
Actual Behavior
Steps to Reproduce the Problem
Additional Info
Kubernetes version:
Output of `kubectl version`:
Client Version: version.Info{Major:"1", Minor:"23", GitVersion:"v1.23.7", GitCommit:"42c05a547468804b2053ecf60a3bd15560362fc2", GitTreeState:"clean", BuildDate:"2022-05-24T12:30:55Z", GoVersion:"go1.17.10", Compiler:"gc", Platform:"linux/amd64"}
v1.10.7
CentOS Stream 8 5.4.223-1.el8.elrepo.x86_64
The text was updated successfully, but these errors were encountered: