Skip to content

Commit c3c1673

Browse files
Vicente-Chengc3y1huangderekbit
committed
fix(networking): cleanup service/endpoint if needed
We meet a corner case that the service/endpoint would not be cleanup. That will cause the service keep the ClusterIP `None`. With this config, the endpoint of sharemanager would not correct. So the CSI driver cannot perform the mountpoint well. We would like to have a checking mechanism to know if the service/ endpoint did not cleanup. Then we will cleanup the service/endpoint to ensure the correct endpoint. Remove the cleanup function in the setting controller, we could do the cleanup on the sm controller Signed-off-by: Vicente Cheng <[email protected]> Co-authored-by: Chin-Ya Huang <[email protected]> Co-authored-by: Derek Su <[email protected]>
1 parent 19ba220 commit c3c1673

File tree

3 files changed

+109
-55
lines changed

3 files changed

+109
-55
lines changed

controller/setting_controller.go

-40
Original file line numberDiff line numberDiff line change
@@ -359,14 +359,6 @@ func (sc *SettingController) syncDangerZoneSettingsForManagedComponents(settingN
359359
return &types.ErrorInvalidState{Reason: fmt.Sprintf("failed to apply %v setting to Longhorn components when there are attached volumes. It will be eventually applied", types.SettingNameStorageNetworkForRWXVolumeEnabled)}
360360
}
361361

362-
// Perform cleanup of the share manager Service
363-
// This is to allow the creation of the correct Service
364-
// and Endpoint when switching between cluster network
365-
// and storage network.
366-
if err := sc.cleanupShareManagerServiceAndEndpoints(); err != nil {
367-
return err
368-
}
369-
370362
return nil
371363
}
372364

@@ -943,38 +935,6 @@ func (sc *SettingController) updateKubernetesClusterAutoscalerEnabled() error {
943935
return nil
944936
}
945937

946-
func (sc *SettingController) cleanupShareManagerServiceAndEndpoints() error {
947-
var err error
948-
defer func() {
949-
if err != nil {
950-
err = errors.Wrapf(err, "failed to cleanup share manager service and endpoints for %s setting update", types.SettingNameStorageNetworkForRWXVolumeEnabled)
951-
}
952-
}()
953-
954-
shareManagers, err := sc.ds.ListShareManagers()
955-
if err != nil {
956-
return err
957-
}
958-
959-
for _, shareManager := range shareManagers {
960-
log := sc.logger.WithField("shareManager", shareManager.Name)
961-
962-
log.WithField("service", shareManager.Name).Infof("Deleting Service for %v setting update", types.SettingNameStorageNetworkForRWXVolumeEnabled)
963-
err := sc.ds.DeleteService(shareManager.Namespace, shareManager.Name)
964-
if err != nil && !apierrors.IsNotFound(err) {
965-
return err
966-
}
967-
968-
log.WithField("endpoint", shareManager.Name).Infof("Deleting Endpoint for %v setting update", types.SettingNameStorageNetworkForRWXVolumeEnabled)
969-
err = sc.ds.DeleteKubernetesEndpoint(shareManager.Namespace, shareManager.Name)
970-
if err != nil && !apierrors.IsNotFound(err) {
971-
return err
972-
}
973-
}
974-
975-
return nil
976-
}
977-
978938
// updateCNI deletes all system-managed data plane components immediately with the updated CNI annotation.
979939
func (sc *SettingController) updateCNI(funcPreupdate func() error) error {
980940
storageNetwork, err := sc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)

controller/share_manager_controller.go

+101-15
Original file line numberDiff line numberDiff line change
@@ -451,17 +451,12 @@ func (c *ShareManagerController) syncShareManagerEndpoint(sm *longhorn.ShareMana
451451
return nil
452452
}
453453

454-
storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
454+
storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
455455
if err != nil {
456456
return err
457457
}
458458

459-
storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
460-
if err != nil {
461-
return err
462-
}
463-
464-
if types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled) {
459+
if storageNetworkForRWXVolume {
465460
serviceFqdn := fmt.Sprintf("%v.%v.svc.cluster.local", sm.Name, sm.Namespace)
466461
sm.Status.Endpoint = fmt.Sprintf("nfs://%v/%v", serviceFqdn, sm.Name)
467462
} else {
@@ -1050,6 +1045,97 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc *
10501045
return tolerations
10511046
}
10521047

1048+
func (c *ShareManagerController) isStorageNetworkForRWXVolume() (bool, error) {
1049+
storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
1050+
if err != nil {
1051+
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork)
1052+
}
1053+
1054+
storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
1055+
if err != nil {
1056+
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
1057+
}
1058+
1059+
return types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled), nil
1060+
}
1061+
1062+
func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) {
1063+
targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled}
1064+
for _, item := range targetSettings {
1065+
if applied, err := c.ds.GetSettingApplied(item); err != nil || !applied {
1066+
return applied, err
1067+
}
1068+
}
1069+
return true, nil
1070+
}
1071+
1072+
func (c *ShareManagerController) canCleanupServiceAndEndpoint(ns, shareManagerName string) (bool, error) {
1073+
service, err := c.ds.GetService(c.namespace, shareManagerName)
1074+
if err != nil {
1075+
// if NotFound, means the service/endpoint is already cleaned up
1076+
// The service and endpoint are related with the kubernetes endpoint controller.
1077+
// It means once the service is deleted, the corresponding endpoint will be deleted automatically.
1078+
if apierrors.IsNotFound(err) {
1079+
return false, nil
1080+
}
1081+
return false, errors.Wrap(err, "failed to get service")
1082+
}
1083+
1084+
// check the settings status of storage network and storage network for RWX volume
1085+
settingsApplied, err := c.checkStorageNetworkApplied()
1086+
if err != nil {
1087+
return false, errors.Wrap(err, "failed to check if the storage network settings are applied")
1088+
}
1089+
if !settingsApplied {
1090+
c.logger.Warn("Storage network settings are not applied, do nothing")
1091+
return false, nil
1092+
}
1093+
1094+
storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
1095+
if err != nil {
1096+
return false, err
1097+
}
1098+
1099+
// no need to cleanup because looks the service file is correct
1100+
if storageNetworkForRWXVolume {
1101+
if service.Spec.ClusterIP == core.ClusterIPNone {
1102+
return false, nil
1103+
}
1104+
} else {
1105+
if service.Spec.ClusterIP != core.ClusterIPNone {
1106+
return false, nil
1107+
}
1108+
}
1109+
return true, nil
1110+
}
1111+
1112+
func (c *ShareManagerController) cleanupServiceAndEndpoint(shareManager *longhorn.ShareManager) error {
1113+
if ok, err := c.canCleanupServiceAndEndpoint(c.namespace, shareManager.Name); !ok || err != nil {
1114+
if err != nil {
1115+
return errors.Wrapf(err, "failed to check if we can cleanup service and endpoint for share manager %v", shareManager.Name)
1116+
}
1117+
return nil
1118+
}
1119+
1120+
// let's cleanup
1121+
c.logger.Infof("Deleting Service for share manager %v", shareManager.Name)
1122+
err := c.ds.DeleteService(c.namespace, shareManager.Name)
1123+
if err != nil && !apierrors.IsNotFound(err) {
1124+
return errors.Wrapf(err, "failed to delete service for share manager %v", shareManager.Name)
1125+
}
1126+
1127+
c.logger.Infof("Deleting Endpoint for share manager %v", shareManager.Name)
1128+
err = c.ds.DeleteKubernetesEndpoint(c.namespace, shareManager.Name)
1129+
if err != nil && !apierrors.IsNotFound(err) {
1130+
// we don't return error with the endpoint deletion because the kubernetes
1131+
// endpoints_controller will sync the service to clean up the corresponding endpoint
1132+
// https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/endpoint/endpoints_controller.go#L374-L392
1133+
c.logger.Warnf("Failed to delete endpoint for share manager %v", shareManager.Name)
1134+
}
1135+
1136+
return nil
1137+
}
1138+
10531139
func (c *ShareManagerController) createServiceAndEndpoint(shareManager *longhorn.ShareManager) error {
10541140
// check if we need to create the service
10551141
_, err := c.ds.GetService(c.namespace, shareManager.Name)
@@ -1122,6 +1208,11 @@ func (c *ShareManagerController) createShareManagerPod(sm *longhorn.ShareManager
11221208
}
11231209
priorityClass := setting.Value
11241210

1211+
err = c.cleanupServiceAndEndpoint(sm)
1212+
if err != nil {
1213+
return nil, errors.Wrapf(err, "failed to cleanup service and endpoint for share manager %v", sm.Name)
1214+
}
1215+
11251216
err = c.createServiceAndEndpoint(sm)
11261217
if err != nil {
11271218
return nil, errors.Wrapf(err, "failed to create service and endpoint for share manager %v", sm.Name)
@@ -1276,17 +1367,12 @@ func (c *ShareManagerController) createServiceManifest(sm *longhorn.ShareManager
12761367

12771368
log := getLoggerForShareManager(c.logger, sm)
12781369

1279-
storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
1280-
if err != nil {
1281-
log.WithError(err).Warnf("Failed to get %v setting, fallback to cluster IP", types.SettingNameStorageNetwork)
1282-
}
1283-
1284-
storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
1370+
storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
12851371
if err != nil {
1286-
log.WithError(err).Warnf("Failed to get %v setting, fallback to cluster IP", types.SettingNameStorageNetworkForRWXVolumeEnabled)
1372+
log.WithError(err).Warnf("Failed to check storage network for RWX volume")
12871373
}
12881374

1289-
if types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled) {
1375+
if storageNetworkForRWXVolume {
12901376
// Create a headless service do it doesn't use a cluster IP. This allows
12911377
// directly reaching the share manager pods using their individual
12921378
// IP address.

datastore/longhorn.go

+8
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,14 @@ func (s *DataStore) GetSettingExactRO(sName types.SettingName) (*longhorn.Settin
619619
return resultRO, nil
620620
}
621621

622+
func (s *DataStore) GetSettingApplied(sName types.SettingName) (bool, error) {
623+
resultRO, err := s.getSettingRO(string(sName))
624+
if err != nil {
625+
return false, err
626+
}
627+
return resultRO.Status.Applied, nil
628+
}
629+
622630
// GetSetting will automatically fill the non-existing setting if it's a valid
623631
// setting name.
624632
// The function will not return nil for *longhorn.Setting when error is nil

0 commit comments

Comments
 (0)