Skip to content

Commit 9eb4f69

Browse files
committed
fix(orphan): finalize orphan instance CR when instance manager unavailable
Signed-off-by: Raphanus Lo <[email protected]>
1 parent 0088687 commit 9eb4f69

File tree

1 file changed

+14
-54
lines changed

1 file changed

+14
-54
lines changed

controller/orphan_controller.go

Lines changed: 14 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ func (oc *OrphanController) cleanupOrphanedResource(orphan *longhorn.Orphan) (is
315315
longhorn.OrphanConditionTypeError, longhorn.ConditionStatusTrue, "", err.Error())
316316
}()
317317

318-
// Make sure if the orphan nodeID and controller ID are same.
318+
// Make sure if the orphan nodeID and controller ID are the same.
319319
// If NO, just delete the orphan resource object and don't touch the data.
320320
if orphan.Spec.NodeID != oc.controllerID {
321321
log.WithFields(logrus.Fields{
@@ -370,7 +370,7 @@ func (oc *OrphanController) cleanupOrphanedEngineInstance(orphan *longhorn.Orpha
370370
} else {
371371
spec = &engineCR.Spec.InstanceSpec
372372
}
373-
return oc.cleanupOrphanedInstance(orphan, instance, imName, longhorn.InstanceManagerTypeEngine, spec)
373+
return oc.cleanupOrphanedInstance(orphan, instance, imName, longhorn.InstanceManagerTypeEngine, spec), nil
374374
}
375375

376376
func (oc *OrphanController) cleanupOrphanedReplicaInstance(orphan *longhorn.Orphan) (isCleanupComplete bool, err error) {
@@ -394,7 +394,7 @@ func (oc *OrphanController) cleanupOrphanedReplicaInstance(orphan *longhorn.Orph
394394
} else {
395395
spec = &replicaCR.Spec.InstanceSpec
396396
}
397-
return oc.cleanupOrphanedInstance(orphan, instance, imName, longhorn.InstanceManagerTypeReplica, spec)
397+
return oc.cleanupOrphanedInstance(orphan, instance, imName, longhorn.InstanceManagerTypeReplica, spec), nil
398398
}
399399

400400
func (oc *OrphanController) extractOrphanedInstanceInfo(orphan *longhorn.Orphan) (name, instanceManager string, err error) {
@@ -418,33 +418,33 @@ func (oc *OrphanController) extractOrphanedInstanceInfo(orphan *longhorn.Orphan)
418418
return name, instanceManager, nil
419419
}
420420

421-
func (oc *OrphanController) cleanupOrphanedInstance(orphan *longhorn.Orphan, instance, imName string, imType longhorn.InstanceManagerType, instanceCRSpec *longhorn.InstanceSpec) (isCleanupComplete bool, err error) {
421+
func (oc *OrphanController) cleanupOrphanedInstance(orphan *longhorn.Orphan, instance, imName string, imType longhorn.InstanceManagerType, instanceCRSpec *longhorn.InstanceSpec) bool {
422422
if instanceCRSpec != nil && instanceCRSpec.NodeID == orphan.Spec.NodeID {
423423
oc.logger.Infof("Orphan instance %v is scheduled back to current node %v. Skip cleaning up the instance resource and finalize the orphan CR.", instance, orphan.Spec.NodeID)
424-
return true, nil
424+
return true
425425
}
426426

427+
// If the instance manager client is unavailable or failed to delete the instance, continue finalizing the orphan.
428+
// Later if the orphaned instance is still reachable, the orphan will be recreated.
427429
imc, err := oc.getRunningInstanceManagerClientForOrphan(orphan, imName)
428430
if err != nil {
429-
return false, errors.Wrapf(err, "failed to get running instance manager client for orphan %v", orphan.Name)
431+
oc.logger.WithError(err).Warnf("Failed to delete orphan instance %v due to instance manager client initialization failure. Continue to finalize orphan %v", instance, orphan.Name)
432+
return false
430433
} else if imc == nil {
431434
oc.logger.WithField("orphanInstanceNode", orphan.Spec.NodeID).Warnf("No running instance manager for deleting orphan instance %v", orphan.Name)
432-
return true, nil
435+
return true
433436
}
434437
defer func() {
435438
if closeErr := imc.Close(); closeErr != nil {
436439
oc.logger.WithError(closeErr).Error("Failed to close instance manager client")
437440
}
438441
}()
439442

440-
if err := oc.deleteInstance(imc, instance, imType, orphan.Spec.DataEngine); err != nil {
441-
return false, err
442-
}
443-
isCleanupComplete, cleanupErr := oc.confirmOrphanInstanceCleanup(imc, instance, imType, orphan.Spec.DataEngine)
444-
if cleanupErr == nil && !isCleanupComplete {
445-
oc.logger.Infof("Orphan instance %v cleanup in progress", instance)
443+
err = imc.InstanceDelete(orphan.Spec.DataEngine, instance, string(imType), "", false)
444+
if err != nil && !types.ErrorIsNotFound(err) {
445+
oc.logger.WithError(err).Warnf("Failed to delete orphan instance %v. Continue to finalize orphan %v", instance, orphan.Name)
446446
}
447-
return isCleanupComplete, cleanupErr
447+
return true
448448
}
449449

450450
func (oc *OrphanController) getRunningInstanceManagerClientForOrphan(orphan *longhorn.Orphan, imName string) (*engineapi.InstanceManagerClient, error) {
@@ -462,46 +462,6 @@ func (oc *OrphanController) getRunningInstanceManagerClientForOrphan(orphan *lon
462462
return engineapi.NewInstanceManagerClient(im, false)
463463
}
464464

465-
func (oc *OrphanController) deleteInstance(imc *engineapi.InstanceManagerClient, instanceName string, instanceKind longhorn.InstanceManagerType, engineType longhorn.DataEngineType) (err error) {
466-
// There is a delay between deletion initiation and state/InstanceManager update,
467-
// this function may be called multiple times before given instance exits.
468-
469-
oc.logger.Infof("Orphan controller deleting instance %v", instanceName)
470-
471-
defer func() {
472-
err = errors.Wrapf(err, "failed to delete instance %v", instanceName)
473-
}()
474-
475-
err = imc.InstanceDelete(engineType, instanceName, string(instanceKind), "", false)
476-
if err != nil && !types.ErrorIsNotFound(err) {
477-
return err
478-
}
479-
480-
return nil
481-
}
482-
483-
func (oc *OrphanController) confirmOrphanInstanceCleanup(imc *engineapi.InstanceManagerClient, instanceName string, imType longhorn.InstanceManagerType, engineType longhorn.DataEngineType) (isCleanupComplete bool, err error) {
484-
defer func() {
485-
if err != nil {
486-
err = errors.Wrapf(err, "failed to confirm cleanup result of instance %v", instanceName)
487-
}
488-
}()
489-
490-
_, err = imc.InstanceGet(engineType, instanceName, string(imType))
491-
switch {
492-
case err == nil:
493-
// Instance still exists - cleanup not complete
494-
// Cleanup will continue after the instance state update.
495-
return false, nil
496-
case types.ErrorIsNotFound(err):
497-
// instance not found - cleanup completed.
498-
return true, nil
499-
default:
500-
// Unexpected error - cleanup status unknown.
501-
return false, err
502-
}
503-
}
504-
505465
func (oc *OrphanController) deleteOrphanedReplicaDataStore(orphan *longhorn.Orphan) error {
506466
oc.logger.Infof("Deleting orphan %v replica data store %v in disk %v on node %v",
507467
orphan.Name, orphan.Spec.Parameters[longhorn.OrphanDataName],

0 commit comments

Comments
 (0)