@@ -131,13 +131,13 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
131
131
// trigger reboot only for nodes which are in UpgradeStarted but haven't rebooted yet
132
132
if nodeObj .Status .NodeInfo .BootID == moduleStatus .BootId {
133
133
log .FromContext (ctx ).Info (fmt .Sprintf ("Node: %v: Reboot is required for driver upgrade, triggering node reboot" , nodeName ))
134
- n .helper .handleNodeReboot (ctx , nodeObj , deviceConfig )
134
+ n .helper .handleNodeReboot (ctx , nodeObj , * deviceConfig )
135
135
// for nodes which are in UpgradeStarted and have already rebooted, schedule the reboot pod deletion
136
136
} else {
137
137
currentBootID := nodeObj .Status .NodeInfo .BootID
138
138
n .helper .setBootID (nodeObj .Name , currentBootID )
139
139
log .FromContext (ctx ).Info (fmt .Sprintf ("Node: %v: Node already rebooted, scheduling reboot pod deletion" , nodeName ))
140
- go n .helper .deleteRebootPod (ctx , nodeName , deviceConfig , false , deviceConfig . Generation )
140
+ go n .helper .deleteRebootPod (ctx , nodeName , * deviceConfig , false )
141
141
}
142
142
}
143
143
} else {
@@ -155,7 +155,7 @@ func (n *upgradeMgr) HandleUpgrade(ctx context.Context, deviceConfig *amdv1alpha
155
155
n .helper .setNodeStatus (ctx , nodeName , amdv1alpha1 .UpgradeStateInProgress )
156
156
} else {
157
157
n .helper .setNodeStatus (ctx , nodeName , moduleStatus .Status )
158
- go n .helper .deleteRebootPod (ctx , nodeName , deviceConfig , false , deviceConfig . Generation )
158
+ go n .helper .deleteRebootPod (ctx , nodeName , * deviceConfig , false )
159
159
}
160
160
} else {
161
161
n .helper .setNodeStatus (ctx , nodeName , moduleStatus .Status )
@@ -276,7 +276,7 @@ func (n *upgradeMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1
276
276
if err := n .helper .cordonOrUncordonNode (ctx , deviceConfig , & nodeList .Items [i ], false ); err != nil {
277
277
log .FromContext (ctx ).Error (err , fmt .Sprintf ("Taint Removal failed for %v during deviceconfig delete:%v" , & nodeList .Items [i ].Name , err ))
278
278
}
279
- n .helper .deleteRebootPod (ctx , nodeList .Items [i ].Name , deviceConfig , true , deviceConfig . Generation )
279
+ n .helper .deleteRebootPod (ctx , nodeList .Items [i ].Name , * deviceConfig , true )
280
280
}
281
281
n .helper .clearNodeStatus ()
282
282
return
@@ -322,8 +322,8 @@ type upgradeMgrHelperAPI interface {
322
322
getPodsToDrainOrDelete (ctx context.Context , deviceConfig * amdv1alpha1.DeviceConfig , node * v1.Node ) (newPods []v1.Pod , err error )
323
323
deleteOrDrainPods (ctx context.Context , deviceConfig * amdv1alpha1.DeviceConfig , node * v1.Node ) error
324
324
updateModuleVersionOnNode (ctx context.Context , deviceConfig * amdv1alpha1.DeviceConfig , node * v1.Node ) error
325
- handleNodeReboot (ctx context.Context , node * v1.Node , dc * amdv1alpha1.DeviceConfig )
326
- deleteRebootPod (ctx context.Context , nodeName string , dc * amdv1alpha1.DeviceConfig , force bool , genId int64 )
325
+ handleNodeReboot (ctx context.Context , node * v1.Node , dc amdv1alpha1.DeviceConfig )
326
+ deleteRebootPod (ctx context.Context , nodeName string , dc amdv1alpha1.DeviceConfig , force bool )
327
327
getRebootPod (nodeName string , dc * amdv1alpha1.DeviceConfig ) * v1.Pod
328
328
329
329
// getters and setters
@@ -817,7 +817,7 @@ func (h *upgradeMgrHelper) handleNodeUpgrade(ctx context.Context, deviceConfig a
817
817
818
818
// Reboot the node if required
819
819
if deviceConfig .Spec .Driver .UpgradePolicy .RebootRequired != nil && * deviceConfig .Spec .Driver .UpgradePolicy .RebootRequired {
820
- h .handleNodeReboot (ctx , & node , & deviceConfig )
820
+ h .handleNodeReboot (ctx , & node , deviceConfig )
821
821
} else {
822
822
// Update expected module version on the node
823
823
if err := h .updateModuleVersionOnNode (ctx , & deviceConfig , & node ); err != nil {
@@ -956,9 +956,9 @@ func (h *upgradeMgrHelper) updateModuleVersionOnNode(ctx context.Context, device
956
956
return nil
957
957
}
958
958
959
- func (h * upgradeMgrHelper ) handleNodeReboot (ctx context.Context , node * v1.Node , dc * amdv1alpha1.DeviceConfig ) {
959
+ func (h * upgradeMgrHelper ) handleNodeReboot (ctx context.Context , node * v1.Node , dc amdv1alpha1.DeviceConfig ) {
960
960
logger := log .FromContext (ctx )
961
- rebootPod := h .getRebootPod (node .Name , dc )
961
+ rebootPod := h .getRebootPod (node .Name , & dc )
962
962
// Delete the existing pod if present
963
963
pod := & v1.Pod {}
964
964
if err := h .client .Get (ctx , types.NamespacedName {Namespace : dc .Namespace , Name : rebootPod .Name }, pod ); err == nil {
@@ -970,7 +970,7 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node,
970
970
}
971
971
972
972
// Update expected module version on the node
973
- if err := h .updateModuleVersionOnNode (ctx , dc , node ); err != nil {
973
+ if err := h .updateModuleVersionOnNode (ctx , & dc , node ); err != nil {
974
974
logger .Error (err , fmt .Sprintf ("Node: %v State: %v UpgradeFailed with Error: %v" , node .Name , h .getNodeStatus (node .Name ), err ))
975
975
// Mark the state as failed
976
976
h .setNodeStatus (ctx , node .Name , amdv1alpha1 .UpgradeStateFailed )
@@ -1037,15 +1037,22 @@ func (h *upgradeMgrHelper) handleNodeReboot(ctx context.Context, node *v1.Node,
1037
1037
// Wait for the rebootPod to get spawned
1038
1038
waitForRebootPod ()
1039
1039
1040
- h .setNodeStatus (ctx , node .Name , amdv1alpha1 .UpgradeStateRebootInProgress )
1041
- h .deleteRebootPod (ctx , node .Name , dc , false , dc .Generation )
1040
+ fetchedDeviceConfig := & amdv1alpha1.DeviceConfig {}
1041
+ if err := h .client .Get (ctx , types.NamespacedName {Namespace : dc .Namespace , Name : dc .Name }, fetchedDeviceConfig ); err != nil {
1042
+ logger .Error (err , "Failed to fetch DeviceConfig from API server" )
1043
+ return
1044
+ }
1045
+ if fetchedDeviceConfig .Spec .Driver .Version == dc .Spec .Driver .Version {
1046
+ h .setNodeStatus (ctx , node .Name , amdv1alpha1 .UpgradeStateRebootInProgress )
1047
+ }
1048
+ h .deleteRebootPod (ctx , node .Name , dc , false )
1042
1049
1043
1050
}
1044
1051
1045
- func (h * upgradeMgrHelper ) deleteRebootPod (ctx context.Context , nodeName string , dc * amdv1alpha1.DeviceConfig , force bool , genId int64 ) {
1052
+ func (h * upgradeMgrHelper ) deleteRebootPod (ctx context.Context , nodeName string , dc amdv1alpha1.DeviceConfig , force bool ) {
1046
1053
1047
1054
logger := log .FromContext (ctx )
1048
- rebootPod := h .getRebootPod (nodeName , dc )
1055
+ rebootPod := h .getRebootPod (nodeName , & dc )
1049
1056
fetchedDeviceConfig := & amdv1alpha1.DeviceConfig {}
1050
1057
pod := & v1.Pod {}
1051
1058
if err := h .client .Get (ctx , types.NamespacedName {Namespace : dc .Namespace , Name : rebootPod .Name }, pod ); err != nil {
@@ -1077,7 +1084,7 @@ func (h *upgradeMgrHelper) deleteRebootPod(ctx context.Context, nodeName string,
1077
1084
if err := h .client .Delete (ctx , rebootPod ); err != nil {
1078
1085
logger .Error (err , fmt .Sprintf ("Node: %v State: %v RebootPod Delete failed with Error: %v" , nodeName , h .getNodeStatus (nodeName ), err ))
1079
1086
}
1080
- if fetchedDeviceConfig .Generation == genId {
1087
+ if fetchedDeviceConfig .Spec . Driver . Version == dc . Spec . Driver . Version {
1081
1088
logger .Info ("Setting to In-Progress after deleting reboot pod" )
1082
1089
h .setNodeStatus (ctx , nodeName , amdv1alpha1 .UpgradeStateInProgress )
1083
1090
}
@@ -1100,7 +1107,7 @@ func (h *upgradeMgrHelper) deleteRebootPod(ctx context.Context, nodeName string,
1100
1107
logger .Error (err , "Failed to fetch DeviceConfig from API server" )
1101
1108
return
1102
1109
}
1103
- if fetchedDeviceConfig .Generation == genId {
1110
+ if fetchedDeviceConfig .Spec . Driver . Version == dc . Spec . Driver . Version {
1104
1111
logger .Info ("Setting to In-Progress after deleting reboot pod eventually" )
1105
1112
h .setNodeStatus (ctx , nodeName , amdv1alpha1 .UpgradeStateInProgress )
1106
1113
}
0 commit comments