Skip to content

Commit 261964d

Browse files
committed
[fix] Fix lost state (instanceIdsToTerminate) on configuration change
[fix] Fix maxtotaluses decrement logic add logs in post job action to expose tasks terminated with problems #322 add and fix tests
1 parent fd5e9da commit 261964d

File tree

5 files changed

+228
-177
lines changed

5 files changed

+228
-177
lines changed

src/main/java/com/amazon/jenkins/ec2fleet/EC2RetentionStrategy.java

+61-32
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919
public class EC2RetentionStrategy extends RetentionStrategy<SlaveComputer> implements ExecutorListener {
2020

21-
private static final int RE_CHECK_IN_MINUTE = 1;
21+
private static final int RE_CHECK_IN_A_MINUTE = 1;
2222

2323
private static final Logger LOGGER = Logger.getLogger(EC2RetentionStrategy.class.getName());
2424

@@ -42,52 +42,62 @@ public long check(final SlaveComputer computer) {
4242
if (cloud == null) {
4343
LOGGER.warning("Cloud is null for computer " + fc.getDisplayName()
4444
+ ". This should be autofixed in a few minutes, if not please create an issue for the plugin");
45-
return RE_CHECK_IN_MINUTE;
45+
return RE_CHECK_IN_A_MINUTE;
4646
}
4747

4848
// Ensure that the EC2FleetCloud cannot be mutated from under us while
4949
// we're doing this check
5050
// Ensure nobody provisions onto this node until we've done
5151
// checking
5252
boolean shouldAcceptTasks = fc.isAcceptingTasks();
53-
boolean justTerminated = false;
53+
boolean markedForTermination = false;
5454
fc.setAcceptingTasks(false);
5555
try {
5656
if(fc.isIdle()) {
57-
final EC2AgentTerminationReason reason;
58-
if (isIdleForTooLong(cloud, fc)) {
59-
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
57+
Node node = fc.getNode();
58+
if (node == null) {
59+
return RE_CHECK_IN_A_MINUTE;
60+
}
61+
62+
EC2AgentTerminationReason reason;
63+
// Determine the reason for termination from specific to generic use cases.
64+
// Reasoning for checking all cases of termination initiated by the plugin:
65+
// A user-initiated change to cloud configuration creates a new EC2FleetCloud object, erasing class fields containing data like instance IDs to terminate.
66+
// Hence, determine the reasons for termination here using persisted fields for accurate handling of termination.
67+
if (fc.isMarkedForDeletion()) {
68+
reason = EC2AgentTerminationReason.AGENT_DELETED;
6069
} else if (cloud.hasExcessCapacity()) {
6170
reason = EC2AgentTerminationReason.EXCESS_CAPACITY;
71+
} else if (cloud instanceof EC2FleetCloud && !((EC2FleetCloud) cloud).hasUnlimitedUsesForNodes()
72+
&& ((EC2FleetNode)node).getUsesRemaining() <= 0) {
73+
reason = EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED;
74+
} else if (isIdleForTooLong(cloud, fc)) {
75+
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
6276
} else {
63-
return 0;
64-
}
65-
66-
// Find instance ID
67-
Node compNode = fc.getNode();
68-
if (compNode == null) {
69-
return 0;
77+
return RE_CHECK_IN_A_MINUTE;
7078
}
7179

72-
final String instanceId = compNode.getNodeName();
73-
if (cloud.scheduleToTerminate(instanceId, false, reason)) {
80+
final String instanceId = node.getNodeName();
81+
final boolean ignoreMinConstraints = reason.equals(EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
82+
if (cloud.scheduleToTerminate(instanceId, ignoreMinConstraints, reason)) {
7483
// Instance successfully scheduled for termination, so no longer accept tasks (i.e. suspended)
7584
shouldAcceptTasks = false;
7685
LOGGER.fine(String.format("Suspended node %s after scheduling instance for termination, reason: %s.",
77-
compNode.getDisplayName(), instanceId, reason));
78-
justTerminated = true;
86+
node.getDisplayName(), instanceId, reason));
87+
markedForTermination = true;
7988
}
8089
}
8190

82-
if (cloud.isAlwaysReconnect() && !justTerminated && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
91+
// if connection to the computer is lost for some reason, try to reconnect if configured to do so.
92+
if (cloud.isAlwaysReconnect() && !markedForTermination && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
8393
LOGGER.log(Level.INFO, "Reconnecting to instance: " + fc.getDisplayName());
8494
fc.tryReconnect();
8595
}
8696
} finally {
8797
fc.setAcceptingTasks(shouldAcceptTasks);
8898
}
8999

90-
return RE_CHECK_IN_MINUTE;
100+
return RE_CHECK_IN_A_MINUTE;
91101
}
92102

93103
@Override
@@ -121,37 +131,56 @@ public void taskAccepted(Executor executor, Queue.Task task) {
121131
final EC2FleetNode ec2FleetNode = computer.getNode();
122132
if (ec2FleetNode != null) {
123133
final int maxTotalUses = ec2FleetNode.getMaxTotalUses();
124-
if (maxTotalUses <= -1) {
125-
LOGGER.fine("maxTotalUses set to unlimited (" + ec2FleetNode.getMaxTotalUses() + ") for agent " + computer.getName());
126-
} else if (maxTotalUses <= 1) {
127-
LOGGER.info("maxTotalUses drained - suspending agent after current build " + computer.getName());
128-
computer.setAcceptingTasks(false);
129-
} else {
130-
ec2FleetNode.setMaxTotalUses(ec2FleetNode.getMaxTotalUses() - 1);
131-
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getMaxTotalUses() + " builds left");
134+
if (maxTotalUses <= -1) { // unlimited uses
135+
LOGGER.fine("maxTotalUses set to unlimited (" + maxTotalUses + ") for agent " + computer.getName());
136+
} else { // limited uses
137+
if (ec2FleetNode.getUsesRemaining() > 1) {
138+
ec2FleetNode.decrementUsesRemaining();
139+
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getUsesRemaining() + " builds left");
140+
} else if (ec2FleetNode.getUsesRemaining() == 1) { // current task should be the last task for this agent
141+
LOGGER.info(String.format("maxTotalUses drained - suspending agent %s after current build", computer.getName()));
142+
computer.setAcceptingTasks(false);
143+
ec2FleetNode.decrementUsesRemaining();
144+
} else {
145+
// don't decrement when usesRemaining=0, as -1 has a special meaning.
146+
LOGGER.warning(String.format("Agent %s accepted a task after being suspended!!! MaxTotalUses: %d, uses remaining: %d",
147+
computer.getName(), ec2FleetNode.getMaxTotalUses(), ec2FleetNode.getUsesRemaining()));
148+
}
132149
}
133150
}
134151
}
135152
}
136153

137154
@Override
138155
public void taskCompleted(Executor executor, Queue.Task task, long l) {
139-
postJobAction(executor);
156+
postJobAction(executor, null);
140157
}
141158

142159
@Override
143160
public void taskCompletedWithProblems(Executor executor, Queue.Task task, long l, Throwable throwable) {
144-
postJobAction(executor);
161+
postJobAction(executor, throwable);
145162
}
146163

147-
private void postJobAction(Executor executor) {
164+
private void postJobAction(final Executor executor, final Throwable throwable) {
165+
if (throwable != null) {
166+
LOGGER.warning(String.format("Build %s completed with problems on agent %s. TimeSpentInQueue: %ds, duration: %ds, problems: %s",
167+
executor.getCurrentExecutable(), executor.getOwner().getName(),
168+
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
169+
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime()), throwable.getMessage()));
170+
} else {
171+
LOGGER.info(String.format("Build %s completed successfully on agent %s. TimeSpentInQueue: %ds, duration: %ds.",
172+
executor.getCurrentExecutable(), executor.getOwner().getName(),
173+
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
174+
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime())));
175+
}
176+
148177
final EC2FleetNodeComputer computer = (EC2FleetNodeComputer) executor.getOwner();
149-
if(computer != null) {
178+
if (computer != null) {
150179
final EC2FleetNode ec2FleetNode = computer.getNode();
151180
if (ec2FleetNode != null) {
152181
final AbstractEC2FleetCloud cloud = ec2FleetNode.getCloud();
153182
if (computer.countBusy() <= 1 && !computer.isAcceptingTasks()) {
154-
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to maxTotalUses (" + ec2FleetNode.getMaxTotalUses() + ")");
183+
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to exhausted maxTotalUses.");
155184
// Schedule instance for termination even if it breaches minSize and minSpareSize constraints
156185
cloud.scheduleToTerminate(ec2FleetNode.getNodeName(), true, EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
157186
}

src/test/java/com/amazon/jenkins/ec2fleet/EC2MaxTotalUsesRetentionStrategyTest.java

-73
This file was deleted.

0 commit comments

Comments
 (0)