Skip to content

Commit 46d6731

Browse files
authored
Fix 322 and 363 (#376)
* [fix] Terminate scheduled instances ONLY IF idle #363 * [fix] leave maxTotalUses alone and track remainingUses correctly add a flag to track termination of agents by plugin * [fix] Fix lost state (instanceIdsToTerminate) on configuration change [fix] Fix maxtotaluses decrement logic add logs in post job action to expose tasks terminated with problems #322 add and fix tests * add integration tests for configuration change leading to lost state and rebuilding lost state to terminate instances previously marked for termination
1 parent 9450fa8 commit 46d6731

10 files changed

+512
-197
lines changed

src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetCloud.java

+30-7
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import com.amazonaws.services.ec2.model.Instance;
1010
import com.amazonaws.services.ec2.model.InstanceStateName;
1111
import com.cloudbees.jenkins.plugins.awscredentials.AWSCredentialsHelper;
12+
import com.google.common.collect.Sets;
1213
import hudson.Extension;
1314
import hudson.model.Computer;
1415
import hudson.model.Descriptor;
@@ -51,6 +52,7 @@
5152
import java.util.logging.Level;
5253
import java.util.logging.Logger;
5354
import java.util.logging.SimpleFormatter;
55+
import java.util.stream.Collectors;
5456

5557
/**
5658
* @see CloudNanny
@@ -212,7 +214,7 @@ public EC2FleetCloud(final String name,
212214
this.minSize = Math.max(0, minSize);
213215
this.maxSize = maxSize;
214216
this.minSpareSize = Math.max(0, minSpareSize);
215-
this.maxTotalUses = StringUtils.isBlank(maxTotalUses) ? -1 : Integer.parseInt(maxTotalUses);
217+
this.maxTotalUses = StringUtils.isBlank(maxTotalUses) ? DEFAULT_MAX_TOTAL_USES : Integer.parseInt(maxTotalUses);
216218
this.numExecutors = Math.max(numExecutors, 1);
217219
this.addNodeOnlyIfRunning = addNodeOnlyIfRunning;
218220
this.restrictUsage = restrictUsage;
@@ -284,10 +286,6 @@ public String getEndpoint() {
284286
return endpoint;
285287
}
286288

287-
public int getMaxTotalUses() {
288-
return maxTotalUses == null ? DEFAULT_MAX_TOTAL_USES : maxTotalUses;
289-
}
290-
291289
@Override
292290
public String getFleet() {
293291
return fleet;
@@ -382,6 +380,11 @@ synchronized void setStats(final FleetStateStats stats) {
382380
this.stats = stats;
383381
}
384382

383+
// make maxTotalUses inaccessible from cloud for safety. Use {@link EC2FleetNode#maxTotalUses} and {@link EC2FleetNode#usesRemaining} instead.
384+
public boolean hasUnlimitedUsesForNodes() {
385+
return maxTotalUses == -1;
386+
}
387+
385388
@Override
386389
public synchronized boolean hasExcessCapacity() {
387390
if(stats == null) {
@@ -502,7 +505,9 @@ public FleetStateStats update() {
502505
}
503506
}
504507
currentToAdd = toAdd;
505-
currentInstanceIdsToTerminate = new HashMap<>(instanceIdsToTerminate);
508+
509+
// for computers currently busy doing work, wait until next update cycle to terminate corresponding instances (issue#363).
510+
currentInstanceIdsToTerminate = filterOutBusyNodes();
506511
}
507512

508513
currentState = updateByState(currentToAdd, currentInstanceIdsToTerminate, currentState);
@@ -533,6 +538,24 @@ public FleetStateStats update() {
533538
}
534539
}
535540

541+
private Map<String, EC2AgentTerminationReason> filterOutBusyNodes() {
542+
final Jenkins j = Jenkins.get();
543+
final Map<String, EC2AgentTerminationReason> filteredInstanceIdsToTerminate = instanceIdsToTerminate.entrySet()
544+
.stream()
545+
.filter(e -> {
546+
final Computer c = j.getComputer(e.getKey());
547+
return c == null || c.isIdle();
548+
})
549+
.collect(Collectors.toMap(Map.Entry::getKey,Map.Entry::getValue));
550+
551+
final Set<String> filteredOutNonIdleIds = Sets.difference(instanceIdsToTerminate.keySet(), filteredInstanceIdsToTerminate.keySet());
552+
if (filteredOutNonIdleIds.size() > 0) {
553+
info("Skipping termination of the following instances until the next update cycle, as they are still busy doing some work: %s.", filteredOutNonIdleIds);
554+
}
555+
556+
return filteredInstanceIdsToTerminate;
557+
}
558+
536559
public boolean removePlannedNodeScheduledFutures(final int numToRemove) {
537560
if (numToRemove < 1) {
538561
return false;
@@ -829,7 +852,7 @@ private void addNewSlave(final AmazonEC2 ec2, final Instance instance, FleetStat
829852
final Node.Mode nodeMode = restrictUsage ? Node.Mode.EXCLUSIVE : Node.Mode.NORMAL;
830853
final EC2FleetNode node = new EC2FleetNode(instanceId, "Fleet slave for " + instanceId,
831854
effectiveFsRoot, effectiveNumExecutors, nodeMode, labelString, new ArrayList<NodeProperty<?>>(),
832-
this, computerLauncher, getMaxTotalUses());
855+
this, computerLauncher, maxTotalUses);
833856

834857
// Initialize our retention strategy
835858
node.setRetentionStrategy(new EC2RetentionStrategy());

src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetNode.java

+9-3
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
public class EC2FleetNode extends Slave implements EphemeralNode, EC2FleetCloudAware {
1818

1919
private volatile AbstractEC2FleetCloud cloud;
20-
private int maxTotalUses;
20+
private final int maxTotalUses;
21+
private int usesRemaining;
2122

2223
public EC2FleetNode(final String name, final String nodeDescription, final String remoteFS, final int numExecutors, final Mode mode, final String label,
2324
final List<? extends NodeProperty<?>> nodeProperties, final AbstractEC2FleetCloud cloud, ComputerLauncher launcher, final int maxTotalUses) throws IOException, Descriptor.FormException {
@@ -26,6 +27,7 @@ public EC2FleetNode(final String name, final String nodeDescription, final Strin
2627
launcher, RetentionStrategy.NOOP, nodeProperties);
2728
this.cloud = cloud;
2829
this.maxTotalUses = maxTotalUses;
30+
this.usesRemaining = maxTotalUses;
2931
}
3032

3133
@Override
@@ -61,8 +63,12 @@ public int getMaxTotalUses() {
6163
return this.maxTotalUses;
6264
}
6365

64-
public void setMaxTotalUses(final int maxTotalUses) {
65-
this.maxTotalUses = maxTotalUses;
66+
public int getUsesRemaining() {
67+
return usesRemaining;
68+
}
69+
70+
public void decrementUsesRemaining() {
71+
this.usesRemaining--;
6672
}
6773

6874
@Extension

src/main/java/com/amazon/jenkins/ec2fleet/EC2FleetNodeComputer.java

+11-4
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@
1717
public class EC2FleetNodeComputer extends SlaveComputer implements EC2FleetCloudAware {
1818

1919
private final String name;
20-
2120
private volatile AbstractEC2FleetCloud cloud;
21+
private boolean isMarkedForDeletion;
2222

2323
public EC2FleetNodeComputer(final Slave slave, @Nonnull final String name, @Nonnull final AbstractEC2FleetCloud cloud) {
2424
super(slave);
2525
this.name = name;
2626
this.cloud = cloud;
27+
this.isMarkedForDeletion = false;
28+
}
29+
30+
public boolean isMarkedForDeletion() {
31+
return isMarkedForDeletion;
2732
}
2833

2934
@Override
@@ -44,9 +49,9 @@ public String getDisplayName() {
4449
final String displayName = String.format("%s %s", cloud.getDisplayName(), name);
4550
final EC2FleetNode node = getNode();
4651
if(node != null) {
47-
final int totalUses = node.getMaxTotalUses();
48-
if(totalUses != -1) {
49-
return String.format("%s Builds left: %d ", displayName, totalUses);
52+
final int usesRemaining = node.getUsesRemaining();
53+
if(usesRemaining != -1) {
54+
return String.format("%s Builds left: %d ", displayName, usesRemaining);
5055
}
5156
}
5257
return displayName;
@@ -82,6 +87,8 @@ public HttpResponse doDoDelete() throws IOException {
8287
final AbstractEC2FleetCloud cloud = node.getCloud();
8388
if (cloud != null && StringUtils.isNotBlank(instanceId)) {
8489
cloud.scheduleToTerminate(instanceId, false, EC2AgentTerminationReason.AGENT_DELETED);
90+
// Persist a flag here as the cloud objects can be re-created on user-initiated changes, hence, losing track of instance ids scheduled to terminate.
91+
this.isMarkedForDeletion = true;
8592
}
8693
}
8794
return super.doDoDelete();

src/main/java/com/amazon/jenkins/ec2fleet/EC2RetentionStrategy.java

+61-32
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
*/
1919
public class EC2RetentionStrategy extends RetentionStrategy<SlaveComputer> implements ExecutorListener {
2020

21-
private static final int RE_CHECK_IN_MINUTE = 1;
21+
private static final int RE_CHECK_IN_A_MINUTE = 1;
2222

2323
private static final Logger LOGGER = Logger.getLogger(EC2RetentionStrategy.class.getName());
2424

@@ -42,52 +42,62 @@ public long check(final SlaveComputer computer) {
4242
if (cloud == null) {
4343
LOGGER.warning("Cloud is null for computer " + fc.getDisplayName()
4444
+ ". This should be autofixed in a few minutes, if not please create an issue for the plugin");
45-
return RE_CHECK_IN_MINUTE;
45+
return RE_CHECK_IN_A_MINUTE;
4646
}
4747

4848
// Ensure that the EC2FleetCloud cannot be mutated from under us while
4949
// we're doing this check
5050
// Ensure nobody provisions onto this node until we've done
5151
// checking
5252
boolean shouldAcceptTasks = fc.isAcceptingTasks();
53-
boolean justTerminated = false;
53+
boolean markedForTermination = false;
5454
fc.setAcceptingTasks(false);
5555
try {
5656
if(fc.isIdle()) {
57-
final EC2AgentTerminationReason reason;
58-
if (isIdleForTooLong(cloud, fc)) {
59-
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
57+
Node node = fc.getNode();
58+
if (node == null) {
59+
return RE_CHECK_IN_A_MINUTE;
60+
}
61+
62+
EC2AgentTerminationReason reason;
63+
// Determine the reason for termination from specific to generic use cases.
64+
// Reasoning for checking all cases of termination initiated by the plugin:
65+
// A user-initiated change to cloud configuration creates a new EC2FleetCloud object, erasing class fields containing data like instance IDs to terminate.
66+
// Hence, determine the reasons for termination here using persisted fields for accurate handling of termination.
67+
if (fc.isMarkedForDeletion()) {
68+
reason = EC2AgentTerminationReason.AGENT_DELETED;
6069
} else if (cloud.hasExcessCapacity()) {
6170
reason = EC2AgentTerminationReason.EXCESS_CAPACITY;
71+
} else if (cloud instanceof EC2FleetCloud && !((EC2FleetCloud) cloud).hasUnlimitedUsesForNodes()
72+
&& ((EC2FleetNode)node).getUsesRemaining() <= 0) {
73+
reason = EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED;
74+
} else if (isIdleForTooLong(cloud, fc)) {
75+
reason = EC2AgentTerminationReason.IDLE_FOR_TOO_LONG;
6276
} else {
63-
return 0;
64-
}
65-
66-
// Find instance ID
67-
Node compNode = fc.getNode();
68-
if (compNode == null) {
69-
return 0;
77+
return RE_CHECK_IN_A_MINUTE;
7078
}
7179

72-
final String instanceId = compNode.getNodeName();
73-
if (cloud.scheduleToTerminate(instanceId, false, reason)) {
80+
final String instanceId = node.getNodeName();
81+
final boolean ignoreMinConstraints = reason.equals(EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
82+
if (cloud.scheduleToTerminate(instanceId, ignoreMinConstraints, reason)) {
7483
// Instance successfully scheduled for termination, so no longer accept tasks (i.e. suspended)
7584
shouldAcceptTasks = false;
7685
LOGGER.fine(String.format("Suspended node %s after scheduling instance for termination, reason: %s.",
77-
compNode.getDisplayName(), instanceId, reason));
78-
justTerminated = true;
86+
node.getDisplayName(), instanceId, reason));
87+
markedForTermination = true;
7988
}
8089
}
8190

82-
if (cloud.isAlwaysReconnect() && !justTerminated && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
91+
// if connection to the computer is lost for some reason, try to reconnect if configured to do so.
92+
if (cloud.isAlwaysReconnect() && !markedForTermination && fc.isOffline() && !fc.isConnecting() && fc.isLaunchSupported()) {
8393
LOGGER.log(Level.INFO, "Reconnecting to instance: " + fc.getDisplayName());
8494
fc.tryReconnect();
8595
}
8696
} finally {
8797
fc.setAcceptingTasks(shouldAcceptTasks);
8898
}
8999

90-
return RE_CHECK_IN_MINUTE;
100+
return RE_CHECK_IN_A_MINUTE;
91101
}
92102

93103
@Override
@@ -121,37 +131,56 @@ public void taskAccepted(Executor executor, Queue.Task task) {
121131
final EC2FleetNode ec2FleetNode = computer.getNode();
122132
if (ec2FleetNode != null) {
123133
final int maxTotalUses = ec2FleetNode.getMaxTotalUses();
124-
if (maxTotalUses <= -1) {
125-
LOGGER.fine("maxTotalUses set to unlimited (" + ec2FleetNode.getMaxTotalUses() + ") for agent " + computer.getName());
126-
} else if (maxTotalUses <= 1) {
127-
LOGGER.info("maxTotalUses drained - suspending agent after current build " + computer.getName());
128-
computer.setAcceptingTasks(false);
129-
} else {
130-
ec2FleetNode.setMaxTotalUses(ec2FleetNode.getMaxTotalUses() - 1);
131-
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getMaxTotalUses() + " builds left");
134+
if (maxTotalUses <= -1) { // unlimited uses
135+
LOGGER.fine("maxTotalUses set to unlimited (" + maxTotalUses + ") for agent " + computer.getName());
136+
} else { // limited uses
137+
if (ec2FleetNode.getUsesRemaining() > 1) {
138+
ec2FleetNode.decrementUsesRemaining();
139+
LOGGER.info("Agent " + computer.getName() + " has " + ec2FleetNode.getUsesRemaining() + " builds left");
140+
} else if (ec2FleetNode.getUsesRemaining() == 1) { // current task should be the last task for this agent
141+
LOGGER.info(String.format("maxTotalUses drained - suspending agent %s after current build", computer.getName()));
142+
computer.setAcceptingTasks(false);
143+
ec2FleetNode.decrementUsesRemaining();
144+
} else {
145+
// don't decrement when usesRemaining=0, as -1 has a special meaning.
146+
LOGGER.warning(String.format("Agent %s accepted a task after being suspended!!! MaxTotalUses: %d, uses remaining: %d",
147+
computer.getName(), ec2FleetNode.getMaxTotalUses(), ec2FleetNode.getUsesRemaining()));
148+
}
132149
}
133150
}
134151
}
135152
}
136153

137154
@Override
138155
public void taskCompleted(Executor executor, Queue.Task task, long l) {
139-
postJobAction(executor);
156+
postJobAction(executor, null);
140157
}
141158

142159
@Override
143160
public void taskCompletedWithProblems(Executor executor, Queue.Task task, long l, Throwable throwable) {
144-
postJobAction(executor);
161+
postJobAction(executor, throwable);
145162
}
146163

147-
private void postJobAction(Executor executor) {
164+
private void postJobAction(final Executor executor, final Throwable throwable) {
165+
if (throwable != null) {
166+
LOGGER.warning(String.format("Build %s completed with problems on agent %s. TimeSpentInQueue: %ds, duration: %ds, problems: %s",
167+
executor.getCurrentExecutable(), executor.getOwner().getName(),
168+
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
169+
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime()), throwable.getMessage()));
170+
} else {
171+
LOGGER.info(String.format("Build %s completed successfully on agent %s. TimeSpentInQueue: %ds, duration: %ds.",
172+
executor.getCurrentExecutable(), executor.getOwner().getName(),
173+
TimeUnit.MILLISECONDS.toSeconds(executor.getTimeSpentInQueue()),
174+
TimeUnit.MILLISECONDS.toSeconds(executor.getElapsedTime())));
175+
}
176+
148177
final EC2FleetNodeComputer computer = (EC2FleetNodeComputer) executor.getOwner();
149-
if(computer != null) {
178+
if (computer != null) {
150179
final EC2FleetNode ec2FleetNode = computer.getNode();
151180
if (ec2FleetNode != null) {
152181
final AbstractEC2FleetCloud cloud = ec2FleetNode.getCloud();
153182
if (computer.countBusy() <= 1 && !computer.isAcceptingTasks()) {
154-
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to maxTotalUses (" + ec2FleetNode.getMaxTotalUses() + ")");
183+
LOGGER.info("Calling scheduleToTerminate for node " + ec2FleetNode.getNodeName() + " due to exhausted maxTotalUses.");
155184
// Schedule instance for termination even if it breaches minSize and minSpareSize constraints
156185
cloud.scheduleToTerminate(ec2FleetNode.getNodeName(), true, EC2AgentTerminationReason.MAX_TOTAL_USES_EXHAUSTED);
157186
}

0 commit comments

Comments
 (0)