16
16
17
17
package com .google .jenkins .plugins .computeengine ;
18
18
19
- import static com .google .jenkins .plugins .computeengine .ComputeEngineCloud .CLOUD_ID_LABEL_KEY ;
20
19
import static java .util .Collections .emptyList ;
21
20
22
21
import com .google .api .services .compute .model .Instance ;
22
+ import com .google .common .annotations .VisibleForTesting ;
23
23
import com .google .common .collect .ImmutableMap ;
24
+ import com .google .jenkins .plugins .computeengine .client .ComputeClientV2 ;
24
25
import hudson .Extension ;
25
26
import hudson .model .PeriodicWork ;
26
27
import hudson .model .Slave ;
27
28
import java .io .IOException ;
29
+ import java .security .GeneralSecurityException ;
30
+ import java .time .LocalDateTime ;
31
+ import java .time .OffsetDateTime ;
32
+ import java .time .ZoneOffset ;
33
+ import java .time .format .DateTimeFormatter ;
34
+ import java .time .temporal .ChronoUnit ;
28
35
import java .util .List ;
29
- import java .util .Map ;
30
36
import java .util .Set ;
31
37
import java .util .logging .Level ;
32
38
import java .util .logging .Logger ;
39
45
@ Symbol ("cleanLostNodesWork" )
40
46
public class CleanLostNodesWork extends PeriodicWork {
41
47
protected final Logger logger = Logger .getLogger (getClass ().getName ());
48
+ public static final String NODE_IN_USE_LABEL_KEY = "jenkins_node_last_refresh" ;
49
+ public static final long RECURRENCE_PERIOD = Long .parseLong (
50
+ System .getProperty (CleanLostNodesWork .class .getName () + ".recurrencePeriod" , String .valueOf (HOUR )));
51
+
52
+ @ VisibleForTesting
53
+ public static final int LOST_MULTIPLIER = 3 ;
54
+ /**
55
+ * The formatter for the label timestamp value as per google label format,
56
+ * "The value can only contain lowercase letters, numeric characters, underscores and dashes.
57
+ * The value can be at most 63 characters long. International characters are allowed".
58
+ */
59
+ private static final DateTimeFormatter formatter = DateTimeFormatter .ofPattern ("yyyy_MM_dd't'HH_mm_ss_SSS'z'" );
42
60
43
61
/** {@inheritDoc} */
44
62
@ Override
45
63
public long getRecurrencePeriod () {
46
- return HOUR ;
64
+ return RECURRENCE_PERIOD ;
65
+ }
66
+
67
+ public static String getLastRefreshLabelVal () {
68
+ return formatter .format (OffsetDateTime .now (ZoneOffset .UTC ));
47
69
}
48
70
49
71
/** {@inheritDoc} */
@@ -55,22 +77,50 @@ protected void doRun() {
55
77
56
78
private void cleanCloud (ComputeEngineCloud cloud ) {
57
79
logger .log (Level .FINEST , "Cleaning cloud " + cloud .getCloudName ());
58
- List <Instance > remoteInstances = findRemoteInstances (cloud );
80
+ ComputeClientV2 clientV2 ;
81
+ try {
82
+ clientV2 = cloud .getClientV2 ();
83
+ } catch (GeneralSecurityException | IOException ex ) {
84
+ logger .log (Level .WARNING , "Error getting clientV2 for cloud " + cloud .getCloudName (), ex );
85
+ return ;
86
+ }
87
+ List <Instance > remoteInstances = findRunningRemoteInstances (clientV2 );
59
88
Set <String > localInstances = findLocalInstances (cloud );
89
+ if (!(localInstances .isEmpty () || remoteInstances .isEmpty ())) {
90
+ updateLocalInstancesLabel (clientV2 , localInstances , remoteInstances );
91
+ }
60
92
remoteInstances .stream ()
61
93
.filter (remote -> isOrphaned (remote , localInstances ))
62
94
.forEach (remote -> terminateInstance (remote , cloud ));
63
95
}
64
96
65
97
private boolean isOrphaned (Instance remote , Set <String > localInstances ) {
66
- String instanceName = remote .getName ();
67
- logger .log (Level .FINEST , "Checking instance " + instanceName );
68
- return !localInstances .contains (instanceName );
98
+ /* It is necessary to check if the remote instance is present in localInstances.
99
+ The `remote` instance has an old timestamp because it hasn't been fetched again
100
+ after the `updateLocalInstancesLabel` call, to avoid extra network calls.
101
+ */
102
+ if (localInstances .contains (remote .getName ())) {
103
+ return false ;
104
+ }
105
+ String nodeLastRefresh = remote .getLabels ().get (NODE_IN_USE_LABEL_KEY );
106
+ if (nodeLastRefresh == null ) {
107
+ return false ;
108
+ }
109
+ OffsetDateTime lastRefresh =
110
+ LocalDateTime .parse (nodeLastRefresh , formatter ).atOffset (ZoneOffset .UTC );
111
+ boolean isOrphan = lastRefresh
112
+ .plus (RECURRENCE_PERIOD * LOST_MULTIPLIER , ChronoUnit .MILLIS )
113
+ .isBefore (OffsetDateTime .now (ZoneOffset .UTC ));
114
+ logger .log (
115
+ Level .FINEST ,
116
+ () -> "Instance " + remote .getName () + " last_refresh label value: " + nodeLastRefresh + ", isOrphan: "
117
+ + isOrphan );
118
+ return isOrphan ;
69
119
}
70
120
71
121
private void terminateInstance (Instance remote , ComputeEngineCloud cloud ) {
72
122
String instanceName = remote .getName ();
73
- logger .log (Level .INFO , "Remote instance " + instanceName + " not found locally, removing it" );
123
+ logger .log (Level .INFO , "Removing orphaned instance: " + instanceName );
74
124
try {
75
125
cloud .getClient ().terminateInstanceAsync (cloud .getProjectId (), remote .getZone (), instanceName );
76
126
} catch (IOException ex ) {
@@ -86,27 +136,47 @@ private List<ComputeEngineCloud> getClouds() {
86
136
}
87
137
88
138
private Set <String > findLocalInstances (ComputeEngineCloud cloud ) {
89
- return Jenkins .get ().getNodes ().stream ()
139
+ var localInstances = Jenkins .get ().getNodes ().stream ()
90
140
.filter (node -> node instanceof ComputeEngineInstance )
91
141
.map (node -> (ComputeEngineInstance ) node )
92
142
.filter (node -> node .getCloud ().equals (cloud ))
93
143
.map (Slave ::getNodeName )
94
144
.collect (Collectors .toSet ());
145
+ logger .log (Level .FINEST , () -> "Found " + localInstances .size () + " local instances" );
146
+ return localInstances ;
95
147
}
96
148
97
- private List <Instance > findRemoteInstances (ComputeEngineCloud cloud ) {
98
- Map <String , String > filterLabel = ImmutableMap .of (CLOUD_ID_LABEL_KEY , cloud .getInstanceId ());
149
+ private List <Instance > findRunningRemoteInstances (ComputeClientV2 clientV2 ) {
99
150
try {
100
- return cloud . getClient (). listInstancesWithLabel ( cloud . getProjectId (), filterLabel ). stream ()
101
- . filter ( instance -> shouldTerminateStatus ( instance . getStatus ()))
102
- . collect ( Collectors . toList ()) ;
151
+ var remoteInstances = clientV2 . retrieveInstanceByLabelKeyAndStatus ( NODE_IN_USE_LABEL_KEY , "RUNNING" );
152
+ logger . log ( Level . FINEST , () -> "Found " + remoteInstances . size () + " running remote instances" );
153
+ return remoteInstances ;
103
154
} catch (IOException ex ) {
104
155
logger .log (Level .WARNING , "Error finding remote instances" , ex );
105
156
return emptyList ();
106
157
}
107
158
}
108
159
109
- private boolean shouldTerminateStatus (String status ) {
110
- return !status .equals ("STOPPING" );
160
+ /**
161
+ * Updates the label of the local instances to indicate they are still in use. The method makes N network calls
162
+ * for N local instances, couldn't find any bulk update apis.
163
+ */
164
+ private void updateLocalInstancesLabel (
165
+ ComputeClientV2 clientV2 , Set <String > localInstances , List <Instance > remoteInstances ) {
166
+ var remoteInstancesByName =
167
+ remoteInstances .stream ().collect (Collectors .toMap (Instance ::getName , instance -> instance ));
168
+ var labelToUpdate = ImmutableMap .of (NODE_IN_USE_LABEL_KEY , getLastRefreshLabelVal ());
169
+ for (String instanceName : localInstances ) {
170
+ var remoteInstance = remoteInstancesByName .get (instanceName );
171
+ if (remoteInstance == null ) {
172
+ continue ;
173
+ }
174
+ try {
175
+ clientV2 .updateInstanceLabels (remoteInstance , labelToUpdate );
176
+ logger .log (Level .FINEST , () -> "Updated label for instance " + instanceName );
177
+ } catch (IOException e ) {
178
+ logger .log (Level .WARNING , "Error updating label for instance " + instanceName , e );
179
+ }
180
+ }
111
181
}
112
182
}
0 commit comments