@@ -87,6 +87,120 @@ const (
87
87
IndexedCompletion CompletionMode = "Indexed"
88
88
)
89
89
90
// PodFailurePolicyAction specifies how a Pod failure is handled.
// +enum
type PodFailurePolicyAction string

const (
	// PodFailurePolicyActionFailJob is an action which might be taken on a
	// pod failure - mark the pod's job as Failed and terminate all running
	// pods.
	PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"

	// PodFailurePolicyActionIgnore is an action which might be taken on a
	// pod failure - the counter towards .backoffLimit, represented by the
	// job's .status.failed field, is not incremented and a replacement pod
	// is created.
	PodFailurePolicyActionIgnore PodFailurePolicyAction = "Ignore"

	// PodFailurePolicyActionCount is an action which might be taken on a
	// pod failure - the pod failure is handled in the default way - the
	// counter towards .backoffLimit, represented by the job's .status.failed
	// field, is incremented.
	PodFailurePolicyActionCount PodFailurePolicyAction = "Count"
)
109
+
110
// PodFailurePolicyOnExitCodesOperator is the operator relating the container
// exit codes of a failed pod to the set of values specified in a
// PodFailurePolicyOnExitCodesRequirement.
// +enum
type PodFailurePolicyOnExitCodesOperator string

const (
	// PodFailurePolicyOnExitCodesOpIn means the requirement is satisfied if
	// at least one container exit code is in the set of specified values.
	PodFailurePolicyOnExitCodesOpIn PodFailurePolicyOnExitCodesOperator = "In"

	// PodFailurePolicyOnExitCodesOpNotIn means the requirement is satisfied
	// if at least one container exit code is not in the set of specified
	// values.
	PodFailurePolicyOnExitCodesOpNotIn PodFailurePolicyOnExitCodesOperator = "NotIn"
)
117
+
118
+ // PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
119
+ // a failed pod based on its container exit codes. In particular, it lookups the
120
+ // .state.terminated.exitCode for each app container and init container status,
121
+ // represented by the .status.containerStatuses and .status.initContainerStatuses
122
+ // fields in the Pod status, respectively. Containers completed with success
123
+ // (exit code 0) are excluded from the requirement check.
124
+ type PodFailurePolicyOnExitCodesRequirement struct {
125
+ // Restricts the check for exit codes to the container with the
126
+ // specified name. When null, the rule applies to all containers.
127
+ // When specified, it should match one the container or initContainer
128
+ // names in the pod template.
129
+ // +optional
130
+ ContainerName * string `json:"containerName" protobuf:"bytes,1,opt,name=containerName"`
131
+
132
+ // Represents the relationship between the container exit code(s) and the
133
+ // specified values. Containers completed with success (exit code 0) are
134
+ // excluded from the requirement check. Possible values are:
135
+ // - In: the requirement is satisfied if at least one container exit code
136
+ // (might be multiple if there are multiple containers not restricted
137
+ // by the 'containerName' field) is in the set of specified values.
138
+ // - NotIn: the requirement is satisfied if at least one container exit code
139
+ // (might be multiple if there are multiple containers not restricted
140
+ // by the 'containerName' field) is not in the set of specified values.
141
+ // Additional values are considered to be added in the future. Clients should
142
+ // react to an unknown operator by assuming the requirement is not satisfied.
143
+ Operator PodFailurePolicyOnExitCodesOperator `json:"operator" protobuf:"bytes,2,req,name=operator"`
144
+
145
+ // Specifies the set of values. Each returned container exit code (might be
146
+ // multiple in case of multiple containers) is checked against this set of
147
+ // values with respect to the operator. The list of values must be ordered
148
+ // and must not contain duplicates. Value '0' cannot be used for the In operator.
149
+ // At least one element is required. At most 255 elements are allowed.
150
+ // +listType=set
151
+ Values []int32 `json:"values" protobuf:"varint,3,rep,name=values"`
152
+ }
153
+
154
+ // PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
155
+ // an actual pod condition type.
156
+ type PodFailurePolicyOnPodConditionsPattern struct {
157
+ // Specifies the required Pod condition type. To match a pod condition
158
+ // it is required that specified type equals the pod condition type.
159
+ Type corev1.PodConditionType `json:"type" protobuf:"bytes,1,req,name=type"`
160
+
161
+ // Specifies the required Pod condition status. To match a pod condition
162
+ // it is required that the specified status equals the pod condition status.
163
+ // Defaults to True.
164
+ Status corev1.ConditionStatus `json:"status" protobuf:"bytes,2,req,name=status"`
165
+ }
166
+
167
+ // PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
168
+ // One of OnExitCodes and onPodConditions, but not both, can be used in each rule.
169
+ type PodFailurePolicyRule struct {
170
+ // Specifies the action taken on a pod failure when the requirements are satisfied.
171
+ // Possible values are:
172
+ // - FailJob: indicates that the pod's job is marked as Failed and all
173
+ // running pods are terminated.
174
+ // - Ignore: indicates that the counter towards the .backoffLimit is not
175
+ // incremented and a replacement pod is created.
176
+ // - Count: indicates that the pod is handled in the default way - the
177
+ // counter towards the .backoffLimit is incremented.
178
+ // Additional values are considered to be added in the future. Clients should
179
+ // react to an unknown action by skipping the rule.
180
+ Action PodFailurePolicyAction `json:"action" protobuf:"bytes,1,req,name=action"`
181
+
182
+ // Represents the requirement on the container exit codes.
183
+ // +optional
184
+ OnExitCodes * PodFailurePolicyOnExitCodesRequirement `json:"onExitCodes" protobuf:"bytes,2,opt,name=onExitCodes"`
185
+
186
+ // Represents the requirement on the pod conditions. The requirement is represented
187
+ // as a list of pod condition patterns. The requirement is satisfied if at
188
+ // least one pattern matches an actual pod condition. At most 20 elements are allowed.
189
+ // +listType=atomic
190
+ OnPodConditions []PodFailurePolicyOnPodConditionsPattern `json:"onPodConditions" protobuf:"bytes,3,opt,name=onPodConditions"`
191
+ }
192
+
193
+ // PodFailurePolicy describes how failed pods influence the backoffLimit.
194
+ type PodFailurePolicy struct {
195
+ // A list of pod failure policy rules. The rules are evaluated in order.
196
+ // Once a rule matches a Pod failure, the remaining of the rules are ignored.
197
+ // When no rule matches the Pod failure, the default handling applies - the
198
+ // counter of pod failures is incremented and it is checked against
199
+ // the backoffLimit. At most 20 elements are allowed.
200
+ // +listType=atomic
201
+ Rules []PodFailurePolicyRule `json:"rules" protobuf:"bytes,1,opt,name=rules"`
202
+ }
203
+
90
204
// JobSpec describes how the job execution will look like.
91
205
type JobSpec struct {
92
206
@@ -115,6 +229,19 @@ type JobSpec struct {
115
229
// +optional
116
230
ActiveDeadlineSeconds * int64 `json:"activeDeadlineSeconds,omitempty" protobuf:"varint,3,opt,name=activeDeadlineSeconds"`
117
231
232
+ // Specifies the policy of handling failed pods. In particular, it allows to
233
+ // specify the set of actions and conditions which need to be
234
+ // satisfied to take the associated action.
235
+ // If empty, the default behaviour applies - the counter of failed pods,
236
+ // represented by the job's .status.failed field, is incremented and it is
237
+ // checked against the backoffLimit. This field cannot be used in combination
238
+ // with restartPolicy=OnFailure.
239
+ //
240
+ // This field is alpha-level. To use this field, you must enable the
241
+ // `JobPodFailurePolicy` feature gate (disabled by default).
242
+ // +optional
243
+ PodFailurePolicy * PodFailurePolicy `json:"podFailurePolicy,omitempty" protobuf:"bytes,11,opt,name=podFailurePolicy"`
244
+
118
245
// Specifies the number of retries before marking this job failed.
119
246
// Defaults to 6
120
247
// +optional
@@ -297,6 +424,9 @@ const (
297
424
JobComplete JobConditionType = "Complete"
298
425
// JobFailed means the job has failed its execution.
299
426
JobFailed JobConditionType = "Failed"
427
+ // FailureTarget means the job is about to fail its execution.
428
+ // The constant is to be renamed once the name is accepted within the KEP-3329.
429
+ AlphaNoCompatGuaranteeJobFailureTarget JobConditionType = "FailureTarget"
300
430
)
301
431
302
432
// JobCondition describes current state of a job.
0 commit comments