Skip to content

Commit fe83bea

Browse files
mimowo authored and k8s-publishing-bot committed
Support handling of pod failures with respect to the specified rules
Kubernetes-commit: bf9ce70de34c93b545f95e1d81c122c81a8a0aa5
1 parent 3be517c commit fe83bea

17 files changed

+1629
-118
lines changed

batch/v1/generated.pb.go

+1,138-118
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

batch/v1/generated.proto

+99
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

batch/v1/types.go

+130
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,120 @@ const (
8787
IndexedCompletion CompletionMode = "Indexed"
8888
)
8989

90+
// PodFailurePolicyAction specifies how a Pod failure is handled.
91+
// +enum
92+
type PodFailurePolicyAction string
93+
94+
const (
95+
// PodFailurePolicyActionFailJob is an action which might be taken on a pod
96+
// failure - mark the pod's job as Failed and terminate all running pods.
97+
PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"
98+
99+
// PodFailurePolicyActionIgnore is an action which might be taken on a pod
100+
// failure - the counter towards .backoffLimit, represented by the job's
101+
// .status.failed field, is not incremented and a replacement pod is created.
102+
PodFailurePolicyActionIgnore PodFailurePolicyAction = "Ignore"
103+
104+
// PodFailurePolicyActionCount is an action which might be taken on a pod
105+
// failure - the pod failure is handled in the default way - the counter towards
106+
// .backoffLimit, represented by the job's .status.failed field, is incremented.
107+
PodFailurePolicyActionCount PodFailurePolicyAction = "Count"
108+
)
109+
110+
// PodFailurePolicyOnExitCodesOperator is the operator that relates container
// exit codes to the set of values given in an exit-code requirement.
// +enum
111+
type PodFailurePolicyOnExitCodesOperator string
112+
113+
const (
114+
// PodFailurePolicyOnExitCodesOpIn means the requirement is satisfied if at
// least one container exit code is in the set of specified values.
PodFailurePolicyOnExitCodesOpIn PodFailurePolicyOnExitCodesOperator = "In"
115+
// PodFailurePolicyOnExitCodesOpNotIn means the requirement is satisfied if at
// least one container exit code is not in the set of specified values.
PodFailurePolicyOnExitCodesOpNotIn PodFailurePolicyOnExitCodesOperator = "NotIn"
116+
)
117+
118+
// PodFailurePolicyOnExitCodesRequirement describes the requirement for handling
119+
// a failed pod based on its container exit codes. In particular, it looks up the
120+
// .state.terminated.exitCode for each app container and init container status,
121+
// represented by the .status.containerStatuses and .status.initContainerStatuses
122+
// fields in the Pod status, respectively. Containers completed with success
123+
// (exit code 0) are excluded from the requirement check.
124+
type PodFailurePolicyOnExitCodesRequirement struct {
125+
// Restricts the check for exit codes to the container with the
126+
// specified name. When null, the rule applies to all containers.
127+
// When specified, it should match one of the container or initContainer
128+
// names in the pod template.
129+
// +optional
130+
ContainerName *string `json:"containerName" protobuf:"bytes,1,opt,name=containerName"`
131+
132+
// Represents the relationship between the container exit code(s) and the
133+
// specified values. Containers completed with success (exit code 0) are
134+
// excluded from the requirement check. Possible values are:
135+
// - In: the requirement is satisfied if at least one container exit code
136+
// (might be multiple if there are multiple containers not restricted
137+
// by the 'containerName' field) is in the set of specified values.
138+
// - NotIn: the requirement is satisfied if at least one container exit code
139+
// (might be multiple if there are multiple containers not restricted
140+
// by the 'containerName' field) is not in the set of specified values.
141+
// Additional values may be added in the future. Clients should
142+
// react to an unknown operator by assuming the requirement is not satisfied.
143+
Operator PodFailurePolicyOnExitCodesOperator `json:"operator" protobuf:"bytes,2,req,name=operator"`
144+
145+
// Specifies the set of values. Each returned container exit code (might be
146+
// multiple in case of multiple containers) is checked against this set of
147+
// values with respect to the operator. The list of values must be ordered
148+
// and must not contain duplicates. Value '0' cannot be used for the In operator.
149+
// At least one element is required. At most 255 elements are allowed.
150+
// +listType=set
151+
Values []int32 `json:"values" protobuf:"varint,3,rep,name=values"`
152+
}
153+
154+
// PodFailurePolicyOnPodConditionsPattern describes a pattern for matching
155+
// an actual pod condition type.
156+
type PodFailurePolicyOnPodConditionsPattern struct {
157+
// Specifies the required Pod condition type. To match a pod condition
158+
// it is required that the specified type equals the pod condition type.
159+
Type corev1.PodConditionType `json:"type" protobuf:"bytes,1,req,name=type"`
160+
161+
// Specifies the required Pod condition status. To match a pod condition
162+
// it is required that the specified status equals the pod condition status.
163+
// Defaults to True.
164+
Status corev1.ConditionStatus `json:"status" protobuf:"bytes,2,req,name=status"`
165+
}
166+
167+
// PodFailurePolicyRule describes how a pod failure is handled when the requirements are met.
168+
// One of onExitCodes and onPodConditions, but not both, can be used in each rule.
169+
type PodFailurePolicyRule struct {
170+
// Specifies the action taken on a pod failure when the requirements are satisfied.
171+
// Possible values are:
172+
// - FailJob: indicates that the pod's job is marked as Failed and all
173+
// running pods are terminated.
174+
// - Ignore: indicates that the counter towards the .backoffLimit is not
175+
// incremented and a replacement pod is created.
176+
// - Count: indicates that the pod is handled in the default way - the
177+
// counter towards the .backoffLimit is incremented.
178+
// Additional values may be added in the future. Clients should
179+
// react to an unknown action by skipping the rule.
180+
Action PodFailurePolicyAction `json:"action" protobuf:"bytes,1,req,name=action"`
181+
182+
// Represents the requirement on the container exit codes.
183+
// +optional
184+
OnExitCodes *PodFailurePolicyOnExitCodesRequirement `json:"onExitCodes" protobuf:"bytes,2,opt,name=onExitCodes"`
185+
186+
// Represents the requirement on the pod conditions. The requirement is represented
187+
// as a list of pod condition patterns. The requirement is satisfied if at
188+
// least one pattern matches an actual pod condition. At most 20 elements are allowed.
189+
// +listType=atomic
190+
OnPodConditions []PodFailurePolicyOnPodConditionsPattern `json:"onPodConditions" protobuf:"bytes,3,opt,name=onPodConditions"`
191+
}
192+
193+
// PodFailurePolicy describes how failed pods influence the backoffLimit.
194+
type PodFailurePolicy struct {
195+
// A list of pod failure policy rules. The rules are evaluated in order.
196+
// Once a rule matches a Pod failure, the remaining rules are ignored.
197+
// When no rule matches the Pod failure, the default handling applies - the
198+
// counter of pod failures is incremented and it is checked against
199+
// the backoffLimit. At most 20 elements are allowed.
200+
// +listType=atomic
201+
Rules []PodFailurePolicyRule `json:"rules" protobuf:"bytes,1,opt,name=rules"`
202+
}
203+
90204
// JobSpec describes what the job execution will look like.
91205
type JobSpec struct {
92206

@@ -115,6 +229,19 @@ type JobSpec struct {
115229
// +optional
116230
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty" protobuf:"varint,3,opt,name=activeDeadlineSeconds"`
117231

232+
// Specifies the policy of handling failed pods. In particular, it allows to
233+
// specify the set of actions and conditions which need to be
234+
// satisfied to take the associated action.
235+
// If empty, the default behaviour applies - the counter of failed pods,
236+
// represented by the job's .status.failed field, is incremented and it is
237+
// checked against the backoffLimit. This field cannot be used in combination
238+
// with restartPolicy=OnFailure.
239+
//
240+
// This field is alpha-level. To use this field, you must enable the
241+
// `JobPodFailurePolicy` feature gate (disabled by default).
242+
// +optional
243+
PodFailurePolicy *PodFailurePolicy `json:"podFailurePolicy,omitempty" protobuf:"bytes,11,opt,name=podFailurePolicy"`
244+
118245
// Specifies the number of retries before marking this job failed.
119246
// Defaults to 6
120247
// +optional
@@ -297,6 +424,9 @@ const (
297424
JobComplete JobConditionType = "Complete"
298425
// JobFailed means the job has failed its execution.
299426
JobFailed JobConditionType = "Failed"
427+
// FailureTarget means the job is about to fail its execution.
428+
// The constant is to be renamed once the name is accepted within the KEP-3329.
429+
AlphaNoCompatGuaranteeJobFailureTarget JobConditionType = "FailureTarget"
300430
)
301431

302432
// JobCondition describes current state of a job.

0 commit comments

Comments
 (0)