Skip to content

Commit 952b6ae

Browse files
INSIGHTS-157 - PDB <> HPA check (#1057)
* fix typo * fix failure message * fix changelog * fix missingPodDisruptionBudget validation * add tests for pdbMinAvailableLessThenHPAMaxReplicas * add simple success test * fix typo * lowercasing warnings * WIP implement pdbMinAvailableLessThanHPAMaxReplicas * change check name * rename testes * fix check message * change check name * minor fixes * improving tests * improve tests * fix check name * Update docs/checks/reliability.md Co-authored-by: Andy Suderman <[email protected]> * fix/add tests * fixes from PR * fix error message --------- Co-authored-by: Andy Suderman <[email protected]>
1 parent 875a8ff commit 952b6ae

26 files changed

+708
-23
lines changed

docs/checks/reliability.md

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ key | default | description
2121
`topologySpreadConstraint` | `warning` | Fails when there is no topology spread constraint on the pod
2222
`hpaMaxAvailability` | `warning` | Fails when `maxAvailable` lesser or equal than `minAvailable` (if defined) for a HorizontalPodAutoscaler
2323
`hpaMinAvailability` | `warning` | Fails when `minAvailable` (if defined) lesser or equal to one for a HorizontalPodAutoscaler
24+
`pdbMinAvailableGreaterThanHPAMinReplicas` | `warning` | Fails when PDB `minAvailable` is greater than HPA `minReplicas`
2425

2526
## Background
2627

pkg/config/checks.go

+1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ var (
6969
"rolebindingClusterAdminRole",
7070
"hpaMaxAvailability",
7171
"hpaMinAvailability",
72+
"pdbMinAvailableGreaterThanHPAMinReplicas",
7273
}
7374

7475
// BuiltInChecks contains the checks that come pre-installed w/ Polaris

pkg/config/checks/missingPodDisruptionBudget.yaml

+4-8
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,23 @@ controllers:
88
schema:
99
"$schema": http://json-schema.org/draft-07/schema#
1010
type: object
11+
required: [spec]
1112
properties:
1213
spec:
1314
type: object
15+
required: [template]
1416
properties:
1517
template:
1618
type: object
19+
required: [metadata]
1720
properties:
1821
metadata:
1922
type: object
23+
required: [labels]
2024
properties:
2125
labels:
2226
type: object
2327
minProperties: 1
24-
required:
25-
- labels
26-
required:
27-
- metadata
28-
required:
29-
- template
30-
required:
31-
- spec
3228
additionalSchemaStrings:
3329
policy/PodDisruptionBudget: |
3430
type: object
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
successMessage: PDB and HPA are correctly configured
2+
failureMessage: PDB minAvailable is greater than HPA minReplicas
3+
category: Reliability
4+
target: Controller
5+
controllers:
6+
include:
7+
- Deployment

pkg/config/default.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ checks:
1212
topologySpreadConstraint: warning
1313
hpaMaxAvailability: warning
1414
hpaMinAvailability: warning
15+
pdbMinAvailableGreaterThanHPAMinReplicas: warning
1516

1617
# efficiency
1718
cpuRequestsMissing: warning

pkg/config/examples/config-full.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ checks:
1212
metadataAndInstanceMismatched: warning
1313
hpaMaxAvailability: warning
1414
hpaMinAvailability: warning
15+
pdbMinAvailableGreaterThanHPAMinReplicas: warning
1516

1617
# efficiency
1718
cpuRequestsMissing: warning

pkg/kube/resource.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ func resolveControllerFromPod(ctx context.Context, podResource kubeAPICoreV1.Pod
181181
err = cacheAllObjectsOfKind(ctx, firstOwner.APIVersion, firstOwner.Kind, dynamicClient, restMapper, objectCache)
182182
}
183183
if err != nil {
184-
logrus.Warnf("Error caching objects of Kind %s %v", firstOwner.Kind, err)
184+
logrus.Warnf("error caching objects of Kind %s %v", firstOwner.Kind, err)
185185
break
186186
}
187187
abstractObject, ok = objectCache[key]
@@ -193,7 +193,7 @@ func resolveControllerFromPod(ctx context.Context, podResource kubeAPICoreV1.Pod
193193

194194
objMeta, err := meta.Accessor(&abstractObject)
195195
if err != nil {
196-
logrus.Warnf("Error retrieving parent metadata %s of API %s and Kind %s because of error: %v ", firstOwner.Name, firstOwner.APIVersion, firstOwner.Kind, err)
196+
logrus.Warnf("error retrieving parent metadata %s of API %s and Kind %s because of error: %v ", firstOwner.Name, firstOwner.APIVersion, firstOwner.Kind, err)
197197
return GenericResource{}, err
198198
}
199199
podSpec := GetPodSpec(abstractObject.Object)
@@ -221,7 +221,7 @@ func cacheSingleObject(ctx context.Context, apiVersion, kind, namespace, name st
221221
logrus.Debugf("Caching a single %s", kind)
222222
object, err := getObject(ctx, namespace, kind, apiVersion, name, dynamicClient, restMapper)
223223
if err != nil {
224-
logrus.Warnf("Error retrieving object %s/%s/%s/%s because of error: %v", kind, apiVersion, namespace, name, err)
224+
logrus.Warnf("error retrieving object %s/%s/%s/%s because of error: %v", kind, apiVersion, namespace, name, err)
225225
return err
226226
}
227227
key := fmt.Sprintf("%s/%s/%s", object.GetKind(), object.GetNamespace(), object.GetName())
@@ -235,13 +235,13 @@ func cacheAllObjectsOfKind(ctx context.Context, apiVersion, kind string, dynamic
235235
fqKind := schema.FromAPIVersionAndKind(apiVersion, kind)
236236
mapping, err := restMapper.RESTMapping(fqKind.GroupKind(), fqKind.Version)
237237
if err != nil {
238-
logrus.Warnf("Error retrieving mapping of API %s and Kind %s because of error: %v", apiVersion, kind, err)
238+
logrus.Warnf("error retrieving mapping of API %s and Kind %s because of error: %v", apiVersion, kind, err)
239239
return err
240240
}
241241

242242
objects, err := dynamicClient.Resource(mapping.Resource).Namespace("").List(ctx, kubeAPIMetaV1.ListOptions{})
243243
if err != nil {
244-
logrus.Warnf("Error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
244+
logrus.Warnf("error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
245245
return err
246246
}
247247
for idx, object := range objects.Items {

pkg/kube/resources.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ func CreateResourceProviderFromPath(directory string) (*ResourceProvider, error)
206206
}
207207
err = resources.addResourcesFromYaml(string(contents))
208208
if err != nil {
209-
logrus.Warnf("Skipping %s: cannot add resource from YAML: %v", path, err)
209+
logrus.Warnf("skipping %s: cannot add resource from YAML: %v", path, err)
210210
}
211211
return nil
212212
}
@@ -340,7 +340,7 @@ func CreateResourceProviderFromAPI(ctx context.Context, kube kubernetes.Interfac
340340
groupKind := parseGroupKind(maybeTransformKindIntoGroupKind(string(kind)))
341341
mapping, err := restMapper.RESTMapping(groupKind)
342342
if err != nil {
343-
logrus.Warnf("Error retrieving mapping of Kind %s because of error: %v", kind, err)
343+
logrus.Warnf("error retrieving mapping of Kind %s because of error: %v", kind, err)
344344
return nil, err
345345
}
346346
if c.Namespace != "" && mapping.Scope.Name() != meta.RESTScopeNameNamespace {
@@ -351,7 +351,7 @@ func CreateResourceProviderFromAPI(ctx context.Context, kube kubernetes.Interfac
351351
logrus.Info("Loading " + kind)
352352
objects, err := dynamic.Resource(mapping.Resource).Namespace(c.Namespace).List(ctx, metav1.ListOptions{})
353353
if err != nil {
354-
logrus.Warnf("Error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
354+
logrus.Warnf("error retrieving parent object API %s and Kind %s because of error: %v", mapping.Resource.Version, mapping.Resource.Resource, err)
355355
return nil, err
356356
}
357357
for _, obj := range objects.Items {

pkg/validator/custom.go

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package validator
2+
3+
import (
4+
"sync"
5+
6+
"github.com/qri-io/jsonschema"
7+
)
8+
9+
type validatorFunction func(test schemaTestCase) (bool, []jsonschema.ValError, error)
10+
11+
var validatorMapper = map[string]validatorFunction{}
12+
var lock = &sync.Mutex{}
13+
14+
func registerCustomChecks(name string, check validatorFunction) {
15+
lock.Lock()
16+
defer lock.Unlock()
17+
18+
validatorMapper[name] = check
19+
}

pkg/validator/pdb_hpa_validator.go

+150
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
package validator
2+
3+
import (
4+
"fmt"
5+
"strconv"
6+
"strings"
7+
8+
"github.com/fairwindsops/polaris/pkg/kube"
9+
"github.com/qri-io/jsonschema"
10+
"github.com/sirupsen/logrus"
11+
appsv1 "k8s.io/api/apps/v1"
12+
autoscalingv1 "k8s.io/api/autoscaling/v1"
13+
policyv1 "k8s.io/api/policy/v1"
14+
"k8s.io/apimachinery/pkg/runtime"
15+
"k8s.io/apimachinery/pkg/util/intstr"
16+
)
17+
18+
func init() {
19+
registerCustomChecks("pdbMinAvailableGreaterThanHPAMinReplicas", pdbMinAvailableGreaterThanHPAMinReplicas)
20+
}
21+
22+
func pdbMinAvailableGreaterThanHPAMinReplicas(test schemaTestCase) (bool, []jsonschema.ValError, error) {
23+
if test.ResourceProvider == nil {
24+
logrus.Debug("ResourceProvider is nil")
25+
return true, nil, nil
26+
}
27+
28+
deployment := &appsv1.Deployment{}
29+
err := runtime.DefaultUnstructuredConverter.FromUnstructured(test.Resource.Resource.Object, deployment)
30+
if err != nil {
31+
logrus.Warnf("error converting unstructured to Deployment: %v", err)
32+
return true, nil, nil
33+
}
34+
35+
attachedPDB, err := hasPDBAttached(*deployment, test.ResourceProvider.Resources["policy/PodDisruptionBudget"])
36+
if err != nil {
37+
logrus.Warnf("error getting PodDisruptionBudget: %v", err)
38+
return true, nil, nil
39+
}
40+
41+
attachedHPA, err := hasHPAAttached(*deployment, test.ResourceProvider.Resources["autoscaling/HorizontalPodAutoscaler"])
42+
if err != nil {
43+
logrus.Warnf("error getting HorizontalPodAutoscaler: %v", err)
44+
return true, nil, nil
45+
}
46+
47+
if attachedPDB != nil && attachedHPA != nil {
48+
logrus.Debugf("both PDB and HPA are attached to deployment %s", deployment.Name)
49+
50+
pdbMinAvailable, isPercent, err := getIntOrPercentValueSafely(attachedPDB.Spec.MinAvailable)
51+
if err != nil {
52+
logrus.Warnf("error getting getIntOrPercentValueSafely: %v", err)
53+
return true, nil, nil
54+
}
55+
56+
if isPercent {
57+
// if the value is a percentage, we need to calculate the actual value
58+
if attachedHPA.Spec.MinReplicas == nil {
59+
logrus.Debug("attachedHPA.Spec.MinReplicas is nil")
60+
return true, nil, nil
61+
}
62+
63+
pdbMinAvailable, err = intstr.GetScaledValueFromIntOrPercent(attachedPDB.Spec.MinAvailable, int(*attachedHPA.Spec.MinReplicas), true)
64+
if err != nil {
65+
logrus.Warnf("error getting minAvailable value from PodDisruptionBudget: %v", err)
66+
return true, nil, nil
67+
}
68+
}
69+
70+
if attachedHPA.Spec.MinReplicas != nil && pdbMinAvailable >= int(*attachedHPA.Spec.MinReplicas) {
71+
return false, []jsonschema.ValError{
72+
{
73+
PropertyPath: "spec.minAvailable",
74+
InvalidValue: pdbMinAvailable,
75+
Message: fmt.Sprintf("The minAvailable value in the PodDisruptionBudget(%s) is %d, which is greater or equal than the minReplicas value in the HorizontalPodAutoscaler(%s) (%d)", attachedPDB.Name, pdbMinAvailable, attachedHPA.Name, *attachedHPA.Spec.MinReplicas),
76+
},
77+
}, nil
78+
}
79+
}
80+
81+
return true, nil, nil
82+
}
83+
84+
func hasPDBAttached(deployment appsv1.Deployment, pdbs []kube.GenericResource) (*policyv1.PodDisruptionBudget, error) {
85+
for _, generic := range pdbs {
86+
pdb := &policyv1.PodDisruptionBudget{}
87+
err := runtime.DefaultUnstructuredConverter.FromUnstructured(generic.Resource.Object, pdb)
88+
if err != nil {
89+
return nil, fmt.Errorf("error converting unstructured to PodDisruptionBudget: %v", err)
90+
}
91+
92+
if pdb.Spec.Selector == nil {
93+
logrus.Debug("pdb.Spec.Selector is nil")
94+
continue
95+
}
96+
97+
if matchesPDBForDeployment(deployment.Spec.Template.Labels, pdb.Spec.Selector.MatchLabels) {
98+
return pdb, nil
99+
}
100+
}
101+
return nil, nil
102+
}
103+
104+
// matchesPDBForDeployment checks if the labels of the deployment match the labels of the PDB
105+
func matchesPDBForDeployment(deploymentLabels, pdbLabels map[string]string) bool {
106+
for key, value := range pdbLabels {
107+
if deploymentLabels[key] == value {
108+
return true
109+
}
110+
}
111+
return false
112+
}
113+
114+
func hasHPAAttached(deployment appsv1.Deployment, hpas []kube.GenericResource) (*autoscalingv1.HorizontalPodAutoscaler, error) {
115+
for _, generic := range hpas {
116+
hpa := &autoscalingv1.HorizontalPodAutoscaler{}
117+
err := runtime.DefaultUnstructuredConverter.FromUnstructured(generic.Resource.Object, hpa)
118+
if err != nil {
119+
return nil, fmt.Errorf("error converting unstructured to HorizontalPodAutoscaler: %v", err)
120+
}
121+
122+
if hpa.Spec.ScaleTargetRef.Kind == "Deployment" && hpa.Spec.ScaleTargetRef.Name == deployment.Name {
123+
return hpa, nil
124+
}
125+
}
126+
return nil, nil
127+
}
128+
129+
// getIntOrPercentValueSafely is a safer version of getIntOrPercentValue based on private function intstr.getIntOrPercentValueSafely
130+
func getIntOrPercentValueSafely(intOrStr *intstr.IntOrString) (int, bool, error) {
131+
switch intOrStr.Type {
132+
case intstr.Int:
133+
return intOrStr.IntValue(), false, nil
134+
case intstr.String:
135+
isPercent := false
136+
s := intOrStr.StrVal
137+
if strings.HasSuffix(s, "%") {
138+
isPercent = true
139+
s = strings.TrimSuffix(intOrStr.StrVal, "%")
140+
} else {
141+
return 0, false, fmt.Errorf("invalid type: string is not a percentage")
142+
}
143+
v, err := strconv.Atoi(s)
144+
if err != nil {
145+
return 0, false, fmt.Errorf("invalid value %q: %v", intOrStr.StrVal, err)
146+
}
147+
return int(v), isPercent, nil
148+
}
149+
return 0, false, fmt.Errorf("invalid type: neither int nor percentage")
150+
}

pkg/validator/schema.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,8 @@ func applySchemaCheck(conf *config.Configuration, checkID string, test schemaTes
369369
passes, issues, err = check.CheckContainer(test.Container)
370370
} else if check.Validator.SchemaURI != "" {
371371
passes, issues, err = check.CheckObject(test.Resource.Resource.Object)
372+
} else if validatorMapper[checkID] != nil {
373+
passes, issues, err = validatorMapper[checkID](test)
372374
} else {
373375
passes, issues, err = true, []jsonschema.ValError{}, nil
374376
}
@@ -380,7 +382,7 @@ func applySchemaCheck(conf *config.Configuration, checkID string, test schemaTes
380382
break
381383
}
382384
if test.ResourceProvider == nil {
383-
logrus.Warnf("No ResourceProvider available, check %s will not work in this context (e.g. admission control)", checkID)
385+
logrus.Warnf("no ResourceProvider available, check %s will not work in this context (e.g. admission control)", checkID)
384386
break
385387
}
386388
resources := test.ResourceProvider.Resources[groupkind]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: zookeeper
5+
spec:
6+
replicas: 10
7+
template:
8+
metadata:
9+
labels:
10+
app.kubernetes.io/name: zookeeper
11+
foo: bar
12+
spec:
13+
containers:
14+
- name: zookeeper
15+
image: zookeeper
16+
---
17+
apiVersion: policy/v1
18+
kind: PodDisruptionBudget
19+
metadata:
20+
name: zookeeper-pdb
21+
spec:
22+
minAvailable: 150% # 1.5 * 10 = 15
23+
selector:
24+
matchLabels:
25+
app.kubernetes.io/name: zookeeper
26+
---
27+
apiVersion: autoscaling/v2
28+
kind: HorizontalPodAutoscaler
29+
metadata:
30+
name: zookeeper-hpa
31+
spec:
32+
scaleTargetRef:
33+
apiVersion: apps/v1
34+
kind: Deployment
35+
name: zookeeper
36+
minReplicas: 10
37+
maxReplicas: 15
38+
metrics:
39+
- type: Resource
40+
resource:
41+
name: cpu
42+
target:
43+
type: Utilization
44+
averageUtilization: 50

0 commit comments

Comments
 (0)