Skip to content

Commit c072d51

Browse files
authored
RHOAIENG-13625: Add DBAvailable status to CR (#304)
* Add DBAvailable status to CR * Remove probes
1 parent f366b6e commit c072d51

File tree

10 files changed

+249
-29
lines changed

10 files changed

+249
-29
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai
151151
| `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. |
152152
| `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. |
153153

154+
#### Database Status
155+
156+
| Status Type | Status Reason | Description |
157+
|---------------|-------------------------|---------------------------------------------------|
158+
| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found |
159+
| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) |
160+
| `DBAvailable` | `DBConnectionError` | Service error connecting to the database |
161+
| `DBAvailable` | `DBAvailable` | Successfully connected to the database |
162+
154163

155164
#### Status Behavior
156165

157166
- If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`.
167+
- In database mode, any `DBAvailable` status reason other than `DBAvailable` will set the `TrustyAIService` phase to `Not Ready`.
158168
- However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, if it is `Ready` by all other conditions, it will remain so.
159169

160170
## Contributing

config/base/params.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
trustyaiServiceImage=quay.io/trustyai/trustyai-service:latest
22
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:latest
33
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
4-
kServeServerless=disabled
4+
kServeServerless=disabled

config/overlays/odh/params.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
trustyaiServiceImage=quay.io/trustyai/trustyai-service:v0.19.0
22
trustyaiOperatorImage=quay.io/trustyai/trustyai-service-operator:v1.25.0
33
oauthProxyImage=quay.io/openshift/origin-oauth-proxy:4.14.0
4-
kServeServerless=enabled
4+
kServeServerless=enabled

controllers/constants.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ const (
4646
StatusTypePVCAvailable = "PVCAvailable"
4747
StatusTypeRouteAvailable = "RouteAvailable"
4848
StatusTypeAvailable = "Available"
49+
StatusTypeDBAvailable = "DBAvailable"
4950
)
5051

5152
// Status reasons
@@ -58,6 +59,10 @@ const (
5859
StatusReasonRouteFound = "RouteFound"
5960
StatusAvailable = "AllComponentsReady"
6061
StatusNotAvailable = "NotAllComponentsReady"
62+
StatusDBCredentialsNotFound = "DBCredentialsNotFound"
63+
StatusDBCredentialsError = "DBCredentialsError"
64+
StatusDBConnectionError = "DBConnectionError"
65+
StatusDBAvailable = "DBAvailable"
6166
)
6267

6368
// Event reasons
@@ -67,4 +72,14 @@ const (
6772
EventReasonServiceMonitorCreated = "ServiceMonitorCreated"
6873
)
6974

75+
const (
76+
StateReasonCrashLoopBackOff = "CrashLoopBackOff"
77+
)
78+
79+
// Phases
80+
const (
81+
PhaseReady = "Ready"
82+
PhaseNotReady = "Not Ready"
83+
)
84+
7085
const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration"

controllers/database.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package controllers
2+
3+
import (
4+
"context"
5+
"strings"
6+
7+
trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
8+
appsv1 "k8s.io/api/apps/v1"
9+
corev1 "k8s.io/api/core/v1"
10+
"k8s.io/apimachinery/pkg/api/errors"
11+
"k8s.io/apimachinery/pkg/types"
12+
"sigs.k8s.io/controller-runtime/pkg/client"
13+
)
14+
15+
// checkDatabaseAccessible checks if the TrustyAI service pod failed with database issues.
16+
func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
17+
deployment := &appsv1.Deployment{}
18+
err := r.Get(ctx, types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}, deployment)
19+
if err != nil {
20+
if errors.IsNotFound(err) {
21+
return false, nil
22+
}
23+
return false, err
24+
}
25+
26+
for _, cond := range deployment.Status.Conditions {
27+
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
28+
podList := &corev1.PodList{}
29+
listOpts := []client.ListOption{
30+
client.InNamespace(instance.Namespace),
31+
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
32+
}
33+
if err := r.List(ctx, podList, listOpts...); err != nil {
34+
return false, err
35+
}
36+
37+
for _, pod := range podList.Items {
38+
for _, cs := range pod.Status.ContainerStatuses {
39+
if cs.Name == "trustyai-service" {
40+
if cs.State.Running != nil {
41+
return true, nil
42+
}
43+
44+
if cs.LastTerminationState.Terminated != nil {
45+
termination := cs.LastTerminationState.Terminated
46+
if termination.Reason == "Error" && termination.Message != "" {
47+
if strings.Contains(termination.Message, "Socket fail to connect to host:address") {
48+
return false, nil
49+
}
50+
}
51+
}
52+
53+
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
54+
return false, nil
55+
}
56+
}
57+
}
58+
}
59+
}
60+
}
61+
62+
return false, nil
63+
}

controllers/deployment.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@ package controllers
22

33
import (
44
"context"
5-
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"
65
"reflect"
76
"strconv"
87

8+
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/templates"
9+
910
trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/v1alpha1"
1011
appsv1 "k8s.io/api/apps/v1"
1112
corev1 "k8s.io/api/core/v1"
1213
"k8s.io/apimachinery/pkg/api/errors"
1314
"k8s.io/apimachinery/pkg/types"
1415
ctrl "sigs.k8s.io/controller-runtime"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
1517
"sigs.k8s.io/controller-runtime/pkg/log"
1618
)
1719

@@ -75,7 +77,7 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
7577
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
7678
if err != nil {
7779
deploymentConfig.UseDBTLSCerts = false
78-
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
80+
log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-tls not found")
7981
} else {
8082
deploymentConfig.UseDBTLSCerts = true
8183
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
@@ -201,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan
201203
return nil
202204
}
203205

206+
// checkDeploymentReady verifies that a TrustyAI service deployment is ready
204207
func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
205208
deployment := &appsv1.Deployment{}
206209

@@ -215,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in
215218
for _, cond := range deployment.Status.Conditions {
216219
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
217220
if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas {
221+
podList := &corev1.PodList{}
222+
listOpts := []client.ListOption{
223+
client.InNamespace(instance.Namespace),
224+
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
225+
}
226+
if err := r.List(ctx, podList, listOpts...); err != nil {
227+
return false, err
228+
}
229+
230+
for _, pod := range podList.Items {
231+
for _, cs := range pod.Status.ContainerStatuses {
232+
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
233+
return false, nil
234+
}
235+
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
236+
return false, nil
237+
}
238+
}
239+
}
240+
218241
return true, nil
219242
}
220243
}

controllers/statuses.go

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import (
1313

1414
// IsAllReady checks if all the necessary readiness fields are true for the specific mode
1515
func (rs *AvailabilityStatus) IsAllReady(mode string) bool {
16-
return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) || (rs.DeploymentReady && rs.RouteReady && mode == STORAGE_DATABASE)
16+
return (rs.PVCReady && rs.DeploymentReady && rs.RouteReady && mode == STORAGE_PVC) ||
17+
(rs.DeploymentReady && rs.RouteReady && rs.DBReady && mode == STORAGE_DATABASE)
1718
}
1819

1920
// AvailabilityStatus has the readiness status of various resources.
@@ -22,6 +23,7 @@ type AvailabilityStatus struct {
2223
DeploymentReady bool
2324
RouteReady bool
2425
InferenceServiceReady bool
26+
DBReady bool
2527
}
2628

2729
func (r *TrustyAIServiceReconciler) updateStatus(ctx context.Context, original *trustyaiopendatahubiov1alpha1.TrustyAIService, update func(saved *trustyaiopendatahubiov1alpha1.TrustyAIService),
@@ -53,25 +55,17 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
5355
if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() {
5456
// Check for PVC readiness
5557
status.PVCReady, err = r.checkPVCReady(ctx, instance)
56-
if err != nil || !status.PVCReady {
57-
// PVC not ready, requeue
58-
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "PVC not ready")
59-
}
6058
}
6159

6260
// Check for deployment readiness
6361
status.DeploymentReady, err = r.checkDeploymentReady(ctx, instance)
64-
if err != nil || !status.DeploymentReady {
65-
// Deployment not ready, requeue
66-
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Deployment not ready")
62+
63+
if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
64+
status.DBReady, _ = r.checkDatabaseAccessible(ctx, instance)
6765
}
6866

6967
// Check for route readiness
7068
status.RouteReady, err = r.checkRouteReady(ctx, instance)
71-
if err != nil || !status.RouteReady {
72-
// Route not ready, requeue
73-
return RequeueWithDelayMessage(ctx, defaultRequeueDelay, "Route not ready")
74-
}
7569

7670
// Check if InferenceServices present
7771
status.InferenceServiceReady, err = r.checkInferenceServicesPresent(ctx, instance.Namespace)
@@ -89,9 +83,15 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
8983
if instance.Spec.Storage.IsStoragePVC() || instance.IsMigration() {
9084
UpdatePVCAvailable(saved)
9185
}
86+
9287
UpdateRouteAvailable(saved)
88+
89+
if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
90+
UpdateDBAvailable(saved)
91+
}
92+
9393
UpdateTrustyAIServiceAvailable(saved)
94-
saved.Status.Phase = "Ready"
94+
saved.Status.Phase = PhaseReady
9595
saved.Status.Ready = v1.ConditionTrue
9696
})
9797
if updateErr != nil {
@@ -114,13 +114,18 @@ func (r *TrustyAIServiceReconciler) reconcileStatuses(ctx context.Context, insta
114114
}
115115
}
116116

117+
if instance.Spec.Storage.IsStorageDatabase() || instance.IsMigration() {
118+
UpdateDBConnectionError(saved)
119+
}
120+
117121
if status.RouteReady {
118122
UpdateRouteAvailable(saved)
119123
} else {
120124
UpdateRouteNotAvailable(saved)
121125
}
126+
122127
UpdateTrustyAIServiceNotAvailable(saved)
123-
saved.Status.Phase = "Ready"
128+
saved.Status.Phase = PhaseNotReady
124129
saved.Status.Ready = v1.ConditionFalse
125130
})
126131
if updateErr != nil {
@@ -143,7 +148,7 @@ func UpdateInferenceServicePresent(saved *trustyaiopendatahubiov1alpha1.TrustyAI
143148

144149
func UpdatePVCNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
145150
saved.SetStatus(StatusTypePVCAvailable, StatusReasonPVCNotFound, "PersistentVolumeClaim not found", v1.ConditionFalse)
146-
saved.Status.Phase = "Not Ready"
151+
saved.Status.Phase = PhaseNotReady
147152
saved.Status.Ready = v1.ConditionFalse
148153
}
149154

@@ -165,4 +170,28 @@ func UpdateTrustyAIServiceAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyA
165170

166171
func UpdateTrustyAIServiceNotAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
167172
saved.SetStatus(StatusTypeAvailable, StatusNotAvailable, "Not all components available", v1.ConditionFalse)
173+
saved.Status.Phase = PhaseNotReady
174+
saved.Status.Ready = v1.ConditionFalse
175+
}
176+
177+
func UpdateDBCredentialsNotFound(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
178+
saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsNotFound, "Database credentials not found", v1.ConditionFalse)
179+
saved.Status.Phase = PhaseNotReady
180+
saved.Status.Ready = v1.ConditionFalse
181+
}
182+
183+
func UpdateDBCredentialsError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
184+
saved.SetStatus(StatusTypeDBAvailable, StatusDBCredentialsError, "Error with database credentials", v1.ConditionFalse)
185+
saved.Status.Phase = PhaseNotReady
186+
saved.Status.Ready = v1.ConditionFalse
187+
}
188+
189+
func UpdateDBConnectionError(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
190+
saved.SetStatus(StatusTypeDBAvailable, StatusDBConnectionError, "Error connecting to database", v1.ConditionFalse)
191+
saved.Status.Phase = PhaseNotReady
192+
saved.Status.Ready = v1.ConditionFalse
193+
}
194+
195+
// UpdateDBAvailable records a true DBAvailable condition with the DBAvailable
// reason. It does not touch the service's phase or ready fields.
func UpdateDBAvailable(saved *trustyaiopendatahubiov1alpha1.TrustyAIService) {
	const message = "Database available"

	saved.SetStatus(StatusTypeDBAvailable, StatusDBAvailable, message, v1.ConditionTrue)
}

controllers/statuses_test.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func setupAndTestStatusNoComponent(instance *trustyaiopendatahubiov1alpha1.Trust
3535
// Call the reconcileStatuses function
3636
_, _ = reconciler.reconcileStatuses(ctx, instance)
3737

38-
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
38+
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
3939
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
4040
if readyCondition != nil {
4141
Expect(statusMatch).To(Equal(corev1.ConditionFalse), "Ready condition should be true")
@@ -127,7 +127,7 @@ var _ = Describe("Status and condition tests", func() {
127127
}, instance)
128128
}, "failed to get updated instance")
129129

130-
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
130+
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
131131
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
132132
if readyCondition != nil {
133133
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
@@ -191,7 +191,7 @@ var _ = Describe("Status and condition tests", func() {
191191
}, instance)
192192
}, "failed to get updated instance")
193193

194-
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
194+
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
195195
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
196196
if readyCondition != nil {
197197
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
@@ -260,8 +260,7 @@ var _ = Describe("Status and condition tests", func() {
260260
Namespace: instance.Namespace,
261261
}, instance)
262262
}, "failed to get updated instance")
263-
264-
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
263+
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
265264
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
266265
if readyCondition != nil {
267266
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")
@@ -344,7 +343,7 @@ var _ = Describe("Status and condition tests", func() {
344343
}, instance)
345344
}, "failed to get updated instance")
346345

347-
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, "Ready", corev1.ConditionTrue, true)
346+
readyCondition, statusMatch, err := checkCondition(instance.Status.Conditions, PhaseReady, corev1.ConditionTrue, true)
348347
Expect(err).NotTo(HaveOccurred(), "Error checking Ready condition")
349348
if readyCondition != nil {
350349
Expect(statusMatch).To(Equal(corev1.ConditionTrue), "Ready condition should be true")

0 commit comments

Comments
 (0)