Skip to content

sync: sync up dev/lm-eval branch with main branch #336

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/build-and-push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ jobs:
if: env.BUILD_CONTEXT == 'ci'
run: |
sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/base/params.env
sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/odh/params.env
sed -i "s#quay.io/trustyai/trustyai-service-operator:latest#${{ env.IMAGE_NAME }}:$TAG#" ./config/overlays/rhoai/params.env
rm -Rf $(ls . | grep -v config)
rm -Rf .gitignore .dockerignore .github .git .yamllint.yaml
# push to ci-manifest repo
Expand Down Expand Up @@ -146,4 +148,12 @@ jobs:
📦 [LMES job image](https://quay.io/trustyai/ta-lmes-job:${{ github.event.pull_request.head.sha }}): `quay.io/trustyai/ta-lmes-job:${{ github.event.pull_request.head.sha }}`

🗂️ [CI manifests](https://github.com/trustyai-explainability/trustyai-service-operator-ci/tree/operator-${{ env.TAG }})

```
devFlags:
manifests:
- contextDir: config
sourcePath: ''
uri: https://api.github.com/repos/trustyai-explainability/trustyai-service-operator-ci/tarball/operator-${{ env.TAG }}
```

10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,20 @@ through its `status` field. Below are the status types and reasons that are avai
| `PVCAvailable` | `PVCNotFound` | `PersistentVolumeClaim` not found. |
| `PVCAvailable` | `PVCFound` | `PersistentVolumeClaim` found. |

#### Database Status

| Status Type | Status Reason | Description |
|---------------|-------------------------|---------------------------------------------------|
| `DBAvailable` | `DBCredentialsNotFound` | Database credentials secret not found |
| `DBAvailable` | `DBCredentialsError` | Database credentials malformed (e.g. missing key) |
| `DBAvailable` | `DBConnectionError` | Service error connecting to the database |
| `DBAvailable` | `DBAvailable` | Successfully connected to the database |


#### Status Behavior

- If a PVC is not available, the `Ready` status of `TrustyAIService` will be set to `False`.
- In database mode, any `DBAvailable` status reason other than `DBAvailable` will set the `TrustyAIService` to `Not Ready`.
- However, if `InferenceServices` are not found, the `Ready` status of `TrustyAIService` will not be affected, _i.e._, if it is `Ready` by all other conditions, it will remain so.

## Contributing
Expand Down
2 changes: 2 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
kservev1beta1 "github.com/kserve/kserve/pkg/apis/serving/v1beta1"
routev1 "github.com/openshift/api/route/v1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
Expand Down Expand Up @@ -58,6 +59,7 @@ func init() {
utilruntime.Must(kservev1alpha1.AddToScheme(scheme))
utilruntime.Must(kservev1beta1.AddToScheme(scheme))
utilruntime.Must(routev1.AddToScheme(scheme))
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}

Expand Down
1 change: 1 addition & 0 deletions config/base/params.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ lmes-image-pull-policy=Always
lmes-max-batch-size=24
lmes-default-batch-size=8
lmes-detect-device=true

2 changes: 1 addition & 1 deletion config/overlays/rhoai/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ patchesStrategicMerge:
configMapGenerator:
- env: params.env
behavior: merge
name: config
name: config
2 changes: 1 addition & 1 deletion config/rbac/auth_proxy_service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ spec:
- name: https
port: 8443
protocol: TCP
targetPort: 8080
targetPort: 8081
selector:
control-plane: controller-manager
21 changes: 21 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ rules:
- list
- update
- watch
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
Expand Down Expand Up @@ -99,6 +107,19 @@ rules:
- create
- list
- watch
- apiGroups:
- networking.istio.io
resources:
- destinationrules
- virtualservices
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
Expand Down
15 changes: 15 additions & 0 deletions controllers/tas/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ const (
StatusTypePVCAvailable = "PVCAvailable"
StatusTypeRouteAvailable = "RouteAvailable"
StatusTypeAvailable = "Available"
StatusTypeDBAvailable = "DBAvailable"
)

// Status reasons
Expand All @@ -59,6 +60,10 @@ const (
StatusReasonRouteFound = "RouteFound"
StatusAvailable = "AllComponentsReady"
StatusNotAvailable = "NotAllComponentsReady"
StatusDBCredentialsNotFound = "DBCredentialsNotFound"
StatusDBCredentialsError = "DBCredentialsError"
StatusDBConnectionError = "DBConnectionError"
StatusDBAvailable = "DBAvailable"
)

// Event reasons
Expand All @@ -68,4 +73,14 @@ const (
EventReasonServiceMonitorCreated = "ServiceMonitorCreated"
)

// Container state reasons
//
// StateReasonCrashLoopBackOff is the waiting reason the controller compares
// container statuses against to detect a crash-looping container.
const StateReasonCrashLoopBackOff = "CrashLoopBackOff"

// Phases
const (
	// PhaseReady is the phase string for a ready service.
	PhaseReady = "Ready"
	// PhaseNotReady is the phase string for a service that is not ready.
	PhaseNotReady = "Not Ready"
)

// migrationAnnotationKey is an annotation key; judging by its name it relates
// to database migration — NOTE(review): confirm usage against the callers.
const migrationAnnotationKey = "trustyai.opendatahub.io/db-migration"
62 changes: 62 additions & 0 deletions controllers/tas/database.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package tas

import (
"context"
"strings"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// checkDatabaseAccessible reports whether the "trustyai-service" container of
// the instance's deployment is currently running, which this controller treats
// as the signal that the service could reach its database.
//
// It returns (false, nil) when the deployment does not exist, is not marked
// Available, or no matching pod yields a running "trustyai-service" container.
// It returns (false, err) when fetching the deployment or listing its pods
// fails for any other reason.
func (r *TrustyAIServiceReconciler) checkDatabaseAccessible(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
	key := types.NamespacedName{Name: instance.Name, Namespace: instance.Namespace}
	deployment := &appsv1.Deployment{}
	if err := r.Get(ctx, key, deployment); err != nil {
		if errors.IsNotFound(err) {
			// No deployment yet: not accessible, but not an error either.
			return false, nil
		}
		return false, err
	}

	for _, condition := range deployment.Status.Conditions {
		// Only an Available=True condition warrants inspecting the pods.
		if condition.Type != appsv1.DeploymentAvailable || condition.Status != corev1.ConditionTrue {
			continue
		}

		pods := &corev1.PodList{}
		selection := []client.ListOption{
			client.InNamespace(instance.Namespace),
			client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
		}
		if listErr := r.List(ctx, pods, selection...); listErr != nil {
			return false, listErr
		}

		for _, pod := range pods.Items {
			for _, status := range pod.Status.ContainerStatuses {
				if status.Name != "trustyai-service" {
					continue
				}

				// The first verdict reached for a service container wins.
				if status.State.Running != nil {
					return true, nil
				}

				// The previous run ended with the database connection failure message.
				lastExit := status.LastTerminationState.Terminated
				connectionFailed := lastExit != nil &&
					lastExit.Reason == "Error" &&
					lastExit.Message != "" &&
					strings.Contains(lastExit.Message, "Socket fail to connect to host:address")

				// The container is being restarted repeatedly.
				waiting := status.State.Waiting
				crashLooping := waiting != nil && waiting.Reason == StateReasonCrashLoopBackOff

				if connectionFailed || crashLooping {
					return false, nil
				}
			}
		}
	}

	return false, nil
}
31 changes: 26 additions & 5 deletions controllers/tas/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ import (
"reflect"
"strconv"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
"github.com/trustyai-explainability/trustyai-service-operator/controllers/constants"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)

Expand Down Expand Up @@ -74,13 +74,13 @@ func (r *TrustyAIServiceReconciler) createDeploymentObject(ctx context.Context,
}

if instance.Spec.Storage.IsStorageDatabase() {
_, err := r.getSecret(ctx, instance.Name+"-db-tls", instance.Namespace)
_, err := r.getSecret(ctx, instance.Name+"-db-ca", instance.Namespace)
if err != nil {
deploymentConfig.UseDBTLSCerts = false
log.FromContext(ctx).Error(err, "Using insecure database connection. Certificates "+instance.Name+"-db-tls not found")
log.FromContext(ctx).Info("Using insecure database connection. Certificates " + instance.Name + "-db-ca not found")
} else {
deploymentConfig.UseDBTLSCerts = true
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-tls")
log.FromContext(ctx).Info("Using secure database connection with certificates " + instance.Name + "-db-ca")
}
} else {
deploymentConfig.UseDBTLSCerts = false
Expand Down Expand Up @@ -203,6 +203,7 @@ func (r *TrustyAIServiceReconciler) ensureDeployment(ctx context.Context, instan
return nil
}

// checkDeploymentReady verifies that a TrustyAI service deployment is ready
func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) (bool, error) {
deployment := &appsv1.Deployment{}

Expand All @@ -217,6 +218,26 @@ func (r *TrustyAIServiceReconciler) checkDeploymentReady(ctx context.Context, in
for _, cond := range deployment.Status.Conditions {
if cond.Type == appsv1.DeploymentAvailable && cond.Status == corev1.ConditionTrue {
if deployment.Status.ReadyReplicas == *deployment.Spec.Replicas {
podList := &corev1.PodList{}
listOpts := []client.ListOption{
client.InNamespace(instance.Namespace),
client.MatchingLabels(deployment.Spec.Selector.MatchLabels),
}
if err := r.List(ctx, podList, listOpts...); err != nil {
return false, err
}

for _, pod := range podList.Items {
for _, cs := range pod.Status.ContainerStatuses {
if cs.State.Waiting != nil && cs.State.Waiting.Reason == StateReasonCrashLoopBackOff {
return false, nil
}
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
return false, nil
}
}
}

return true, nil
}
}
Expand Down
89 changes: 89 additions & 0 deletions controllers/tas/destination_rule.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package tas

import (
"context"
"fmt"
"reflect"

trustyaiopendatahubiov1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
templateParser "github.com/trustyai-explainability/trustyai-service-operator/controllers/tas/templates"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
)

// destinationRuleTemplatePath is the template path used to render the
// DestinationRule resource.
const destinationRuleTemplatePath = "service/destination-rule.tmpl.yaml"

// destinationRuleCDRName is the name of the Istio DestinationRule
// CustomResourceDefinition looked up to detect whether the CRD is installed.
const destinationRuleCDRName = "destinationrules.networking.istio.io"

// DestinationRuleConfig carries the values substituted into the
// DestinationRule template.
type DestinationRuleConfig struct {
	// Name and Namespace are filled from the owning TrustyAIService instance.
	Name, Namespace string
	// DestinationRuleName is the name given to the DestinationRule resource.
	DestinationRuleName string
}

// isDestinationRuleCRDPresent returns true if the DestinationRule CRD is present, false otherwise.
//
// A NotFound response is reported as (false, nil). Any other lookup failure is
// returned as (false, err) with the original error wrapped via %w, so callers
// can still inspect the underlying API error instead of only its text.
func (r *TrustyAIServiceReconciler) isDestinationRuleCRDPresent(ctx context.Context) (bool, error) {
	crd := &apiextensionsv1.CustomResourceDefinition{}

	// CRDs are cluster-scoped, so the lookup key carries no namespace.
	err := r.Get(ctx, types.NamespacedName{Name: destinationRuleCDRName}, crd)
	switch {
	case err == nil:
		// Found
		return true, nil
	case errors.IsNotFound(err):
		// Not found
		return false, nil
	default:
		return false, fmt.Errorf("error getting "+destinationRuleCDRName+" CRD: %w", err)
	}
}

// ensureDestinationRule makes sure a DestinationRule named "<instance name>-internal"
// exists in the instance's namespace.
//
// If the DestinationRule already exists it is left untouched and nil is
// returned. Otherwise it is rendered from the DestinationRule template, given
// the instance as its controller reference, and created.
//
// Lookup and creation failures are wrapped with %w so callers can still
// inspect the underlying API error; template parsing and owner-reference
// errors are returned as-is.
func (r *TrustyAIServiceReconciler) ensureDestinationRule(ctx context.Context, instance *trustyaiopendatahubiov1alpha1.TrustyAIService) error {
	destinationRuleName := instance.Name + "-internal"

	// The resource is handled as unstructured data identified only by its
	// kind and API version.
	existingDestinationRule := &unstructured.Unstructured{}
	existingDestinationRule.SetKind("DestinationRule")
	existingDestinationRule.SetAPIVersion("networking.istio.io/v1beta1")

	// Check if the DestinationRule already exists
	err := r.Get(ctx, types.NamespacedName{Name: destinationRuleName, Namespace: instance.Namespace}, existingDestinationRule)
	if err == nil {
		// DestinationRule exists
		return nil
	}
	if !errors.IsNotFound(err) {
		return fmt.Errorf("failed to check for existing DestinationRule: %w", err)
	}

	destinationRuleConfig := DestinationRuleConfig{
		Name:                instance.Name,
		Namespace:           instance.Namespace,
		DestinationRuleName: destinationRuleName,
	}

	destinationRule, err := templateParser.ParseResource[unstructured.Unstructured](destinationRuleTemplatePath, destinationRuleConfig, reflect.TypeOf(&unstructured.Unstructured{}))
	if err != nil {
		log.FromContext(ctx).Error(err, "could not parse the DestinationRule template")
		return err
	}

	// Record the instance as the controller owner of the new DestinationRule.
	if err := ctrl.SetControllerReference(instance, destinationRule, r.Scheme); err != nil {
		return err
	}

	if err := r.Create(ctx, destinationRule); err != nil {
		return fmt.Errorf("failed to create DestinationRule: %w", err)
	}

	return nil
}
Loading
Loading