
Commit 2fc6c57

fix: Address comments
Signed-off-by: Ce Gao <[email protected]>
1 parent 877110d

6 files changed (+29, -34 lines)

examples/pytorch/elastic/echo/Dockerfile (+1, -1)

@@ -1,6 +1,6 @@
 FROM python:3.8-buster
 WORKDIR /workspace
-RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple torch==1.10.0 numpy
+RUN pip install torch==1.10.0 numpy
 # TODO Replace this with the PIP version when available
 ADD echo.py echo.py
 ENV PYTHONPATH /workspace

examples/pytorch/elastic/imagenet/Dockerfile (+1, -1)

@@ -2,7 +2,7 @@ ARG BASE_IMAGE=pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
 FROM $BASE_IMAGE

 # install utilities and dependencies
-RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple classy-vision
+RUN pip install classy-vision

 WORKDIR /workspace


hack/update-codegen.sh (-1)

@@ -113,7 +113,6 @@ cd - >/dev/null
 # Notice: The code in kube-openapi does not generate defaulter by default.
 # We need to build binary from pkg cmd folder.
 echo "Building openapi-gen"
-echo ${OPENAPI_PKG}
 go build -o openapi-gen ${OPENAPI_PKG}/cmd/openapi-gen

 echo "Generating OpenAPI specification for tensorflow/v1"

pkg/apis/pytorch/v1/types.go (+7, -7)

@@ -88,14 +88,14 @@ type ElasticPolicy struct {

 	MaxRestarts *int32 `json:"maxRestarts,omitempty"`

-	// metrics contains the specifications for which to use to calculate the
+	// Metrics contains the specifications which are used to calculate the
 	// desired replica count (the maximum replica count across all metrics will
-	// be used). The desired replica count is calculated multiplying the
+	// be used). The desired replica count is calculated with multiplying the
 	// ratio between the target value and the current value by the current
-	// number of pods. Ergo, metrics used must decrease as the pod count is
+	// number of pods. Ergo, metrics used must decrease as the pod count is
 	// increased, and vice-versa. See the individual metric source types for
 	// more information about how each type of metric must respond.
-	// If not set, the default metric will be set to 80% average CPU utilization.
+	// If not set, the HPA will not be created.
 	// +optional
 	Metrics []autoscalingv2beta2.MetricSpec `json:"metrics,omitempty"`
 }
@@ -111,14 +111,14 @@ const (
 	// BackendC10D is the rendezvous backend type for C10d.
 	BackendC10D RDZVBackend = "c10d"
 	// BackendETCD is the rendezvous backend type for ETCD.
-	BackendETCD = "etcd"
+	BackendETCD RDZVBackend = "etcd"
 	// BackendETCDV2 is the rendezvous backend type for ETCD v2.
-	BackendETCDV2 = "etcd-v2"
+	BackendETCDV2 RDZVBackend = "etcd-v2"

 	// PyTorchReplicaTypeMaster is the type of Master of distributed PyTorch
 	PyTorchReplicaTypeMaster common.ReplicaType = "Master"
 	// PyTorchReplicaTypeWorker is the type for workers of distributed PyTorch.
-	PyTorchReplicaTypeWorker = "Worker"
+	PyTorchReplicaTypeWorker common.ReplicaType = "Worker"
 )

 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
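The doc-comment change in this hunk is behavioral, not just wording: since the controller no longer applies a default 80% CPU metric, a job that still wants an HPA must set Metrics explicitly. A minimal sketch under that assumption, using the k8s.io/api/autoscaling/v2beta2 types this file already imports (defaultCPUMetrics and int32Ptr are illustrative helpers, not part of the repository):

// Illustrative only: reproduce the removed default (80% average CPU
// utilization) as an explicit MetricSpec for ElasticPolicy.Metrics.
package example

import (
	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
	corev1 "k8s.io/api/core/v1"
)

func int32Ptr(i int32) *int32 { return &i }

func defaultCPUMetrics() []autoscalingv2beta2.MetricSpec {
	return []autoscalingv2beta2.MetricSpec{{
		Type: autoscalingv2beta2.ResourceMetricSourceType,
		Resource: &autoscalingv2beta2.ResourceMetricSource{
			Name: corev1.ResourceCPU,
			Target: autoscalingv2beta2.MetricTarget{
				Type:               autoscalingv2beta2.UtilizationMetricType,
				AverageUtilization: int32Ptr(80),
			},
		},
	}}
}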

pkg/controller.v1/pytorch/hpa.go (+19, -19)

@@ -31,40 +31,40 @@ import (
 func (r *PyTorchJobReconciler) ReconcileHPA(pytorhcjob *pytorchv1.PyTorchJob) error {
 	logger := r.Log.WithValues(pytorchv1.Singular, pytorhcjob.Name)

-	if pytorhcjob.Spec.ElasticPolicy == nil {
-		logger.V(1).Info("No ElasicPolicy is specified, skipping HPA reconciling process")
+	if pytorhcjob.Spec.ElasticPolicy == nil || pytorhcjob.Spec.ElasticPolicy.Metrics == nil {
+		logger.V(1).Info(
+			"No ElasicPolicy or Metric is specified, skipping HPA reconciling process")
 		return nil
 	}

-	// Create or update HPA
-	hpa := &autoscalingv2beta2.HorizontalPodAutoscaler{}
-	err := r.Get(context.TODO(), types.NamespacedName{Name: pytorhcjob.Name, Namespace: pytorhcjob.Namespace}, hpa)
+	current := &autoscalingv2beta2.HorizontalPodAutoscaler{}
+
+	// Get the exepected HPA.
+	expected, err := desiredHPA(pytorhcjob, r.Scheme)
 	if err != nil {
+		return err
+	}
+
+	if err := r.Get(context.TODO(), types.NamespacedName{
+		Name:      pytorhcjob.Name,
+		Namespace: pytorhcjob.Namespace,
+	}, current); err != nil {
 		if !errors.IsNotFound(err) {
 			return err
 		}

 		// Create the new HPA.
-		hpa, err = desiredHPA(pytorhcjob, r.Scheme)
-		if err != nil {
-			return err
-		}
-		logger.V(1).Info("Creating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
-		err = r.Create(context.TODO(), hpa)
+		logger.V(1).Info("Creating HPA", "namespace", expected.Namespace, "name", expected.Name)
+		err = r.Create(context.TODO(), expected)
 		if err != nil {
 			return err
 		}
 		return nil
 	}

-	// Update HPA
-	expected, err := desiredHPA(pytorhcjob, r.Scheme)
-	if err != nil {
-		return err
-	}
-	if !equality.Semantic.DeepEqual(expected.Spec, hpa.Spec) {
-		logger.V(1).Info("Updating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
-		expected.ResourceVersion = hpa.ResourceVersion
+	if !equality.Semantic.DeepEqual(expected.Spec, current.Spec) {
+		logger.V(1).Info("Updating HPA", "namespace", current.Namespace, "name", current.Name)
+		expected.ResourceVersion = current.ResourceVersion
 		err = r.Update(context.TODO(), expected)
 		if err != nil {
 			return err
pkg/controller.v1/pytorch/pytorch.go (+1, -5)

@@ -53,11 +53,7 @@ func SetPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
 		if err != nil {
 			return err
 		}
-		if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeMaster)) {
-			if rank != 0 {
-				return fmt.Errorf("invalid config: There should be only a single master with index=0")
-			}
-		} else {
+		if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeWorker)) {
 			rank = rank + 1
 		}

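The surviving branch leaves the rank layout unchanged in effect: the master keeps rank 0 and worker i becomes rank i+1; only the master-index validation is dropped from this function. A tiny illustrative helper (not in the repository) capturing that layout:

// Illustrative only: rank layout implied by the surviving branch above.
func globalRank(replicaType string, index int) int {
	if replicaType == "worker" { // strings.ToLower of PyTorchReplicaTypeWorker
		return index + 1 // workers are shifted past the single master
	}
	return index // the master keeps rank 0
}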