
Commit 2fc6c57

fix: Address comments
Signed-off-by: Ce Gao <[email protected]>
1 parent 877110d

6 files changed (+29, -34 lines)

examples/pytorch/elastic/echo/Dockerfile (+1, -1)

@@ -1,6 +1,6 @@
 FROM python:3.8-buster
 WORKDIR /workspace
-RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple torch==1.10.0 numpy
+RUN pip install torch==1.10.0 numpy
 # TODO Replace this with the PIP version when available
 ADD echo.py echo.py
 ENV PYTHONPATH /workspace

examples/pytorch/elastic/imagenet/Dockerfile (+1, -1)

@@ -2,7 +2,7 @@ ARG BASE_IMAGE=pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
 FROM $BASE_IMAGE

 # install utilities and dependencies
-RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple classy-vision
+RUN pip install classy-vision

 WORKDIR /workspace


hack/update-codegen.sh (-1)

@@ -113,7 +113,6 @@ cd - >/dev/null
 # Notice: The code in kube-openapi does not generate defaulter by default.
 # We need to build binary from pkg cmd folder.
 echo "Building openapi-gen"
-echo ${OPENAPI_PKG}
 go build -o openapi-gen ${OPENAPI_PKG}/cmd/openapi-gen

 echo "Generating OpenAPI specification for tensorflow/v1"

pkg/apis/pytorch/v1/types.go (+7, -7)

@@ -88,14 +88,14 @@ type ElasticPolicy struct {

 	MaxRestarts *int32 `json:"maxRestarts,omitempty"`

-	// metrics contains the specifications for which to use to calculate the
+	// Metrics contains the specifications which are used to calculate the
 	// desired replica count (the maximum replica count across all metrics will
-	// be used). The desired replica count is calculated multiplying the
+	// be used). The desired replica count is calculated with multiplying the
 	// ratio between the target value and the current value by the current
-	// number of pods. Ergo, metrics used must decrease as the pod count is
+	// number of pods. Ergo, metrics used must decrease as the pod count is
 	// increased, and vice-versa. See the individual metric source types for
 	// more information about how each type of metric must respond.
-	// If not set, the default metric will be set to 80% average CPU utilization.
+	// If not set, the HPA will not be created.
 	// +optional
 	Metrics []autoscalingv2beta2.MetricSpec `json:"metrics,omitempty"`
 }
@@ -111,14 +111,14 @@ const (
 	// BackendC10D is the rendezvous backend type for C10d.
 	BackendC10D RDZVBackend = "c10d"
 	// BackendETCD is the rendezvous backend type for ETCD.
-	BackendETCD = "etcd"
+	BackendETCD RDZVBackend = "etcd"
 	// BackendETCDV2 is the rendezvous backend type for ETCD v2.
-	BackendETCDV2 = "etcd-v2"
+	BackendETCDV2 RDZVBackend = "etcd-v2"

 	// PyTorchReplicaTypeMaster is the type of Master of distributed PyTorch
 	PyTorchReplicaTypeMaster common.ReplicaType = "Master"
 	// PyTorchReplicaTypeWorker is the type for workers of distributed PyTorch.
-	PyTorchReplicaTypeWorker = "Worker"
+	PyTorchReplicaTypeWorker common.ReplicaType = "Worker"
 )

 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
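The doc-comment change in this hunk is behavioral, not just wording: since the controller no longer applies a default 80% CPU metric, a job that still wants an HPA must set Metrics explicitly. A minimal sketch under that assumption, using the k8s.io/api/autoscaling/v2beta2 types this file already imports (defaultCPUMetrics and int32Ptr are illustrative helpers, not part of the repository):

// Illustrative only: reproduce the removed default (80% average CPU
// utilization) as an explicit MetricSpec for ElasticPolicy.Metrics.
package example

import (
	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
	corev1 "k8s.io/api/core/v1"
)

func int32Ptr(i int32) *int32 { return &i }

func defaultCPUMetrics() []autoscalingv2beta2.MetricSpec {
	return []autoscalingv2beta2.MetricSpec{{
		Type: autoscalingv2beta2.ResourceMetricSourceType,
		Resource: &autoscalingv2beta2.ResourceMetricSource{
			Name: corev1.ResourceCPU,
			Target: autoscalingv2beta2.MetricTarget{
				Type:               autoscalingv2beta2.UtilizationMetricType,
				AverageUtilization: int32Ptr(80),
			},
		},
	}}
}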

pkg/controller.v1/pytorch/hpa.go (+19, -19)

@@ -31,40 +31,40 @@ import (
 func (r *PyTorchJobReconciler) ReconcileHPA(pytorhcjob *pytorchv1.PyTorchJob) error {
 	logger := r.Log.WithValues(pytorchv1.Singular, pytorhcjob.Name)

-	if pytorhcjob.Spec.ElasticPolicy == nil {
-		logger.V(1).Info("No ElasicPolicy is specified, skipping HPA reconciling process")
+	if pytorhcjob.Spec.ElasticPolicy == nil || pytorhcjob.Spec.ElasticPolicy.Metrics == nil {
+		logger.V(1).Info(
+			"No ElasicPolicy or Metric is specified, skipping HPA reconciling process")
 		return nil
 	}

-	// Create or update HPA
-	hpa := &autoscalingv2beta2.HorizontalPodAutoscaler{}
-	err := r.Get(context.TODO(), types.NamespacedName{Name: pytorhcjob.Name, Namespace: pytorhcjob.Namespace}, hpa)
+	current := &autoscalingv2beta2.HorizontalPodAutoscaler{}
+
+	// Get the exepected HPA.
+	expected, err := desiredHPA(pytorhcjob, r.Scheme)
 	if err != nil {
+		return err
+	}
+
+	if err := r.Get(context.TODO(), types.NamespacedName{
+		Name:      pytorhcjob.Name,
+		Namespace: pytorhcjob.Namespace,
+	}, current); err != nil {
 		if !errors.IsNotFound(err) {
 			return err
 		}

 		// Create the new HPA.
-		hpa, err = desiredHPA(pytorhcjob, r.Scheme)
-		if err != nil {
-			return err
-		}
-		logger.V(1).Info("Creating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
-		err = r.Create(context.TODO(), hpa)
+		logger.V(1).Info("Creating HPA", "namespace", expected.Namespace, "name", expected.Name)
+		err = r.Create(context.TODO(), expected)
 		if err != nil {
 			return err
 		}
 		return nil
 	}

-	// Update HPA
-	expected, err := desiredHPA(pytorhcjob, r.Scheme)
-	if err != nil {
-		return err
-	}
-	if !equality.Semantic.DeepEqual(expected.Spec, hpa.Spec) {
-		logger.V(1).Info("Updating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
-		expected.ResourceVersion = hpa.ResourceVersion
+	if !equality.Semantic.DeepEqual(expected.Spec, current.Spec) {
+		logger.V(1).Info("Updating HPA", "namespace", current.Namespace, "name", current.Name)
+		expected.ResourceVersion = current.ResourceVersion
 		err = r.Update(context.TODO(), expected)
 		if err != nil {
 			return err
pkg/controller.v1/pytorch/pytorch.go (+1, -5)

@@ -53,11 +53,7 @@ func SetPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
 		if err != nil {
 			return err
 		}
-		if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeMaster)) {
-			if rank != 0 {
-				return fmt.Errorf("invalid config: There should be only a single master with index=0")
-			}
-		} else {
+		if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeWorker)) {
 			rank = rank + 1
 		}

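The surviving branch leaves the rank layout unchanged in effect: the master keeps rank 0 and worker i becomes rank i+1; only the master-index validation is dropped from this function. A tiny illustrative helper (not in the repository) capturing that layout:

// Illustrative only: rank layout implied by the surviving branch above.
func globalRank(replicaType string, index int) int {
	if replicaType == "worker" { // strings.ToLower of PyTorchReplicaTypeWorker
		return index + 1 // workers are shifted past the single master
	}
	return index // the master keeps rank 0
}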