Skip to content

Commit 5318ba3

Browse files
authored
feat(host): (#245)
1. Adapt to the PyTorch distributed communication style used in torch-elastic. 2. Support hostnetwork mode without traffic redirection. Signed-off-by: SimonCqk <[email protected]>
1 parent 32bc0df commit 5318ba3

File tree

3 files changed

+26
-8
lines changed

3 files changed

+26
-8
lines changed

controllers/pytorch/pytorchjob_controller.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import (
3838

3939
training "github.com/alibaba/kubedl/apis/training/v1alpha1"
4040
"github.com/alibaba/kubedl/cmd/options"
41+
"github.com/alibaba/kubedl/pkg/features"
4142
"github.com/alibaba/kubedl/pkg/gang_schedule/registry"
4243
"github.com/alibaba/kubedl/pkg/job_controller"
4344
v1 "github.com/alibaba/kubedl/pkg/job_controller/api/v1"
@@ -210,14 +211,20 @@ func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface
210211
return err
211212
}
212213

214+
masterRole := rtype == strings.ToLower(string(training.PyTorchReplicaTypeMaster))
215+
if masterHostPort, ok := job_controller.GetHostNetworkPortFromContext(ctx, "master", "0"); job_controller.EnableHostNetwork(pytorchJob) && ok {
216+
if masterRole || features.KubeDLFeatureGates.Enabled(features.HostNetWithHeadlessSvc) {
217+
masterPort = masterHostPort
218+
}
219+
}
220+
213221
masterAddr := commonutil.GenGeneralName(pytorchJob.Name, strings.ToLower(string(training.PyTorchReplicaTypeMaster)), strconv.Itoa(0))
214-
if rtype == strings.ToLower(string(training.PyTorchReplicaTypeMaster)) {
222+
if masterRole {
215223
if rank != 0 {
216224
return fmt.Errorf("invalid config: There should be only a single master with index=0")
217225
}
218-
masterAddr = "localhost"
219-
if hostPort, ok := job_controller.GetHostNetworkPortFromContext(ctx, rtype, index); ok && job_controller.EnableHostNetwork(pytorchJob) {
220-
masterPort = hostPort
226+
if features.KubeDLFeatureGates.Enabled(features.PyTorchLocalMasterAddr) {
227+
masterAddr = "localhost"
221228
}
222229
} else {
223230
rank++

pkg/features/features.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,15 @@ const (
2828
// DAGScheduling enables DAG scheduling workflow between different job roles.
2929
DAGScheduling featuregate.Feature = "DAGScheduling"
3030

31-
// TODO: migrate other features into featuregates pattern.
31+
// PyTorchLocalMasterAddr explicitly declares that the master uses localhost as its own
32+
// listening address. This is usually adopted with torch versions < 1.9; in the >= 1.9 distributed
33+
// communication style, the master address should match the one used by workers, i.e. the master service name.
34+
PyTorchLocalMasterAddr featuregate.Feature = "PyTorchLocalMasterAddr"
35+
36+
// HostNetWithHeadlessSvc establishes connections between pods through a headless service
37+
// instead of a normal service with different port/targetPort. It bypasses traffic routing,
38+
// but a pod may not find the correct host port after a failover.
39+
HostNetWithHeadlessSvc featuregate.Feature = "HostNetWithHeadlessSvc"
3240
)
3341

3442
func init() {
@@ -39,7 +47,9 @@ var (
3947
KubeDLFeatureGates = featuregate.NewFeatureGate()
4048

4149
defaultKubeDLFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
42-
GangScheduling: {Default: true, PreRelease: featuregate.Beta},
43-
DAGScheduling: {Default: true, PreRelease: featuregate.Beta},
50+
GangScheduling: {Default: true, PreRelease: featuregate.Beta},
51+
DAGScheduling: {Default: true, PreRelease: featuregate.Beta},
52+
PyTorchLocalMasterAddr: {Default: true, PreRelease: featuregate.Beta},
53+
HostNetWithHeadlessSvc: {Default: false, PreRelease: featuregate.Alpha},
4454
}
4555
)

pkg/job_controller/service.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333

3434
log "github.com/sirupsen/logrus"
3535

36+
"github.com/alibaba/kubedl/pkg/features"
3637
apiv1 "github.com/alibaba/kubedl/pkg/job_controller/api/v1"
3738
commonutil "github.com/alibaba/kubedl/pkg/util"
3839
)
@@ -290,7 +291,7 @@ func (jc *JobController) CreateNewService(ctx context.Context, job metav1.Object
290291
targetPort := svcPort
291292
clusterIP := "None"
292293

293-
if EnableHostNetwork(job) {
294+
if !features.KubeDLFeatureGates.Enabled(features.HostNetWithHeadlessSvc) && EnableHostNetwork(job) {
294295
// Replicas communicate through headless services by default. In hostnetwork mode, however,
295296
// a headless service cannot forward traffic from one port to another, so a normal service
296297
// is used when hostnetwork is enabled (unless the HostNetWithHeadlessSvc feature gate is on).

0 commit comments

Comments
 (0)