Skip to content

Commit 36acf2c

Browse files
committed
NSM datapath monitoring on proxy NSC
Introduce optional NSM datapath monitoring/healing (a liveness check) that verifies the connection is alive by pinging the endpoint from the client. For now, this option is available only between the proxy and the LBs, and must be enabled via new environment variables introduced in the proxy.
1 parent db3066c commit 36acf2c

File tree

11 files changed

+277
-28
lines changed

11 files changed

+277
-28
lines changed

cmd/proxy/internal/client/fullmesh.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"github.com/google/uuid"
2929
"github.com/networkservicemesh/api/pkg/api/networkservice"
3030
"github.com/networkservicemesh/api/pkg/api/registry"
31+
"github.com/networkservicemesh/sdk/pkg/networkservice/common/heal"
3132
registryrefresh "github.com/networkservicemesh/sdk/pkg/registry/common/refresh"
3233
registrysendfd "github.com/networkservicemesh/sdk/pkg/registry/common/sendfd"
3334
registrychain "github.com/networkservicemesh/sdk/pkg/registry/core/chain"
@@ -237,9 +238,9 @@ func (fmnsc *FullMeshNetworkServiceClient) prepareQuery() *registry.NetworkServi
237238
// monitoring Network Service Endpoints belonging to the Network Service of the request.
238239
// Connects to each new Network Service Endpoint, and closes connection when a known
239240
// endpoint disappears.
240-
func NewFullMeshNetworkServiceClient(ctx context.Context, config *Config, additionalFunctionality ...networkservice.NetworkServiceClient) NetworkServiceClient {
241+
func NewFullMeshNetworkServiceClient(ctx context.Context, config *Config, healOptions []heal.Option, additionalFunctionality ...networkservice.NetworkServiceClient) NetworkServiceClient {
241242
// create base client relying on NSM's client.NewClient API
242-
client := newClient(ctx, config.Name, config.APIClient, additionalFunctionality...)
243+
client := newClient(ctx, config.Name, config.APIClient, healOptions, additionalFunctionality...)
243244

244245
fullMeshNetworkServiceClient := &FullMeshNetworkServiceClient{
245246
networkServiceClients: make(map[string]NetworkServiceClient),

cmd/proxy/internal/client/utils.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,15 @@ func expirationTimeIsNull(expirationTime *timestamppb.Timestamp) bool {
4141
// Refresh Client comes from the NSM sdk version used. (In case of NSM v1.1.1 the built-in
4242
// refresh might lead to connection issues if the different path segments have different
4343
// maxTokenLifetime configured (unless the NSC side has the lowest maxtokenlifetime)).
44-
func newClient(ctx context.Context, name string, nsmAPIClient *nsm.APIClient, additionalFunctionality ...networkservice.NetworkServiceClient) networkservice.NetworkServiceClient {
44+
func newClient(ctx context.Context, name string, nsmAPIClient *nsm.APIClient, healOptions []heal.Option, additionalFunctionality ...networkservice.NetworkServiceClient) networkservice.NetworkServiceClient {
4545
additionalFunctionality = append(additionalFunctionality,
4646
sendfd.NewClient(),
4747
)
4848

4949
return client.NewClient(ctx,
5050
client.WithClientURL(&nsmAPIClient.Config.ConnectTo),
5151
client.WithName(name),
52-
client.WithHealClient(heal.NewClient(ctx)),
52+
client.WithHealClient(heal.NewClient(ctx, healOptions...)),
5353
client.WithAdditionalFunctionality(additionalFunctionality...),
5454
client.WithDialTimeout(nsmAPIClient.Config.DialTimeout),
5555
client.WithDialOptions(nsmAPIClient.GRPCDialOption...),

cmd/proxy/internal/config/config.go

+24-21
Original file line numberDiff line numberDiff line change
@@ -25,27 +25,30 @@ import (
2525

2626
// Config for the proxy
2727
type Config struct {
28-
Name string `default:"proxy" desc:"Pod Name"`
29-
ServiceName string `default:"proxy" desc:"Name of the Network Service" split_words:"true"`
30-
ConnectTo url.URL `default:"unix:///var/lib/networkservicemesh/nsm.io.sock" desc:"url to connect to NSM" split_words:"true"`
31-
DialTimeout time.Duration `default:"5s" desc:"timeout to dial NSMgr" split_words:"true"`
32-
RequestTimeout time.Duration `default:"15s" desc:"timeout to request NSE" split_words:"true"`
33-
MaxTokenLifetime time.Duration `default:"24h" desc:"maximum lifetime of tokens" split_words:"true"`
34-
IPAMService string `default:"ipam-service:7777" desc:"IP (or domain) and port of the IPAM Service" split_words:"true"`
35-
Host string `default:"" desc:"Host name the proxy is running on" split_words:"true"`
36-
NetworkServiceName string `default:"load-balancer" desc:"Name of the network service the proxy request the connection" split_words:"true"`
37-
Namespace string `default:"default" desc:"Namespace the pod is running on" split_words:"true"`
38-
Trench string `default:"default" desc:"Trench the pod is running on" split_words:"true"`
39-
Conduit string `default:"load-balancer" desc:"Name of the conduit" split_words:"true"`
40-
NSPServiceName string `default:"nsp-service" desc:"IP (or domain) of the NSP Service" split_words:"true"`
41-
NSPServicePort int `default:"7778" desc:"port of the NSP Service" split_words:"true"`
42-
IPFamily string `default:"dualstack" desc:"ip family" envconfig:"ip_family"`
43-
LogLevel string `default:"DEBUG" desc:"Log level" split_words:"true"`
44-
MTU int `default:"1500" desc:"Conduit MTU considered by local NSCs and NSE composing the network mesh" split_words:"true"`
45-
GRPCKeepaliveTime time.Duration `default:"30s" desc:"gRPC keepalive timeout"`
46-
GRPCProbeRPCTimeout time.Duration `default:"1s" desc:"RPC timeout of internal gRPC health probe" envconfig:"grpc_probe_rpc_timeout"`
47-
GRPCMaxBackoff time.Duration `default:"5s" desc:"Upper bound on gRPC connection backoff delay" envconfig:"grpc_max_backoff"`
48-
IPReleaseDelay time.Duration `default:"20s" desc:"delay releasing IP address of NSM connection" envconfig:"ip_release_delay"`
28+
Name string `default:"proxy" desc:"Pod Name"`
29+
ServiceName string `default:"proxy" desc:"Name of the Network Service" split_words:"true"`
30+
ConnectTo url.URL `default:"unix:///var/lib/networkservicemesh/nsm.io.sock" desc:"url to connect to NSM" split_words:"true"`
31+
DialTimeout time.Duration `default:"5s" desc:"timeout to dial NSMgr" split_words:"true"`
32+
RequestTimeout time.Duration `default:"15s" desc:"timeout to request NSE" split_words:"true"`
33+
MaxTokenLifetime time.Duration `default:"24h" desc:"maximum lifetime of tokens" split_words:"true"`
34+
IPAMService string `default:"ipam-service:7777" desc:"IP (or domain) and port of the IPAM Service" split_words:"true"`
35+
Host string `default:"" desc:"Host name the proxy is running on" split_words:"true"`
36+
NetworkServiceName string `default:"load-balancer" desc:"Name of the network service the proxy request the connection" split_words:"true"`
37+
Namespace string `default:"default" desc:"Namespace the pod is running on" split_words:"true"`
38+
Trench string `default:"default" desc:"Trench the pod is running on" split_words:"true"`
39+
Conduit string `default:"load-balancer" desc:"Name of the conduit" split_words:"true"`
40+
NSPServiceName string `default:"nsp-service" desc:"IP (or domain) of the NSP Service" split_words:"true"`
41+
NSPServicePort int `default:"7778" desc:"port of the NSP Service" split_words:"true"`
42+
IPFamily string `default:"dualstack" desc:"ip family" envconfig:"ip_family"`
43+
LogLevel string `default:"DEBUG" desc:"Log level" split_words:"true"`
44+
MTU int `default:"1500" desc:"Conduit MTU considered by local NSCs and NSE composing the network mesh" split_words:"true"`
45+
GRPCKeepaliveTime time.Duration `default:"30s" desc:"gRPC keepalive timeout"`
46+
GRPCProbeRPCTimeout time.Duration `default:"1s" desc:"RPC timeout of internal gRPC health probe" envconfig:"grpc_probe_rpc_timeout"`
47+
GRPCMaxBackoff time.Duration `default:"5s" desc:"Upper bound on gRPC connection backoff delay" envconfig:"grpc_max_backoff"`
48+
IPReleaseDelay time.Duration `default:"20s" desc:"delay releasing IP address of NSM connection" envconfig:"ip_release_delay"`
49+
LivenessCheckInterval time.Duration `default:"2s" desc:"Dataplane liveness check interval" split_words:"true"`
50+
LivenessCheckTimeout time.Duration `default:"1s" desc:"Dataplane liveness check timeout" split_words:"true"`
51+
LivenessCheckEnabled bool `default:"false" desc:"Dataplane liveness check enabled/disabled" split_words:"true"`
4952
}
5053

5154
// IsValid checks if the configuration is valid

cmd/proxy/internal/service/client.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/networkservicemesh/api/pkg/api/networkservice/payload"
2727
"github.com/networkservicemesh/sdk-sriov/pkg/networkservice/common/mechanisms/vfio"
2828
sriovtoken "github.com/networkservicemesh/sdk-sriov/pkg/networkservice/common/token"
29+
"github.com/networkservicemesh/sdk/pkg/networkservice/common/heal"
2930
"github.com/networkservicemesh/sdk/pkg/networkservice/common/mechanisms"
3031
"github.com/networkservicemesh/sdk/pkg/networkservice/common/mechanisms/kernel"
3132
"github.com/networkservicemesh/sdk/pkg/networkservice/core/chain"
@@ -46,6 +47,7 @@ func GetNSC(ctx context.Context,
4647
config *config.Config,
4748
nsmAPIClient *nsm.APIClient,
4849
p *proxy.Proxy,
50+
healOptions []heal.Option,
4951
interfaceMonitorClient networkservice.NetworkServiceClient) client.NetworkServiceClient {
5052

5153
logger := log.FromContextOrGlobal(ctx).WithValues("func", "GetNSC")
@@ -70,7 +72,7 @@ func GetNSC(ctx context.Context,
7072
proxyHealth.NewClient(),
7173
fullmeshtracker.NewClient(),
7274
)
73-
fullMeshClient := client.NewFullMeshNetworkServiceClient(ctx, clientConfig, additionalFunctionality)
75+
fullMeshClient := client.NewFullMeshNetworkServiceClient(ctx, clientConfig, healOptions, additionalFunctionality)
7476

7577
return fullMeshClient
7678
}

cmd/proxy/main.go

+12-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828

2929
"github.com/kelseyhightower/envconfig"
3030
"github.com/networkservicemesh/api/pkg/api/networkservice"
31+
"github.com/networkservicemesh/sdk/pkg/networkservice/common/heal"
3132
"github.com/networkservicemesh/sdk/pkg/tools/grpcutils"
3233
nsmlog "github.com/networkservicemesh/sdk/pkg/tools/log"
3334
ipamAPI "github.com/nordix/meridio/api/ipam/v1"
@@ -41,6 +42,7 @@ import (
4142
"github.com/nordix/meridio/pkg/health/probe"
4243
linuxKernel "github.com/nordix/meridio/pkg/kernel"
4344
"github.com/nordix/meridio/pkg/nsm"
45+
kernelheal "github.com/nordix/meridio/pkg/nsm/heal"
4446
"github.com/nordix/meridio/pkg/nsm/interfacemonitor"
4547
nsmmonitor "github.com/nordix/meridio/pkg/nsm/monitor"
4648
"github.com/nordix/meridio/pkg/nsp"
@@ -236,9 +238,18 @@ func main() {
236238
monitorClient := networkservice.NewMonitorConnectionClient(cc)
237239
go nsmmonitor.ConnectionMonitor(ctx, config.Name, monitorClient)
238240

241+
healOptions := []heal.Option{}
242+
if config.LivenessCheckEnabled {
243+
healOptions = []heal.Option{
244+
heal.WithLivenessCheckInterval(config.LivenessCheckInterval),
245+
heal.WithLivenessCheckTimeout(config.LivenessCheckTimeout),
246+
heal.WithLivenessCheck(kernelheal.KernelLivenessCheck),
247+
}
248+
}
249+
239250
// create and start NSC that connects all remote NSE belonging to the right service
240251
interfaceMonitorClient := interfacemonitor.NewClient(interfaceMonitor, p, netUtils)
241-
nsmClient := service.GetNSC(ctx, &config, nsmAPIClient, p, interfaceMonitorClient)
252+
nsmClient := service.GetNSC(ctx, &config, nsmAPIClient, p, healOptions, interfaceMonitorClient)
242253
defer nsmClient.Close()
243254
go func() {
244255
service.StartNSC(nsmClient, config.NetworkServiceName)

cmd/tapa/main.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"os"
2626
"os/signal"
2727
"syscall"
28+
"time"
2829

2930
"github.com/go-logr/logr"
3031
"github.com/kelseyhightower/envconfig"
@@ -49,6 +50,7 @@ import (
4950
linuxKernel "github.com/nordix/meridio/pkg/kernel"
5051
"github.com/nordix/meridio/pkg/log"
5152
"github.com/nordix/meridio/pkg/nsm"
53+
kernelheal "github.com/nordix/meridio/pkg/nsm/heal"
5254
"github.com/nordix/meridio/pkg/nsm/interfacename"
5355
"github.com/sirupsen/logrus"
5456
"google.golang.org/grpc"
@@ -155,10 +157,16 @@ func main() {
155157
sendfd.NewClient(),
156158
}
157159

160+
healOptions := []heal.Option{
161+
heal.WithLivenessCheckInterval(2 * time.Second),
162+
heal.WithLivenessCheckTimeout(1 * time.Second),
163+
heal.WithLivenessCheck(kernelheal.KernelLivenessCheck),
164+
}
165+
158166
networkServiceClient := client.NewClient(ctx,
159167
client.WithClientURL(&nsmAPIClient.Config.ConnectTo),
160168
client.WithName(config.Name),
161-
client.WithHealClient(heal.NewClient(ctx)),
169+
client.WithHealClient(heal.NewClient(ctx, healOptions...)),
162170
client.WithAdditionalFunctionality(additionalFunctionality...),
163171
client.WithDialTimeout(nsmAPIClient.Config.DialTimeout),
164172
client.WithDialOptions(nsmAPIClient.GRPCDialOption...),

docs/components/proxy.md

+8
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ NSM_NSP_SERVICE_NAME | string | IP (or domain) of the NSP Service | nsp-service
3939
NSM_NSP_SERVICE_PORT | int | port of the NSP Service | 7778
4040
NSM_IP_FAMILY | string | ip family | dualstack
4141
NSM_LOG_LEVEL | string | Log level | DEBUG
42+
NSM_MTU | int | Conduit MTU considered by local NSCs and NSE composing the network mesh | 1500
43+
NSM_GRPC_KEEPALIVE_TIME | time.Duration | gRPC keepalive timeout | 30s
44+
NSM_GRPC_PROBE_RPC_TIMEOUT | time.Duration | RPC timeout of internal gRPC health probe | 1s
45+
NSM_GRPC_MAX_BACKOFF | time.Duration | Upper bound on gRPC connection backoff delay | 5s
46+
NSM_IP_RELEASE_DELAY | time.Duration | delay releasing IP address of NSM connection | 20s
47+
NSM_LIVENESS_CHECK_INTERVAL | time.Duration | Dataplane liveness check interval | 2s
48+
NSM_LIVENESS_CHECK_TIMEOUT | time.Duration | Dataplane liveness check timeout | 1s
49+
NSM_LIVENESS_CHECK_ENABLED | bool | Dataplane liveness check enabled/disabled | false
4250

4351
## Command Line
4452

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ require (
77
github.com/faisal-memon/sviddisk v0.0.0-20211007205134-77ccea0b9271
88
github.com/go-logr/logr v1.4.1
99
github.com/go-logr/zapr v1.3.0
10+
github.com/go-ping/ping v1.0.0
1011
github.com/golang/mock v1.6.0
1112
github.com/google/nftables v0.1.0
1213
github.com/google/uuid v1.3.1

go.sum

+3
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2Kv
106106
github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
107107
github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g=
108108
github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
109+
github.com/go-ping/ping v1.0.0 h1:34GZiqLDqqIHEeL5NZIz7jSnMluK7/p0qDB436yO6H0=
110+
github.com/go-ping/ping v1.0.0/go.mod h1:35JbSyV/BYqHwwRA6Zr1uVDm1637YlNOU61wI797NPI=
109111
github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
110112
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
111113
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
@@ -412,6 +414,7 @@ golang.org/x/net v0.0.0-20191007182048-72f939374954/go.mod h1:z5CRVTTTmAJ677TzLL
412414
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
413415
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
414416
golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
417+
golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
415418
golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
416419
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
417420
golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=

0 commit comments

Comments
 (0)