Skip to content

Commit 37863ac

Browse files
authored
[Monit] Restart telemetry container if memory usage is beyond the threshold (#7645)
Signed-off-by: Yong Zhao [email protected]
Why I did it: This PR monitors the memory usage of the streaming telemetry container and restarts the container if its memory usage is larger than a pre-defined threshold.
How I did it: I used the system tool Monit to run a script, memory_checker, which periodically checks the memory usage of the streaming telemetry container. If the memory usage of the telemetry container is larger than the pre-defined threshold 10 times within 20 cycles, an alerting message is written into syslog and, at the same time, Monit runs the script restart_service to restart the streaming telemetry container.
How to verify it: I verified this implementation on device str-7260cx3-acs-1.
1 parent 0c5c487 commit 37863ac

File tree

4 files changed

+221
-0
lines changed

4 files changed

+221
-0
lines changed

dockers/docker-sonic-telemetry/base_image_files/monit_telemetry

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ check program telemetry|telemetry with path "/usr/bin/process_checker telemetry
99

1010
check program telemetry|dialout_client with path "/usr/bin/process_checker telemetry /usr/sbin/dialout_client_cli"
1111
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
12+
13+
check program container_memory_telemetry with path "/usr/bin/memory_checker telemetry 419430400"
14+
if status == 3 for 10 times within 20 cycles then exec "/usr/bin/restart_service telemetry"

files/build_templates/sonic_debian_extension.j2

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,10 @@ sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
334334
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
335335
sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/
336336
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker
337+
sudo cp $IMAGE_CONFIGS/monit/memory_checker $FILESYSTEM_ROOT/usr/bin/
338+
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/memory_checker
339+
sudo cp $IMAGE_CONFIGS/monit/restart_service $FILESYSTEM_ROOT/usr/bin/
340+
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/restart_service
337341

338342

339343
# Install custom-built openssh sshd
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
memory_checker
5+
6+
This script is part of the feature which will restart the container if memory
7+
usage of it is larger than the threshold value.
8+
9+
This script is used to check the memory usage of the specified container and
10+
is intended to be run by Monit. It will write an alerting message into
11+
syslog if memory usage of the container is larger than the threshold value for X
12+
times within Y cycles/minutes. Note that if print(...) statement in this script
13+
was executed, the string in it will be appended to Monit syslog messages.
14+
15+
The following is an example in Monit configuration file to show how Monit will run
16+
this script:
17+
18+
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
19+
if status == 3 for X times within Y cycles then exec "/usr/bin/restart_service <container_name>"
20+
"""
21+
22+
import argparse
23+
import subprocess
24+
import sys
25+
import syslog
26+
import re
27+
28+
29+
def get_command_result(command):
    """Executes the command in a shell and returns its standard output.

    Any failure is fatal for the script: an error message is written into
    syslog and the script exits with a non-zero status.

    Args:
        command: A string contains the command to be executed.

    Returns:
        A string which contains the output of command, with leading and
        trailing whitespace removed.

    Raises:
        SystemExit: With exit code 1 if the command returned a non-zero code,
            or with exit code 2 if the command could not be launched.
    """
    # Only the calls that can raise OSError/ValueError live inside the 'try'.
    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to execute the command '{}'. Error: '{}'"
                      .format(command, err))
        sys.exit(2)

    if proc_instance.returncode != 0:
        # Log stderr as well: the return code alone rarely explains the failure.
        syslog.syslog(syslog.LOG_ERR,
                      "[memory_checker] Failed to execute the command '{}'. Return code: '{}'. Error: '{}'"
                      .format(command, proc_instance.returncode, command_stderr.strip()))
        sys.exit(1)

    return command_stdout.strip()
54+
55+
56+
def _mem_usage_to_bytes(mem_usage):
    """Converts a memory figure such as '12.5MiB' into a number of bytes.

    Args:
        mem_usage: A string made of a decimal number immediately followed by a unit.

    Returns:
        A float holding the number of bytes, or None if the string does not
        start with a number or its unit is not recognized.
    """
    unit_factors = {
        "B": 1,
        "KiB": 1024,
        "MiB": 1024 ** 2,
        "GiB": 1024 ** 3,
        "TiB": 1024 ** 4,
    }

    match_obj = re.match(r"\d+\.?\d*", mem_usage)
    if not match_obj:
        return None

    factor = unit_factors.get(mem_usage[match_obj.end():].strip())
    if factor is None:
        return None

    return float(match_obj.group()) * factor


def check_memory_usage(container_name, threshold_value):
    """Checks the memory usage of a container and prints an alerting message
    if the memory usage is larger than the threshold value.

    The usage is read from the part of the 'docker stats' MemUsage field that
    precedes the '/' separator.

    Args:
        container_name: A string represents name of a container
        threshold_value: An integer indicates the threshold value (Bytes) of memory usage.

    Returns:
        None.

    Raises:
        SystemExit: With exit code 3 if the memory usage is larger than the
            threshold, or with exit code 4 if the memory usage could not be
            parsed (no numeric value or an unrecognized unit).
    """
    # '{{{{' / '}}}}' become literal '{{' / '}}' after str.format(); the single
    # quotes hand the Go template to docker untouched by the shell.
    command = "docker stats --no-stream --format '{{{{.MemUsage}}}}' {}".format(container_name)
    command_stdout = get_command_result(command)
    mem_usage = command_stdout.split("/")[0].strip()

    mem_usage_bytes = _mem_usage_to_bytes(mem_usage)
    if mem_usage_bytes is None:
        # An unknown unit must not be treated as zero usage, otherwise the
        # check would silently never fire.
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
                      .format(mem_usage))
        sys.exit(4)

    if mem_usage_bytes > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, mem_usage_bytes, threshold_value))
        sys.exit(3)
93+
94+
95+
def main():
    """Reads the container name and threshold from the command line and runs the check."""
    description = ("Check memory usage of a container "
                   "and an alerting message will be written into syslog if memory usage "
                   "is larger than the threshold value")
    arg_parser = argparse.ArgumentParser(
        description=description,
        usage="/usr/bin/memory_checker <container_name> <threshold_value_in_bytes>")

    arg_parser.add_argument("container_name", help="container name")
    # TODO: Currently the threshold value is hard coded as a command line argument and will
    # remove this in the new version since we want to read this value from 'CONFIG_DB'.
    arg_parser.add_argument("threshold_value", type=int, help="threshold value in bytes")

    cli_args = arg_parser.parse_args()
    check_memory_usage(cli_args.container_name, cli_args.threshold_value)


if __name__ == "__main__":
    main()
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
restart_service
5+
6+
This script is part of the feature which will restart the container if memory
7+
usage of it is larger than the threshold value.
8+
9+
This script is intended to be run by Monit and is used to restart the specified
10+
container if the memory usage of it is larger than the threshold value for X
11+
times within Y cycles/minutes.
12+
13+
The following is an example in Monit configuration file to show how Monit will run
14+
this script:
15+
16+
check program container_memory_<container_name> with path "/usr/bin/memory_checker <container_name> <threshold_value>"
17+
if status == 3 for X times within Y cycles then exec "/usr/bin/restart_service <container_name>"
18+
"""
19+
20+
import argparse
21+
import sys
22+
import syslog
23+
import subprocess
24+
25+
26+
def get_command_result(command):
    """Executes command in a shell and returns a status code, stdout and stderr.

    Args:
        command: A string contains the command to be executed.

    Returns:
        An integer status: 0 if the command succeeded, 1 if it returned a
            non-zero exit code, 2 if it could not be launched.
        A string contains the output of stdout.
        A string contains the output of stderr, or the text of the launch
            error when the status is 2.
    """
    command_stdout = ""
    command_stderr = ""

    try:
        proc_instance = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                         shell=True, universal_newlines=True)
        command_stdout, command_stderr = proc_instance.communicate()
    except (OSError, ValueError) as err:
        # Return the error as text so that every path yields (int, str, str).
        return 2, command_stdout.strip(), str(err)

    if proc_instance.returncode != 0:
        return 1, command_stdout.strip(), command_stderr.strip()

    return 0, command_stdout.strip(), command_stderr.strip()
50+
51+
52+
def reset_failed_flag(service_name):
    """Clears the failed status of a systemd service and logs the outcome.

    Args:
        service_name: Name of the service (without the '.service' suffix).

    Returns:
        None
    """
    syslog.syslog(syslog.LOG_INFO, "Resetting failed status of service '{}' ..."
                  .format(service_name))

    status, _, error_text = get_command_result(
        "sudo systemctl reset-failed {}.service".format(service_name))

    # Report the failure first and leave; success is the fall-through path.
    if status != 0:
        syslog.syslog(syslog.LOG_ERR, "Failed to reset failed status of service '{}'. Error: {}"
                      .format(service_name, error_text))
        return

    syslog.syslog(syslog.LOG_INFO, "Succeeded to reset failed status of service '{}.service'."
                  .format(service_name))
73+
74+
75+
def restart_service(service_name):
    """Clears the failed status of a service, then restarts it via systemctl.

    A failed restart is only reported in syslog; nothing is raised or returned.

    Args:
        service_name: Name of specified service (without the '.service' suffix).

    Returns:
        None.
    """
    reset_failed_flag(service_name)

    syslog.syslog(syslog.LOG_INFO, "Restarting service '{}' ...".format(service_name))
    status, _, error_text = get_command_result(
        "sudo systemctl restart {}.service".format(service_name))
    if status == 0:
        return

    syslog.syslog(syslog.LOG_ERR, "Failed to restart the service '{}'. Error: {}"
                  .format(service_name, error_text))
93+
94+
95+
def main():
    """Reads the service name from the command line and restarts that service."""
    arg_parser = argparse.ArgumentParser(
        description="Restart a specific service",
        usage="/usr/bin/restart_service <service_name>")
    arg_parser.add_argument("service_name", help="service name")

    cli_args = arg_parser.parse_args()
    restart_service(cli_args.service_name)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)