Skip to content

Commit cec9d7b

Browse files
authored
Revert "Add watchdog mechanism to swss service and generate alert when swss have issue. (#14686)" (#15390)
This reverts commit 44427a2. The Docker image was not updated during PR validation, which caused PR check failures, so this revert is being force-merged. Once the cache has been updated after this PR is merged, the issue should be fixed.
1 parent 0f194c5 commit cec9d7b

File tree

4 files changed

+16
-44
lines changed

4 files changed

+16
-44
lines changed

dockers/docker-orchagent/docker-init.j2

-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ CFGGEN_PARAMS=" \
1818
-t /usr/share/sonic/templates/vlan_vars.j2 \
1919
-t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
2020
-t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
21-
-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
2221
-t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf
2322
-t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
2423
"

dockers/docker-orchagent/supervisord.conf.j2

+1-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ buffer_size=1024
1414

1515
[eventlistener:supervisor-proc-exit-listener]
1616
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
17-
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
17+
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
1818
autostart=true
1919
autorestart=unexpected
2020
buffer_size=1024
@@ -75,7 +75,6 @@ command=/usr/bin/orchagent.sh
7575
priority=4
7676
autostart=false
7777
autorestart=false
78-
stdout_capture_maxbytes=1MB
7978
stdout_logfile=syslog
8079
stderr_logfile=syslog
8180
dependent_startup=true

dockers/docker-orchagent/watchdog_processes.j2

-1
This file was deleted.

files/scripts/supervisor-proc-exit-listener

+15-40
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@ from swsscommon import swsscommon
1414

1515
from supervisor import childutils
1616

17-
# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
18-
# following format:
19-
#
20-
# program:<process_name>
21-
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
22-
2317
# Each line of this file should specify either one critical process or one
2418
# critical process group, (as defined in supervisord.conf file), in the
2519
# following format:
@@ -40,40 +34,40 @@ ALERTING_INTERVAL_SECS = 60
4034
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
4135
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
4236

43-
def get_group_and_process_list(process_file):
37+
def get_critical_group_and_process_list():
4438
"""
45-
@summary: Read the critical processes/group names.
39+
@summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
4640
@return: Two lists which contain critical group names and critical process names respectively.
4741
"""
48-
group_list = []
49-
process_list = []
42+
critical_group_list = []
43+
critical_process_list = []
5044

51-
with open(process_file, 'r') as file:
45+
with open(CRITICAL_PROCESSES_FILE, 'r') as file:
5246
for line in file:
5347
# ignore blank lines
5448
if re.match(r"^\s*$", line):
5549
continue
5650
line_info = line.strip(' \n').split(':')
5751
if len(line_info) != 2:
5852
syslog.syslog(syslog.LOG_ERR,
59-
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
53+
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
6054
sys.exit(5)
6155

6256
identifier_key = line_info[0].strip()
6357
identifier_value = line_info[1].strip()
6458
if identifier_key == "group" and identifier_value:
65-
group_list.append(identifier_value)
59+
critical_group_list.append(identifier_value)
6660
elif identifier_key == "program" and identifier_value:
67-
process_list.append(identifier_value)
61+
critical_process_list.append(identifier_value)
6862
else:
6963
syslog.syslog(syslog.LOG_ERR,
70-
"Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
64+
"Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
7165
sys.exit(6)
7266

73-
return group_list, process_list
67+
return critical_group_list, critical_process_list
7468

7569

76-
def generate_alerting_message(process_name, status, dead_minutes):
70+
def generate_alerting_message(process_name, dead_minutes):
7771
"""
7872
@summary: If a critical process is not running, this function will determine whether it resides in the host
7973
or in a specific namespace. Then an alerting message will be written into syslog.
@@ -86,8 +80,8 @@ def generate_alerting_message(process_name, status, dead_minutes):
8680
else:
8781
namespace = namespace_prefix + namespace_id
8882

89-
syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
90-
.format(process_name, status, namespace, dead_minutes))
83+
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
84+
.format(process_name, namespace, dead_minutes))
9185

9286

9387
def get_autorestart_state(container_name):
@@ -131,11 +125,9 @@ def main(argv):
131125
syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
132126
sys.exit(1)
133127

134-
critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
135-
_, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)
128+
critical_group_list, critical_process_list = get_critical_group_and_process_list()
136129

137130
process_under_alerting = defaultdict(dict)
138-
process_heart_beat_info = defaultdict(dict)
139131
# Transition from ACKNOWLEDGED to READY
140132
childutils.listener.ready()
141133
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -175,15 +167,6 @@ def main(argv):
175167
if process_name in process_under_alerting:
176168
process_under_alerting.pop(process_name)
177169

178-
# Handle the PROCESS_COMMUNICATION_STDOUT event
179-
elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
180-
payload_headers, payload_data = childutils.eventdata(payload + '\n')
181-
process_name = payload_headers['processname']
182-
183-
# update process heart beat time
184-
if (process_name in watch_process_list):
185-
process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
186-
187170
# Transition from BUSY to ACKNOWLEDGED
188171
childutils.listener.ok()
189172

@@ -198,15 +181,7 @@ def main(argv):
198181
elapsed_mins = elapsed_secs // 60
199182
process_under_alerting[process_name]["last_alerted"] = epoch_time
200183
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
201-
generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
202-
203-
# Check whether we need write alerting messages into syslog
204-
for process in process_heart_beat_info.keys():
205-
epoch_time = time.time()
206-
elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
207-
if elapsed_secs >= ALERTING_INTERVAL_SECS:
208-
elapsed_mins = elapsed_secs // 60
209-
generate_alerting_message(process, "stuck", elapsed_mins)
184+
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
210185

211186
if __name__ == "__main__":
212187
main(sys.argv[1:])

0 commit comments

Comments
 (0)