From a8c54ad1914f6a4b8f663c7b37909aa7e59a984e Mon Sep 17 00:00:00 2001 From: Stepan Blyschak Date: Fri, 5 Jul 2024 13:10:34 +0000 Subject: [PATCH 1/2] [healthd] fix healthd shutdown race Signed-off-by: Stepan Blyschak --- src/system-health/health_checker/sysmonitor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index 115dbfbe9ea0..306419512865 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -507,8 +507,6 @@ def task_worker(self): def task_stop(self): # Signal the process to stop self.task_stopping_event.set() - #Clear the resources of mpmgr- Queue - self.mpmgr.shutdown() # Wait for the process to exit self._task_process.join(self._stop_timeout_secs) From ade9ec54291e20a084a68141b61e0098b62c1518 Mon Sep 17 00:00:00 2001 From: Stepan Blyschak Date: Thu, 17 Oct 2024 10:12:06 +0000 Subject: [PATCH 2/2] fix shutdown hang Signed-off-by: Stepan Blyschak --- .../health_checker/sysmonitor.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index 306419512865..4c2e4abcde61 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -104,6 +104,11 @@ def task_worker(self): logger.log_info("Start Listening to systemd bus (pid {0})".format(os.getpid())) self.subscribe_sysbus() + def task_stop(self): + # FIXME: Gracefully stop `loop.run()`. + self._task_process.kill() + return True + def task_notify(self, msg): if self.task_stopping_event.is_set(): return @@ -478,9 +483,11 @@ def system_service(self): from queue import Empty # Queue to receive the STATEDB and Systemd state change event - while not self.task_stopping_event.is_set(): + while True: try: msg = self.myQ.get(timeout=QUEUE_TIMEOUT) + if msg == "stop": + break event = msg["unit"] event_src = msg["evt_src"] event_time = msg["time"] @@ -500,13 +507,10 @@ def system_service(self): monitor_statedb_table.task_stop() def task_worker(self): - if self.task_stopping_event.is_set(): - return self.system_service() def task_stop(self): - # Signal the process to stop - self.task_stopping_event.set() + self.myQ.put("stop") # Wait for the process to exit self._task_process.join(self._stop_timeout_secs) @@ -514,12 +518,8 @@ def task_stop(self): # If the process didn't exit, attempt to kill it if self._task_process.is_alive(): logger.log_notice("Attempting to kill sysmon main process with pid {}".format(self._task_process.pid)) - os.kill(self._task_process.pid, signal.SIGKILL) - - if self._task_process.is_alive(): - logger.log_error("Sysmon main process with pid {} could not be killed".format(self._task_process.pid)) + self._task_process.kill() + self._task_process.join() return False return True - -