Skip to content

Commit 99a2a26

Browse files
authored
Add orchagent heart beat message for watchdog. (sonic-net#2737)
**What I did** Improved orchagent: it now outputs a heartbeat message to systemd. **Why I did it** Currently the SONiC monit system only checks whether the orchagent process exists. If the orchagent process gets stuck and stops processing, the current monit cannot detect or report it. **How I verified it** All UTs pass. Manually validated that the heartbeat message works correctly. **Details if related** Another in-progress PR will add a watchdog for this heartbeat message: sonic-net#14686. sonic-mgmt UT PR: sonic-net/sonic-mgmt#8306
1 parent e37e55f commit 99a2a26

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

orchagent/orchdaemon.cpp

+18
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "logger.h"
77
#include <sairedis.h>
88
#include "warm_restart.h"
9+
#include <iostream>
910

1011
#define SAI_SWITCH_ATTR_CUSTOM_RANGE_BASE SAI_SWITCH_ATTR_CUSTOM_RANGE_START
1112
#include "sairedis.h"
@@ -18,6 +19,9 @@ using namespace swss;
1819
#define SELECT_TIMEOUT 1000
1920
#define PFC_WD_POLL_MSECS 100
2021

22+
/* orchagent heart beat message interval */
23+
#define HEART_BEAT_INTERVAL_MSECS (10 * 1000) /* milliseconds; parenthesized so the expansion stays one value inside larger expressions (e.g. x / HEART_BEAT_INTERVAL_MSECS) */
24+
2125
extern sai_switch_api_t* sai_switch_api;
2226
extern sai_object_id_t gSwitchId;
2327
extern bool gSaiRedisLogRotate;
@@ -72,6 +76,7 @@ OrchDaemon::OrchDaemon(DBConnector *applDb, DBConnector *configDb, DBConnector *
7276
{
7377
SWSS_LOG_ENTER();
7478
m_select = new Select();
79+
m_lastHeartBeat = std::chrono::high_resolution_clock::now();
7580
}
7681

7782
OrchDaemon::~OrchDaemon()
@@ -722,6 +727,7 @@ void OrchDaemon::start()
722727
ret = m_select->select(&s, SELECT_TIMEOUT);
723728

724729
auto tend = std::chrono::high_resolution_clock::now();
730+
heartBeat(tend);
725731

726732
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(tend - tstart);
727733

@@ -957,6 +963,18 @@ void OrchDaemon::addOrchList(Orch *o)
957963
m_orchList.push_back(o);
958964
}
959965

966+
// Emit a rate-limited heartbeat marker on standard output.
//
// tcurrent: the caller's current time point. When at least
// HEART_BEAT_INTERVAL_MSECS milliseconds have passed between m_lastHeartBeat
// and tcurrent, m_lastHeartBeat is set to tcurrent and one marker line is
// written to cout; otherwise nothing is written and no state changes.
void OrchDaemon::heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent)
967+
{
968+
// Milliseconds elapsed since the last emitted heartbeat (the message itself goes to stdout below, not to syslog).
969+
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(tcurrent - m_lastHeartBeat);
970+
if (diff.count() >= HEART_BEAT_INTERVAL_MSECS)
971+
{
972+
m_lastHeartBeat = tcurrent;
973+
// Output the heartbeat message to supervisord as a 'PROCESS_COMMUNICATION_STDOUT' event: http://supervisord.org/events.html
// endl also flushes cout - presumably so the marker is not held back in a buffer; confirm before replacing it with '\n'.
974+
cout << "<!--XSUPERVISOR:BEGIN-->heartbeat<!--XSUPERVISOR:END-->" << endl;
975+
}
976+
}
977+
960978
FabricOrchDaemon::FabricOrchDaemon(DBConnector *applDb, DBConnector *configDb, DBConnector *stateDb, DBConnector *chassisAppDb) :
961979
OrchDaemon(applDb, configDb, stateDb, chassisAppDb),
962980
m_applDb(applDb),

orchagent/orchdaemon.h

+4
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,12 @@ class OrchDaemon
9090

9191
std::vector<Orch *> m_orchList;
9292
Select *m_select;
93+
94+
std::chrono::time_point<std::chrono::high_resolution_clock> m_lastHeartBeat;
9395

9496
void flush();
97+
98+
void heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent);
9599
};
96100

97101
class FabricOrchDaemon : public OrchDaemon

0 commit comments

Comments
 (0)