Skip to content

Commit 257d32d

Browse files
authored
Add heartbeat interval parameter. (#3458)
* Add heartbeat interval parameter * Disable the feature when the interval is 0. Why I did it: Allow this feature to be disabled, because of a log spam issue on small-disk devices: sonic-net/sonic-buildimage#21157. Work item tracking: Microsoft ADO: 30594076
1 parent 455027e commit 257d32d

File tree

4 files changed

+74
-16
lines changed

4 files changed

+74
-16
lines changed

orchagent/main.cpp

+24-3
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ extern bool gIsNatSupported;
6262
#define SWSS_RECORD_ENABLE (0x1 << 1)
6363
#define RESPONSE_PUBLISHER_RECORD_ENABLE (0x1 << 2)
6464

65+
/* orchagent heart beat message interval */
66+
/* Default heartbeat interval in milliseconds. Parenthesized so the macro expands as a single value inside larger expressions (e.g. x / MACRO). */
#define HEART_BEAT_INTERVAL_MSECS_DEFAULT (10 * 1000)
67+
6568
string gMySwitchType = "";
6669
int32_t gVoqMySwitchId = -1;
6770
int32_t gVoqMaxCores = 0;
@@ -73,7 +76,7 @@ uint32_t create_switch_timeout = 0;
7376

7477
void usage()
7578
{
76-
cout << "usage: orchagent [-h] [-r record_type] [-d record_location] [-f swss_rec_filename] [-j sairedis_rec_filename] [-b batch_size] [-m MAC] [-i INST_ID] [-s] [-z mode] [-k bulk_size] [-q zmq_server_address] [-c mode] [-t create_switch_timeout] [-v VRF]" << endl;
79+
cout << "usage: orchagent [-h] [-r record_type] [-d record_location] [-f swss_rec_filename] [-j sairedis_rec_filename] [-b batch_size] [-m MAC] [-i INST_ID] [-s] [-z mode] [-k bulk_size] [-q zmq_server_address] [-c mode] [-t create_switch_timeout] [-v VRF] [-I heart_beat_interval]" << endl;
7780
cout << " -h: display this message" << endl;
7881
cout << " -r record_type: record orchagent logs with type (default 3)" << endl;
7982
cout << " Bit 0: sairedis.rec, Bit 1: swss.rec, Bit 2: responsepublisher.rec. For example:" << endl;
@@ -95,6 +98,7 @@ void usage()
9598
cout << " -c counter mode (traditional|asic_db), default: asic_db" << endl;
9699
cout << " -t Override create switch timeout, in sec" << endl;
97100
cout << " -v vrf: VRF name (default empty)" << endl;
101+
// Help text for -I: the default is 10 * 1000 ms, and an interval of 0 disables the heartbeat.
cout << " -I heart_beat_interval: Heart beat interval in milliseconds, 0 to disable (default 10000)" << endl;
98102
}
99103

100104
void sighup_handler(int signo)
@@ -349,8 +353,9 @@ int main(int argc, char **argv)
349353
bool enable_zmq = false;
350354
string responsepublisher_rec_filename = Recorder::RESPPUB_FNAME;
351355
int record_type = 3; // Only swss and sairedis recordings enabled by default.
356+
long heartBeatInterval = HEART_BEAT_INTERVAL_MSECS_DEFAULT;
352357

353-
while ((opt = getopt(argc, argv, "b:m:r:f:j:d:i:hsz:k:q:c:t:v:")) != -1)
358+
while ((opt = getopt(argc, argv, "b:m:r:f:j:d:i:hsz:k:q:c:t:v:I:")) != -1)
354359
{
355360
switch (opt)
356361
{
@@ -450,6 +455,22 @@ int main(int argc, char **argv)
450455
vrf = optarg;
451456
}
452457
break;
458+
case 'I':
459+
if (optarg)
460+
{
461+
auto interval = atoi(optarg);
462+
if (interval >= 0)
463+
{
464+
heartBeatInterval = interval;
465+
SWSS_LOG_NOTICE("Setting heartbeat interval as %ld", heartBeatInterval);
466+
}
467+
else
468+
{
469+
heartBeatInterval = HEART_BEAT_INTERVAL_MSECS_DEFAULT;
470+
SWSS_LOG_ERROR("Invalid input for heartbeat interval: %d. use default interval: %ld", interval, heartBeatInterval);
471+
}
472+
}
473+
break;
453474
default: /* '?' */
454475
exit(EXIT_FAILURE);
455476
}
@@ -815,7 +836,7 @@ int main(int argc, char **argv)
815836
syncd_apply_view();
816837
}
817838

818-
orchDaemon->start();
839+
orchDaemon->start(heartBeatInterval);
819840

820841
return 0;
821842
}

orchagent/orchdaemon.cpp

+13-10
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ using namespace swss;
2323
#define APP_FABRIC_MONITOR_PORT_TABLE_NAME "FABRIC_PORT_TABLE"
2424
#define APP_FABRIC_MONITOR_DATA_TABLE_NAME "FABRIC_MONITOR_TABLE"
2525

26-
/* orchagent heart beat message interval */
27-
#define HEART_BEAT_INTERVAL_MSECS 10 * 1000
28-
2926
extern sai_switch_api_t* sai_switch_api;
3027
extern sai_object_id_t gSwitchId;
3128
extern string gMySwitchType;
@@ -828,7 +825,7 @@ void OrchDaemon::logRotate() {
828825
}
829826

830827

831-
void OrchDaemon::start()
828+
void OrchDaemon::start(long heartBeatInterval)
832829
{
833830
SWSS_LOG_ENTER();
834831

@@ -849,7 +846,7 @@ void OrchDaemon::start()
849846
ret = m_select->select(&s, SELECT_TIMEOUT);
850847

851848
auto tend = std::chrono::high_resolution_clock::now();
852-
heartBeat(tend);
849+
heartBeat(tend, heartBeatInterval);
853850

854851
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(tend - tstart);
855852

@@ -926,7 +923,7 @@ void OrchDaemon::start()
926923
flush();
927924

928925
SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
929-
freezeAndHeartBeat(UINT_MAX);
926+
freezeAndHeartBeat(UINT_MAX, heartBeatInterval);
930927
}
931928
}
932929
}
@@ -1090,25 +1087,31 @@ void OrchDaemon::addOrchList(Orch *o)
10901087
m_orchList.push_back(o);
10911088
}
10921089

1093-
void OrchDaemon::heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent)
1090+
void OrchDaemon::heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent, long interval)
10941091
{
1092+
if (interval == 0)
1093+
{
1094+
// disable heart beat feature when interval is 0
1095+
return;
1096+
}
1097+
10951098
// output heart beat message to SYSLOG
10961099
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(tcurrent - m_lastHeartBeat);
1097-
if (diff.count() >= HEART_BEAT_INTERVAL_MSECS)
1100+
if (diff.count() >= interval)
10981101
{
10991102
m_lastHeartBeat = tcurrent;
11001103
// output heart beat message to supervisord with 'PROCESS_COMMUNICATION_STDOUT' event: http://supervisord.org/events.html
11011104
cout << "<!--XSUPERVISOR:BEGIN-->heartbeat<!--XSUPERVISOR:END-->" << endl;
11021105
}
11031106
}
11041107

1105-
void OrchDaemon::freezeAndHeartBeat(unsigned int duration)
1108+
void OrchDaemon::freezeAndHeartBeat(unsigned int duration, long interval)
11061109
{
11071110
while (duration > 0)
11081111
{
11091112
// Send heartbeat message to prevent Orchagent stuck alert.
11101113
auto tend = std::chrono::high_resolution_clock::now();
1111-
heartBeat(tend);
1114+
heartBeat(tend, interval);
11121115

11131116
duration--;
11141117
sleep(1);

orchagent/orchdaemon.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class OrchDaemon
6363
~OrchDaemon();
6464

6565
virtual bool init();
66-
void start();
66+
void start(long heartBeatInterval);
6767
bool warmRestoreAndSyncUp();
6868
void getTaskToSync(vector<string> &ts);
6969
bool warmRestoreValidation();
@@ -102,9 +102,9 @@ class OrchDaemon
102102

103103
void flush();
104104

105-
void heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent);
105+
void heartBeat(std::chrono::time_point<std::chrono::high_resolution_clock> tcurrent, long interval);
106106

107-
void freezeAndHeartBeat(unsigned int duration);
107+
void freezeAndHeartBeat(unsigned int duration, long interval);
108108
};
109109

110110
class FabricOrchDaemon : public OrchDaemon

tests/test_zmq.py

+34
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,37 @@ def test_vrf(self, dvs):
120120
dvs.runcmd("cp /usr/bin/orchagent.sh_vrf_ut_backup /usr/bin/orchagent.sh")
121121
dvs.stop_swss()
122122
dvs.start_swss()
123+
124+
def test_heartbeat(self, dvs):
125+
# Improve test code coverage, change orchagent to disable heartbeat
126+
dvs.runcmd("cp /usr/bin/orchagent.sh /usr/bin/orchagent.sh_hb_ut_backup")
127+
# Raw string: "\/" is not a valid Python escape sequence, so a plain literal raises an invalid-escape warning; the bytes passed to sed are unchanged.
dvs.runcmd(r"sed -i.bak 's/\/usr\/bin\/orchagent /\/usr\/bin\/orchagent -I 0 /g' /usr/bin/orchagent.sh")
128+
dvs.stop_swss()
129+
dvs.start_swss()
130+
131+
# wait orchagent start
132+
time.sleep(3)
133+
process_statue = dvs.runcmd("ps -ef")
134+
zmq_logger.debug("Process status: {}".format(process_statue))
135+
136+
# revert change
137+
dvs.runcmd("cp /usr/bin/orchagent.sh_hb_ut_backup /usr/bin/orchagent.sh")
138+
dvs.stop_swss()
139+
dvs.start_swss()
140+
141+
def test_usage(self, dvs):
142+
# Improve test code coverage, change orchagent to display usage
143+
dvs.runcmd("cp /usr/bin/orchagent.sh /usr/bin/orchagent.sh_usage_ut_backup")
144+
# Raw string: "\/" is not a valid Python escape sequence, so a plain literal raises an invalid-escape warning; the bytes passed to sed are unchanged.
dvs.runcmd(r"sed -i.bak 's/\/usr\/bin\/orchagent /\/usr\/bin\/orchagent -h /g' /usr/bin/orchagent.sh")
145+
dvs.stop_swss()
146+
dvs.start_swss()
147+
148+
# wait orchagent start
149+
time.sleep(3)
150+
process_statue = dvs.runcmd("ps -ef")
151+
zmq_logger.debug("Process status: {}".format(process_statue))
152+
153+
# revert change
154+
dvs.runcmd("cp /usr/bin/orchagent.sh_usage_ut_backup /usr/bin/orchagent.sh")
155+
dvs.stop_swss()
156+
dvs.start_swss()

0 commit comments

Comments
 (0)