Skip to content

Commit 62a09a0

Browse files
authored
[sai_failure_dump]Invoking dump during SAI failure (#2644) (#2661)
* [sai_failure_dump]Invoking dump during SAI failure * Added logic to invoke SAI failure dump during any SAI programming failure before invoking abort.
1 parent 076f63e commit 62a09a0

File tree

7 files changed

+208
-14
lines changed

7 files changed

+208
-14
lines changed

orchagent/main.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ void syncd_apply_view()
126126
if (status != SAI_STATUS_SUCCESS)
127127
{
128128
SWSS_LOG_ERROR("Failed to notify syncd APPLY_VIEW %d", status);
129-
exit(EXIT_FAILURE);
129+
handleSaiFailure(true);
130130
}
131131
}
132132

@@ -603,7 +603,7 @@ int main(int argc, char **argv)
603603
if (status != SAI_STATUS_SUCCESS)
604604
{
605605
SWSS_LOG_ERROR("Failed to create a switch, rv:%d", status);
606-
exit(EXIT_FAILURE);
606+
handleSaiFailure(true);
607607
}
608608
SWSS_LOG_NOTICE("Create a switch, id:%" PRIu64, gSwitchId);
609609

@@ -634,7 +634,7 @@ int main(int argc, char **argv)
634634
if (status != SAI_STATUS_SUCCESS)
635635
{
636636
SWSS_LOG_ERROR("Failed to get MAC address from switch, rv:%d", status);
637-
exit(EXIT_FAILURE);
637+
handleSaiFailure(true);
638638
}
639639
else
640640
{
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
649649
if (status != SAI_STATUS_SUCCESS)
650650
{
651651
SWSS_LOG_ERROR("Fail to get switch virtual router ID %d", status);
652-
exit(EXIT_FAILURE);
652+
handleSaiFailure(true);
653653
}
654654

655655
gVirtualRouterId = attr.value.oid;
@@ -691,7 +691,7 @@ int main(int argc, char **argv)
691691
if (status != SAI_STATUS_SUCCESS)
692692
{
693693
SWSS_LOG_ERROR("Failed to create underlay router interface %d", status);
694-
exit(EXIT_FAILURE);
694+
handleSaiFailure(true);
695695
}
696696

697697
SWSS_LOG_NOTICE("Created underlay router interface ID %" PRIx64, gUnderlayIfId);

orchagent/orchdaemon.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,7 @@ void OrchDaemon::flush()
670670
if (status != SAI_STATUS_SUCCESS)
671671
{
672672
SWSS_LOG_ERROR("Failed to flush redis pipeline %d", status);
673-
abort();
673+
handleSaiFailure(true);
674674
}
675675
}
676676

orchagent/saihelper.cpp

+37-7
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,8 @@ task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, vo
490490
default:
491491
SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s",
492492
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
493-
abort();
493+
handleSaiFailure(true);
494+
break;
494495
}
495496
break;
496497
case SAI_API_HOSTIF:
@@ -508,8 +509,10 @@ task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, vo
508509
default:
509510
SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s",
510511
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
511-
abort();
512+
handleSaiFailure(true);
513+
break;
512514
}
515+
break;
513516
default:
514517
switch (status)
515518
{
@@ -519,7 +522,8 @@ task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, vo
519522
default:
520523
SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s",
521524
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
522-
abort();
525+
handleSaiFailure(true);
526+
break;
523527
}
524528
}
525529
return task_need_retry;
@@ -560,8 +564,10 @@ task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void
560564
default:
561565
SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s",
562566
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
563-
abort();
567+
handleSaiFailure(true);
568+
break;
564569
}
570+
break;
565571
case SAI_API_TUNNEL:
566572
switch (status)
567573
{
@@ -572,12 +578,15 @@ task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void
572578
default:
573579
SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s",
574580
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
575-
abort();
581+
handleSaiFailure(true);
582+
break;
576583
}
584+
break;
577585
default:
578586
SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s",
579587
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
580-
abort();
588+
handleSaiFailure(true);
589+
break;
581590
}
582591

583592
return task_need_retry;
@@ -605,7 +614,8 @@ task_process_status handleSaiRemoveStatus(sai_api_t api, sai_status_t status, vo
605614
default:
606615
SWSS_LOG_ERROR("Encountered failure in remove operation, exiting orchagent, SAI API: %s, status: %s",
607616
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
608-
abort();
617+
handleSaiFailure(true);
618+
break;
609619
}
610620
return task_need_retry;
611621
}
@@ -657,3 +667,23 @@ bool parseHandleSaiStatusFailure(task_process_status status)
657667
}
658668
return true;
659669
}
670+
671+
/*
 * Common handler for an unrecoverable SAI programming failure.
 *
 * Sets the SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD switch attribute to
 * SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP on gSwitchId (presumably this makes
 * syncd collect a SAI failure dump - confirm against sairedis). The request
 * is best-effort: if it fails, the error is only logged.
 *
 * abort_on_failure: when true, abort() is called after the dump request,
 *                   whether or not the request succeeded; when false the
 *                   function returns to the caller.
 */
672+
void handleSaiFailure(bool abort_on_failure)
673+
{
674+
SWSS_LOG_ENTER();
675+
676+
sai_attribute_t attr;
677+
678+
// Only the id and the s32 value are filled in; nothing else of attr is set here.
attr.id = SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD;
679+
attr.value.s32 = SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP;
680+
sai_status_t status = sai_switch_api->set_switch_attribute(gSwitchId, &attr);
681+
if (status != SAI_STATUS_SUCCESS)
682+
{
683+
// Log only: a failed dump request must not prevent the abort below.
SWSS_LOG_ERROR("Failed to take sai failure dump %d", status);
684+
}
685+
if (abort_on_failure)
686+
{
687+
abort();
688+
}
689+
}

orchagent/saihelper.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void
1818
task_process_status handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context = nullptr);
1919
task_process_status handleSaiGetStatus(sai_api_t api, sai_status_t status, void *context = nullptr);
2020
bool parseHandleSaiStatusFailure(task_process_status status);
21-
21+
void handleSaiFailure(bool abort_on_failure);

tests/mock_tests/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ tests_SOURCES = aclorch_ut.cpp \
4646
swssnet_ut.cpp \
4747
flowcounterrouteorch_ut.cpp \
4848
orchdaemon_ut.cpp \
49+
test_failure_handling.cpp \
4950
$(top_srcdir)/lib/gearboxutils.cpp \
5051
$(top_srcdir)/lib/subintf.cpp \
5152
$(top_srcdir)/orchagent/orchdaemon.cpp \

tests/mock_tests/portsorch_ut.cpp

+81
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "notifier.h"
1010
#define private public
1111
#include "pfcactionhandler.h"
12+
#include <sys/mman.h>
1213
#undef private
1314

1415
#include <sstream>
@@ -21,6 +22,8 @@ namespace portsorch_test
2122

2223
sai_port_api_t ut_sai_port_api;
2324
sai_port_api_t *pold_sai_port_api;
25+
sai_switch_api_t ut_sai_switch_api;
26+
sai_switch_api_t *pold_sai_switch_api;
2427

2528
bool not_support_fetching_fec;
2629
vector<sai_port_fec_mode_t> mock_port_fec_modes = {SAI_PORT_FEC_MODE_RS, SAI_PORT_FEC_MODE_FC};
@@ -66,9 +69,28 @@ namespace portsorch_test
6669
_sai_set_port_fec_count++;
6770
_sai_port_fec_mode = attr[0].value.s32;
6871
}
72+
else if (attr[0].id == SAI_PORT_ATTR_AUTO_NEG_MODE)
73+
{
74+
/* Simulating failure case */
75+
return SAI_STATUS_FAILURE;
76+
}
6977
return pold_sai_port_api->set_port_attribute(port_id, attr);
7078
}
7179

80+
uint32_t *_sai_syncd_notifications_count;
81+
int32_t *_sai_syncd_notification_event;
82+
/*
 * Test stub for sai_switch_api->set_switch_attribute.
 *
 * When the attribute is SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD it increments
 * *_sai_syncd_notifications_count and stores the requested s32 value in
 * *_sai_syncd_notification_event; both pointers must already be mapped by
 * the test. Every call is then forwarded to the original implementation
 * saved in pold_sai_switch_api, and its status is returned.
 */
sai_status_t _ut_stub_sai_set_switch_attribute(
83+
_In_ sai_object_id_t switch_id,
84+
_In_ const sai_attribute_t *attr)
85+
{
86+
if (attr[0].id == SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD)
87+
{
88+
// Compound assignment "+=": the counter must accumulate, so that an
// unexpected second notification is visible to the test.
*_sai_syncd_notifications_count += 1;
89+
*_sai_syncd_notification_event = attr[0].value.s32;
90+
}
91+
return pold_sai_switch_api->set_switch_attribute(switch_id, attr);
92+
}
93+
7294
void _hook_sai_port_api()
7395
{
7496
ut_sai_port_api = *sai_port_api;
@@ -83,6 +105,19 @@ namespace portsorch_test
83105
sai_port_api = pold_sai_port_api;
84106
}
85107

108+
/*
 * Installs the switch API test stub: copies the current switch API table
 * into ut_sai_switch_api, remembers the original table pointer in
 * pold_sai_switch_api, replaces set_switch_attribute in the copy with
 * _ut_stub_sai_set_switch_attribute and points sai_switch_api at the copy.
 */
void _hook_sai_switch_api()
109+
{
110+
ut_sai_switch_api = *sai_switch_api;
111+
pold_sai_switch_api = sai_switch_api;
112+
ut_sai_switch_api.set_switch_attribute = _ut_stub_sai_set_switch_attribute;
113+
sai_switch_api = &ut_sai_switch_api;
114+
}
115+
116+
/*
 * Restores sai_switch_api to the table pointer saved in
 * pold_sai_switch_api by _hook_sai_switch_api().
 */
void _unhook_sai_switch_api()
117+
{
118+
sai_switch_api = pold_sai_switch_api;
119+
}
120+
86121
sai_queue_api_t ut_sai_queue_api;
87122
sai_queue_api_t *pold_sai_queue_api;
88123
int _sai_set_queue_attr_count = 0;
@@ -473,6 +508,52 @@ namespace portsorch_test
473508
_unhook_sai_port_api();
474509
}
475510

511+
/*
 * Verifies that a failing port attribute set ends in process death and that
 * a SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP notification was sent first.
 * The port stub in this file returns SAI_STATUS_FAILURE for
 * SAI_PORT_ATTR_AUTO_NEG_MODE, so enabling autoneg on a port is used as the
 * failure trigger.
 *
 * NOTE(review): the ASSERT_* macros return from the test on failure, so the
 * _unhook_* calls at the end are skipped in that case and the stubs stay
 * installed - confirm whether later tests depend on the real APIs.
 */
TEST_F(PortsOrchTest, PortTestSAIFailureHandling)
512+
{
513+
_hook_sai_port_api();
514+
_hook_sai_switch_api();
515+
Table portTable = Table(m_app_db.get(), APP_PORT_TABLE_NAME);
516+
std::deque<KeyOpFieldsValuesTuple> entries;
517+
518+
not_support_fetching_fec = false;
519+
// Get SAI default ports to populate DB
520+
auto ports = ut_helper::getInitialSaiPorts();
521+
522+
for (const auto &it : ports)
523+
{
524+
portTable.set(it.first, it.second);
525+
}
526+
527+
// Set PortConfigDone
528+
portTable.set("PortConfigDone", { { "count", to_string(ports.size()) } });
529+
530+
// refill consumer
531+
gPortsOrch->addExistingData(&portTable);
532+
533+
// Apply configuration :
534+
// create ports
535+
static_cast<Orch *>(gPortsOrch)->doTask();
536+
537+
// The counters live in MAP_SHARED | MAP_ANONYMOUS memory: GoogleTest runs the
// ASSERT_DEATH statement in a child process, and shared mappings presumably
// are what lets the parent observe what the stub wrote there.
// NOTE(review): the mmap() results are not compared with MAP_FAILED and the
// mappings are never munmap()ed.
_sai_syncd_notifications_count = (uint32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
538+
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
539+
_sai_syncd_notification_event = (int32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
540+
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
541+
*_sai_syncd_notifications_count = 0;
542+
543+
// Queue an autoneg change for Ethernet0; the stubbed port API rejects it.
entries.push_back({"Ethernet0", "SET",
544+
{
545+
{"autoneg", "on"}
546+
}});
547+
auto consumer = dynamic_cast<Consumer *>(gPortsOrch->getExecutor(APP_PORT_TABLE_NAME));
548+
consumer->addToSync(entries);
549+
// The empty pattern accepts any stderr output; only the death itself is checked.
ASSERT_DEATH({static_cast<Orch *>(gPortsOrch)->doTask();}, "");
550+
551+
// Exactly one dump request is expected before the process died.
ASSERT_EQ(*_sai_syncd_notifications_count, 1);
552+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
553+
_unhook_sai_port_api();
554+
_unhook_sai_switch_api();
555+
}
556+
476557
TEST_F(PortsOrchTest, PortReadinessColdBoot)
477558
{
478559
Table portTable = Table(m_app_db.get(), APP_PORT_TABLE_NAME);
tests/mock_tests/test_failure_handling.cpp

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#include "saihelper.h"
2+
#include "ut_helper.h"
3+
#include <sys/mman.h>
4+
5+
extern sai_switch_api_t *sai_switch_api;
6+
7+
namespace saifailure_test
8+
{
9+
// Empty GoogleTest fixture; it only gives the TEST_F cases in this file a suite name.
struct SaiFailureTest : public ::testing::Test
10+
{
11+
};
12+
uint32_t *_sai_syncd_notifications_count;
13+
int32_t *_sai_syncd_notification_event;
14+
sai_switch_api_t *pold_sai_switch_api;
15+
sai_switch_api_t ut_sai_switch_api;
16+
17+
/*
 * Test stub for sai_switch_api->set_switch_attribute.
 *
 * For SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD it increments
 * *_sai_syncd_notifications_count and records the requested s32 value in
 * *_sai_syncd_notification_event; both pointers must already be mapped by
 * the test. Every call is then forwarded to the original implementation
 * saved in pold_sai_switch_api, and its status is returned.
 */
sai_status_t _ut_stub_sai_set_switch_attribute(
18+
_In_ sai_object_id_t switch_id,
19+
_In_ const sai_attribute_t *attr)
20+
{
21+
if (attr[0].id == SAI_REDIS_SWITCH_ATTR_NOTIFY_SYNCD)
22+
{
23+
*_sai_syncd_notifications_count = *_sai_syncd_notifications_count + 1;
24+
*_sai_syncd_notification_event = attr[0].value.s32;
25+
}
26+
return pold_sai_switch_api->set_switch_attribute(switch_id, attr);
27+
}
28+
29+
/*
 * Installs the switch API test stub: copies the current switch API table
 * into ut_sai_switch_api, remembers the original table pointer in
 * pold_sai_switch_api, replaces set_switch_attribute in the copy with
 * _ut_stub_sai_set_switch_attribute and points sai_switch_api at the copy.
 */
void _hook_sai_switch_api()
30+
{
31+
ut_sai_switch_api = *sai_switch_api;
32+
pold_sai_switch_api = sai_switch_api;
33+
ut_sai_switch_api.set_switch_attribute = _ut_stub_sai_set_switch_attribute;
34+
sai_switch_api = &ut_sai_switch_api;
35+
}
36+
37+
/*
 * Restores sai_switch_api to the table pointer saved in
 * pold_sai_switch_api by _hook_sai_switch_api().
 */
void _unhook_sai_switch_api()
37+
{
38+
sai_switch_api = pold_sai_switch_api;
39+
}
41+
42+
/*
 * Checks that each handleSai{Create,Set,Remove}Status call below kills the
 * process and that one SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP notification is
 * recorded by the switch API stub per call.
 *
 * NOTE(review): the ASSERT_* macros return from the test on failure, so
 * _unhook_sai_switch_api() at the end is skipped in that case.
 */
TEST_F(SaiFailureTest, handleSaiFailure)
43+
{
44+
_hook_sai_switch_api();
45+
// The counters live in MAP_SHARED | MAP_ANONYMOUS memory: GoogleTest runs each
// ASSERT_DEATH statement in a child process, and shared mappings presumably
// are what lets the parent observe what the stub wrote there.
// NOTE(review): the mmap() results are not compared with MAP_FAILED and the
// mappings are never munmap()ed.
_sai_syncd_notifications_count = (uint32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
46+
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
47+
_sai_syncd_notification_event = (int32_t*)mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
48+
MAP_SHARED | MAP_ANONYMOUS, -1, 0);
49+
*_sai_syncd_notifications_count = 0;
50+
// Local mirror of the shared counter; pre-incremented before every comparison.
uint32_t notif_count = *_sai_syncd_notifications_count;
51+
52+
// Create-status handler: FDB, HOSTIF and PORT cases.
ASSERT_DEATH({handleSaiCreateStatus(SAI_API_FDB, SAI_STATUS_FAILURE);}, "");
53+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
54+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
55+
56+
ASSERT_DEATH({handleSaiCreateStatus(SAI_API_HOSTIF, SAI_STATUS_INVALID_PARAMETER);}, "");
57+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
58+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
59+
60+
ASSERT_DEATH({handleSaiCreateStatus(SAI_API_PORT, SAI_STATUS_FAILURE);}, "");
61+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
62+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
63+
64+
// Set-status handler: HOSTIF, PORT and TUNNEL cases.
ASSERT_DEATH({handleSaiSetStatus(SAI_API_HOSTIF, SAI_STATUS_FAILURE);}, "");
65+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
66+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
67+
68+
ASSERT_DEATH({handleSaiSetStatus(SAI_API_PORT, SAI_STATUS_FAILURE);}, "");
69+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
70+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
71+
72+
ASSERT_DEATH({handleSaiSetStatus(SAI_API_TUNNEL, SAI_STATUS_FAILURE);}, "");
73+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
74+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
75+
76+
// Remove-status handler.
ASSERT_DEATH({handleSaiRemoveStatus(SAI_API_LAG, SAI_STATUS_FAILURE);}, "");
77+
ASSERT_EQ(*_sai_syncd_notifications_count, ++notif_count);
78+
ASSERT_EQ(*_sai_syncd_notification_event, SAI_REDIS_NOTIFY_SYNCD_INVOKE_DUMP);
79+
80+
_unhook_sai_switch_api();
81+
}
82+
}

0 commit comments

Comments
 (0)