Skip to content

Commit dec4570

Browse files
theasianpianist and Sumukha Tumkur Vani authored
Handle dual ToR neighbor miss scenario (sonic-net#2151)
* Handle dual ToR neighbor miss scenario (sonic-net#2137)

- When orchagent receives a neighbor update with a zero MAC:
  - If the neighbor IP is configured for a specific mux cable port in the MUX_CABLE table in CONFIG_DB, handle the neighbor normally (if active for the port, no action is needed; if standby, a tunnel route is created for the neighbor IP).
  - If the neighbor IP is not configured for a specific port, create a tunnel route for the IP to the peer switch.
  - When these neighbor IPs are eventually resolved, remove the tunnel route and handle the neighbor normally.
- When creating/initializing a mux cable object, set the internal state to standby to match the constructor behavior.
- Various formatting fixes inside test_mux.py.
- Remove references to deprecated `@pytest.yield_fixture`.
- Add dual ToR neighbor miss test cases:
  - Test cases and expected results are described in `mux_neigh_miss_tests.py`. These descriptions are used by the generic test runner `test_neighbor_miss` function to execute the test actions and verify expected results.
  - Various setup fixtures and test info fixtures were added.
  - Existing test cases were changed to use these setup fixtures for consistency.

Signed-off-by: Lawrence Lee <[email protected]>
Co-authored-by: Sumukha Tumkur Vani <[email protected]>
1 parent 9eb4422 commit dec4570

14 files changed

+773
-153
lines changed

neighsyncd/neighsync.cpp

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ NeighSync::NeighSync(RedisPipeline *pipelineAppDB, DBConnector *stateDb, DBConne
2323
m_stateNeighRestoreTable(stateDb, STATE_NEIGH_RESTORE_TABLE_NAME),
2424
m_cfgInterfaceTable(cfgDb, CFG_INTF_TABLE_NAME),
2525
m_cfgLagInterfaceTable(cfgDb, CFG_LAG_INTF_TABLE_NAME),
26-
m_cfgVlanInterfaceTable(cfgDb, CFG_VLAN_INTF_TABLE_NAME)
26+
m_cfgVlanInterfaceTable(cfgDb, CFG_VLAN_INTF_TABLE_NAME),
27+
m_cfgPeerSwitchTable(cfgDb, CFG_PEER_SWITCH_TABLE_NAME)
2728
{
2829
m_AppRestartAssist = new AppRestartAssist(pipelineAppDB, "neighsyncd", "swss", DEFAULT_NEIGHSYNC_WARMSTART_TIMER);
2930
if (m_AppRestartAssist)
@@ -108,14 +109,39 @@ void NeighSync::onMsg(int nlmsg_type, struct nl_object *obj)
108109
return;
109110
}
110111

112+
std::vector<std::string> peerSwitchKeys;
111113
bool delete_key = false;
112-
if ((nlmsg_type == RTM_DELNEIGH) || (state == NUD_INCOMPLETE) ||
113-
(state == NUD_FAILED))
114+
bool use_zero_mac = false;
115+
m_cfgPeerSwitchTable.getKeys(peerSwitchKeys);
116+
bool is_dualtor = peerSwitchKeys.size() > 0;
117+
if (is_dualtor && (state == NUD_INCOMPLETE || state == NUD_FAILED))
118+
{
119+
SWSS_LOG_INFO("Unable to resolve %s, setting zero MAC", key.c_str());
120+
use_zero_mac = true;
121+
122+
// Unresolved neighbor deletion on dual ToR devices must be handled
123+
// separately, otherwise delete_key is never set to true
124+
// and neighorch is never able to remove the neighbor
125+
if (nlmsg_type == RTM_DELNEIGH)
126+
{
127+
delete_key = true;
128+
}
129+
}
130+
else if ((nlmsg_type == RTM_DELNEIGH) ||
131+
(state == NUD_INCOMPLETE) || (state == NUD_FAILED))
114132
{
115133
delete_key = true;
116134
}
117135

118-
nl_addr2str(rtnl_neigh_get_lladdr(neigh), macStr, MAX_ADDR_SIZE);
136+
if (use_zero_mac)
137+
{
138+
std::string zero_mac = "00:00:00:00:00:00";
139+
strncpy(macStr, zero_mac.c_str(), zero_mac.length());
140+
}
141+
else
142+
{
143+
nl_addr2str(rtnl_neigh_get_lladdr(neigh), macStr, MAX_ADDR_SIZE);
144+
}
119145

120146
/* Ignore neighbor entries with Broadcast Mac - Trigger for directed broadcast */
121147
if (!delete_key && (MacAddress(macStr) == MacAddress("ff:ff:ff:ff:ff:ff")))

neighsyncd/neighsync.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class NeighSync : public NetMsg
3636
}
3737

3838
private:
39-
Table m_stateNeighRestoreTable;
39+
Table m_stateNeighRestoreTable, m_cfgPeerSwitchTable;
4040
ProducerStateTable m_neighTable;
4141
AppRestartAssist *m_AppRestartAssist;
4242
Table m_cfgVlanInterfaceTable, m_cfgLagInterfaceTable, m_cfgInterfaceTable;

orchagent/muxorch.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,37 @@ void MuxOrch::updateNeighbor(const NeighborUpdate& update)
10691069
return;
10701070
}
10711071

1072+
auto standalone_tunnel_neigh_it = standalone_tunnel_neighbors_.find(update.entry.ip_address);
1073+
// Handling zero MAC neighbor updates
1074+
if (!update.mac)
1075+
{
1076+
/* For neighbors that were previously resolvable but are now unresolvable,
1077+
* we expect such neighbor entries to be deleted prior to a zero MAC update
1078+
* arriving for that same neighbor.
1079+
*/
1080+
1081+
if (update.add)
1082+
{
1083+
if (standalone_tunnel_neigh_it == standalone_tunnel_neighbors_.end())
1084+
{
1085+
createStandaloneTunnelRoute(update.entry.ip_address);
1086+
}
1087+
/* If the MAC address in the neighbor entry is zero but the neighbor IP
1088+
* is already present in standalone_tunnel_neighbors_, assume we have already
1089+
* added a tunnel route for it and exit early
1090+
*/
1091+
return;
1092+
}
1093+
}
1094+
/* If the update operation for a neighbor contains a non-zero MAC, we must
1095+
* make sure to remove any existing tunnel routes to prevent conflicts.
1096+
* This block also covers the case of neighbor deletion.
1097+
*/
1098+
if (standalone_tunnel_neigh_it != standalone_tunnel_neighbors_.end())
1099+
{
1100+
removeStandaloneTunnelRoute(update.entry.ip_address);
1101+
}
1102+
10721103
for (auto it = mux_cable_tb_.begin(); it != mux_cable_tb_.end(); it++)
10731104
{
10741105
MuxCable* ptr = it->second.get();
@@ -1376,6 +1407,27 @@ bool MuxOrch::delOperation(const Request& request)
13761407
return true;
13771408
}
13781409

1410+
/*
 * Create a tunnel route for a neighbor IP that is not associated with a
 * specific mux cable, and remember the IP in standalone_tunnel_neighbors_.
 *
 * neighborIp: the neighbor address to route via the mux tunnel nexthop.
 *
 * The nexthop is looked up with getNextHopTunnelId(MUX_TUNNEL, mux_peer_switch_).
 * If that lookup yields SAI_NULL_OBJECT_ID, no route is created and the IP is
 * NOT recorded, so a later zero MAC update for the same IP will try again.
 */
void MuxOrch::createStandaloneTunnelRoute(IpAddress neighborIp)
1411+
{
1412+
SWSS_LOG_INFO("Creating standalone tunnel route for neighbor %s", neighborIp.to_string().c_str());
1413+
sai_object_id_t tunnel_nexthop = getNextHopTunnelId(MUX_TUNNEL, mux_peer_switch_);
1414+
// Tunnel nexthop is not available: log and bail out without touching any state.
if (tunnel_nexthop == SAI_NULL_OBJECT_ID) {
1415+
SWSS_LOG_NOTICE("%s nexthop not created yet, ignoring tunnel route creation for %s", MUX_TUNNEL, neighborIp.to_string().c_str());
1416+
return;
1417+
}
1418+
// Prefix is built from the bare IP string; presumably this yields a host
// route (/32 or /128) -- TODO confirm against IpPrefix's string constructor.
IpPrefix pfx = neighborIp.to_string();
1419+
// NOTE(review): any result of create_route is ignored here and the IP is
// recorded unconditionally below -- confirm a failed route creation cannot
// leave standalone_tunnel_neighbors_ out of sync with the installed routes.
create_route(pfx, tunnel_nexthop);
1420+
standalone_tunnel_neighbors_.insert(neighborIp);
1421+
}
1422+
1423+
/*
 * Remove the tunnel route previously created for a neighbor IP that is not
 * associated with a specific mux cable, and drop the IP from
 * standalone_tunnel_neighbors_.
 *
 * neighborIp: the neighbor address whose tunnel route should be removed.
 *
 * This function does not itself check that the IP is present in
 * standalone_tunnel_neighbors_; the visible caller (MuxOrch::updateNeighbor)
 * only invokes it after finding the IP in that set.
 */
void MuxOrch::removeStandaloneTunnelRoute(IpAddress neighborIp)
1424+
{
1425+
SWSS_LOG_INFO("Removing standalone tunnel route for neighbor %s", neighborIp.to_string().c_str());
1426+
// Same prefix construction as used when the route was created.
IpPrefix pfx = neighborIp.to_string();
1427+
// NOTE(review): any result of remove_route is ignored; the IP is erased
// from the set regardless -- confirm this is the intended best-effort.
remove_route(pfx);
1428+
standalone_tunnel_neighbors_.erase(neighborIp);
1429+
}
1430+
13791431
MuxCableOrch::MuxCableOrch(DBConnector *db, DBConnector *sdb, const std::string& tableName):
13801432
Orch2(db, tableName, request_),
13811433
app_tunnel_route_table_(db, APP_TUNNEL_ROUTE_TABLE_NAME),

orchagent/muxorch.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,13 @@ class MuxOrch : public Orch2, public Observer, public Subject
205205

206206
bool getMuxPort(const MacAddress&, const string&, string&);
207207

208+
/***
209+
* Methods for managing tunnel routes for neighbor IPs not associated
210+
* with a specific mux cable
211+
***/
212+
void createStandaloneTunnelRoute(IpAddress neighborIp);
213+
void removeStandaloneTunnelRoute(IpAddress neighborIp);
214+
208215
IpAddress mux_peer_switch_ = 0x0;
209216
sai_object_id_t mux_tunnel_id_ = SAI_NULL_OBJECT_ID;
210217

@@ -219,6 +226,7 @@ class MuxOrch : public Orch2, public Observer, public Subject
219226
FdbOrch *fdb_orch_;
220227

221228
MuxCfgRequest request_;
229+
std::set<IpAddress> standalone_tunnel_neighbors_;
222230
};
223231

224232
const request_description_t mux_cable_request_description = {

orchagent/neighorch.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -715,7 +715,16 @@ void NeighOrch::doTask(Consumer &consumer)
715715
if (m_syncdNeighbors.find(neighbor_entry) == m_syncdNeighbors.end()
716716
|| m_syncdNeighbors[neighbor_entry].mac != mac_address)
717717
{
718-
if (addNeighbor(neighbor_entry, mac_address))
718+
// only for unresolvable neighbors that are new
719+
if (!mac_address)
720+
{
721+
if (m_syncdNeighbors.find(neighbor_entry) == m_syncdNeighbors.end())
722+
{
723+
addZeroMacTunnelRoute(neighbor_entry, mac_address);
724+
}
725+
it = consumer.m_toSync.erase(it);
726+
}
727+
else if (addNeighbor(neighbor_entry, mac_address))
719728
{
720729
it = consumer.m_toSync.erase(it);
721730
}
@@ -1716,3 +1725,12 @@ void NeighOrch::updateSrv6Nexthop(const NextHopKey &nh, const sai_object_id_t &n
17161725
m_syncdNextHops.erase(nh);
17171726
}
17181727
}
1728+
/*
 * Handle a new neighbor whose MAC is zero (unresolved): forward an "add"
 * NeighborUpdate to MuxOrch as a SUBJECT_TYPE_NEIGH_CHANGE notification and
 * record the neighbor in m_syncdNeighbors.
 *
 * entry: the neighbor entry (its ip_address is what gets logged).
 * mac:   the MAC carried in the update; the visible caller in doTask only
 *        passes a zero MAC for a neighbor not yet in m_syncdNeighbors.
 */
void NeighOrch::addZeroMacTunnelRoute(const NeighborEntry& entry, const MacAddress& mac)
1729+
{
1730+
SWSS_LOG_INFO("Creating tunnel route for neighbor %s", entry.ip_address.to_string().c_str());
1731+
// NOTE(review): mux_orch is dereferenced below without a null check --
// confirm MuxOrch is always registered in gDirectory when this runs.
MuxOrch* mux_orch = gDirectory.get<MuxOrch*>();
1732+
// Third field is presumably the "add" flag that MuxOrch::updateNeighbor
// reads as update.add -- TODO confirm against the NeighborUpdate definition.
NeighborUpdate update = {entry, mac, true};
1733+
mux_orch->update(SUBJECT_TYPE_NEIGH_CHANGE, static_cast<void *>(&update));
1734+
// Record the neighbor so a repeated zero MAC update for it is not treated
// as new by doTask. The meaning of the `false` field is not visible here;
// presumably it marks the entry as not programmed in hardware -- TODO confirm.
m_syncdNeighbors[entry] = { mac, false };
1735+
}
1736+

orchagent/neighorch.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ class NeighOrch : public Orch, public Subject, public Observer
115115

116116
bool resolveNeighborEntry(const NeighborEntry &, const MacAddress &);
117117
void clearResolvedNeighborEntry(const NeighborEntry &);
118+
119+
void addZeroMacTunnelRoute(const NeighborEntry &, const MacAddress &);
118120
};
119121

120122
#endif /* SWSS_NEIGHORCH_H */

tests/conftest.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1794,15 +1794,15 @@ def update_dvs(log_path, new_dvs_env=[]):
17941794
dvs.runcmd("mv /etc/sonic/config_db.json.orig /etc/sonic/config_db.json")
17951795
dvs.ctn_restart()
17961796

1797-
@pytest.yield_fixture(scope="module")
1797+
@pytest.fixture(scope="module")
17981798
def dvs(request, manage_dvs) -> DockerVirtualSwitch:
17991799
dvs_env = getattr(request.module, "DVS_ENV", [])
18001800
name = request.config.getoption("--dvsname")
18011801
log_path = name if name else request.module.__name__
18021802

18031803
return manage_dvs(log_path, dvs_env)
18041804

1805-
@pytest.yield_fixture(scope="module")
1805+
@pytest.fixture(scope="module")
18061806
def vct(request):
18071807
vctns = request.config.getoption("--vctns")
18081808
topo = request.config.getoption("--topo")
@@ -1821,7 +1821,8 @@ def vct(request):
18211821
vct.get_logs(request.module.__name__)
18221822
vct.destroy()
18231823

1824-
@pytest.yield_fixture
1824+
1825+
@pytest.fixture
18251826
def testlog(request, dvs):
18261827
dvs.runcmd(f"logger -t pytest === start test {request.node.nodeid} ===")
18271828
yield testlog
@@ -1850,27 +1851,29 @@ def dvs_route(request, dvs) -> DVSRoute:
18501851

18511852
# FIXME: The rest of these also need to be reverted back to normal fixtures to
18521853
# appease the linter.
1853-
@pytest.yield_fixture(scope="class")
1854+
@pytest.fixture(scope="class")
18541855
def dvs_lag_manager(request, dvs):
18551856
request.cls.dvs_lag = dvs_lag.DVSLag(dvs.get_asic_db(),
18561857
dvs.get_config_db(),
18571858
dvs)
18581859

18591860

1860-
@pytest.yield_fixture(scope="class")
1861+
@pytest.fixture(scope="class")
18611862
def dvs_vlan_manager(request, dvs):
18621863
request.cls.dvs_vlan = dvs_vlan.DVSVlan(dvs.get_asic_db(),
18631864
dvs.get_config_db(),
18641865
dvs.get_state_db(),
18651866
dvs.get_counters_db(),
18661867
dvs.get_app_db())
18671868

1868-
@pytest.yield_fixture(scope="class")
1869+
1870+
@pytest.fixture(scope="class")
18691871
def dvs_port_manager(request, dvs):
18701872
request.cls.dvs_port = dvs_port.DVSPort(dvs.get_asic_db(),
18711873
dvs.get_config_db())
18721874

1873-
@pytest.yield_fixture(scope="class")
1875+
1876+
@pytest.fixture(scope="class")
18741877
def dvs_mirror_manager(request, dvs):
18751878
request.cls.dvs_mirror = dvs_mirror.DVSMirror(dvs.get_asic_db(),
18761879
dvs.get_config_db(),
@@ -1879,7 +1882,7 @@ def dvs_mirror_manager(request, dvs):
18791882
dvs.get_app_db())
18801883

18811884

1882-
@pytest.yield_fixture(scope="class")
1885+
@pytest.fixture(scope="class")
18831886
def dvs_policer_manager(request, dvs):
18841887
request.cls.dvs_policer = dvs_policer.DVSPolicer(dvs.get_asic_db(),
18851888
dvs.get_config_db())
@@ -1897,7 +1900,8 @@ def remove_dpb_config_file(dvs):
18971900
cmd = "mv /etc/sonic/config_db.json.bak /etc/sonic/config_db.json"
18981901
dvs.runcmd(cmd)
18991902

1900-
@pytest.yield_fixture(scope="module")
1903+
1904+
@pytest.fixture(scope="module")
19011905
def dpb_setup_fixture(dvs):
19021906
create_dpb_config_file(dvs)
19031907
if dvs.vct is None:

0 commit comments

Comments
 (0)