Skip to content

Commit a35e23c

Browse files
authored
bgpcfgd: add support for software bfd sessions (#20981)
bgpcfgd: add support for software bfd sessions
1 parent accf5b3 commit a35e23c

File tree

8 files changed

+1150
-0
lines changed

8 files changed

+1150
-0
lines changed

dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2

+11
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,17 @@ dependent_startup_wait_for=bgpd:running
206206

207207
{% endif %}
208208

209+
{% if DEVICE_METADATA.localhost.switch_type is defined and DEVICE_METADATA.localhost.switch_type == "dpu" %}
210+
[program:bfdmon]
211+
command=/usr/local/bin/bfdmon
212+
priority=6
213+
autostart=true
214+
autorestart=true
215+
startsecs=0
216+
stdout_logfile=syslog
217+
stderr_logfile=syslog
218+
{% endif %}
219+
209220
{% if DEVICE_METADATA.localhost.docker_routing_config_mode is defined and (DEVICE_METADATA.localhost.docker_routing_config_mode == "unified" or DEVICE_METADATA.localhost.docker_routing_config_mode == "split-unified") %}
210221
[program:vtysh_b]
211222
command=/usr/bin/vtysh -b

src/sonic-bgpcfgd/bfdmon/__init__.py

Whitespace-only changes.

src/sonic-bgpcfgd/bfdmon/bfdmon.py

+141
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import json
2+
import subprocess
3+
import time
4+
import syslog
5+
from swsscommon import swsscommon
6+
from sonic_py_common.general import getstatusoutput_noshell
7+
8+
class BfdFrrMon:
    """Publish the set of FRR BFD peers whose status is 'up' to STATE_DB.

    The peer list is read with ``vtysh -c 'show bfd peers json'`` and written
    to the ``DPU_BFD_PROBE_STATE`` table as two JSON-encoded lists, one for
    IPv4 peers and one for IPv6 peers. The table is only rewritten when the
    peer sets change (or on the very first update).
    """

    def __init__(self):
        # Peer sets as last written to STATE_DB.
        self.local_v4_peers = set()
        self.local_v6_peers = set()
        # Peer sets from the most recent FRR query. Created here so that
        # update_state_db() is safe to call before any query has succeeded.
        self.frr_v4_peers = set()
        self.frr_v6_peers = set()
        self.status_table = "DPU_BFD_PROBE_STATE"
        self.db_connector = swsscommon.DBConnector("STATE_DB", 0)
        self.table = swsscommon.Table(self.db_connector, self.status_table)

        self.bfdd_running = False
        self.init_done = False
        self.MAX_RETRY_ATTEMPTS = 3

    def check_bfdd(self):
        """
        Check if bfdd is running.

        Sets ``self.bfdd_running`` to True when the process is found; the flag
        is never reset to False here.

        Return: True if bfdd process is running, False otherwise.
        """
        try:
            # pgrep exits with 0 when at least one process matched.
            # NOTE(review): "-f" matches the pattern against the full command
            # line, so any process mentioning "bfdd" counts - confirm intended.
            rc, _ = getstatusoutput_noshell(["pgrep", "-f", "bfdd"])
        except Exception as e:
            # Best effort: treat a failed probe as "not running", but leave a trace.
            syslog.syslog(syslog.LOG_WARNING,
                          "*WARNING* Failed to check bfdd process: {}".format(e))
            return False

        if not rc:
            self.bfdd_running = True
            return True

        return False

    @staticmethod
    def _parse_up_peers(output):
        """
        Extract the peers whose status is 'up' from vtysh JSON output.

        output: JSON text, expected to decode to a list of session objects
                carrying "peer" and "status" keys.
        Return: (v4_peers, v6_peers) as two sets of peer address strings. An
                address containing ':' is classified as IPv6, anything else as
                IPv4.
        Raises: json.JSONDecodeError if output is not valid JSON.
        """
        v4_peers = set()
        v6_peers = set()

        bfd_data = json.loads(output)
        for session in bfd_data or ():
            # Ignore entries that are not session objects.
            if not isinstance(session, dict):
                continue
            if session.get("status") != "up":
                continue
            peer = session.get("peer")
            if not peer:
                continue
            if ":" in peer:  # IPv6
                v6_peers.add(peer)
            else:  # IPv4
                v4_peers.add(peer)

        return v4_peers, v6_peers

    def get_bfd_sessions(self):
        """
        Get BFD session information from FRR using vtysh.
        Updates two sets: one for IPv4 peers and another for IPv6 peers whose BFD state is 'up'.
        Returns True if peer info was retrieved successfully, False otherwise.

        A non-zero exit code or empty output from vtysh fails immediately.
        Exceptions (including JSON decode errors) are retried up to
        MAX_RETRY_ATTEMPTS times with a one-second pause after each failure.
        """

        self.frr_v4_peers = set()
        self.frr_v6_peers = set()

        # Update bfdd state if it wasn't previously running
        if not self.bfdd_running:
            self.bfdd_running = self.check_bfdd()

        if not self.bfdd_running:
            syslog.syslog(syslog.LOG_WARNING, "*WARNING* bfdd not currently running")
            return False

        cmd = ['vtysh', '-c', 'show bfd peers json']
        for retry_attempt in range(1, self.MAX_RETRY_ATTEMPTS + 1):
            try:
                rc, output = getstatusoutput_noshell(cmd)
                if rc:
                    syslog.syslog(syslog.LOG_ERR, "*ERROR* Failed with rc:{} when execute: {}".format(rc, cmd))
                    return False
                if len(output) == 0:
                    syslog.syslog(syslog.LOG_WARNING, "*WARNING* output none when execute: {}".format(cmd))
                    return False

                # Publish the sets only after a complete, successful parse so
                # a failed attempt can never leak partial results into a
                # later one.
                self.frr_v4_peers, self.frr_v6_peers = self._parse_up_peers(output)
                return True
            except json.JSONDecodeError as e:
                # Log the exception and retry if within the maximum attempts
                syslog.syslog(syslog.LOG_WARNING,
                              "*WARNING* JSONDecodeError: {} when execute: {} Retry attempt: {}".format(e, cmd, retry_attempt))
            except Exception as e:
                # Log other exceptions and retry if within the maximum attempts
                syslog.syslog(syslog.LOG_WARNING,
                              "*WARNING* An unexpected error occurred: {} when execute: {} Retry attempt: {}".format(
                                  e, cmd, retry_attempt))
            time.sleep(1)

        # Log an error if the maximum retry attempts are reached
        syslog.syslog(syslog.LOG_ERR,
                      "*ERROR* Maximum retry attempts reached. Failed to execute: {}".format(cmd))
        return False

    def update_state_db(self):
        """
        Update the state DB only with changes (additions or deletions) to the peer list.

        The table is also written on the first call, even when both peer sets
        are empty, so that the entry exists after start-up.
        """
        # Any difference between the published sets and the fresh FRR sets
        # (peer added or removed) means the table must be rewritten.
        v4_changed = self.frr_v4_peers != self.local_v4_peers
        v6_changed = self.frr_v6_peers != self.local_v6_peers

        if v4_changed or v6_changed or not self.init_done:
            # Update local sets with the new data
            self.local_v4_peers = set(self.frr_v4_peers)
            self.local_v6_peers = set(self.frr_v6_peers)

            # Update Redis with the new peer sets. The lists are sorted so
            # the stored value is deterministic for a given set of peers.
            values = [
                ("v4_bfd_up_sessions", json.dumps(sorted(self.local_v4_peers))),
                ("v6_bfd_up_sessions", json.dumps(sorted(self.local_v6_peers)))
            ]
            self.table.set("", values)
            syslog.syslog(syslog.LOG_INFO,
                          "{} table in STATE_DB updated. v4_peers: {}, v6_peers: {}".format(
                              self.status_table, self.local_v4_peers, self.local_v6_peers))

            self.init_done = True
125+
126+
def main():
    """Run the bfdmon polling loop.

    Every SLEEP_TIME seconds the FRR BFD peer list is queried and, when the
    query succeeds, STATE_DB is brought up to date. The loop only ends when an
    exception (including KeyboardInterrupt) propagates out of it; the "stopped"
    message is logged on that path before the exception continues upward.
    """
    SLEEP_TIME = 2  # Wait in seconds between each iteration
    syslog.syslog(syslog.LOG_INFO, "bfdmon service started")
    bfd_mon = BfdFrrMon()

    try:
        while True:
            # Sleep for a while before checking again (adjust as necessary)
            time.sleep(SLEEP_TIME)

            if bfd_mon.get_bfd_sessions():
                bfd_mon.update_state_db()
    finally:
        # Placed in a finally block because a statement after the infinite
        # loop could never execute.
        syslog.syslog(syslog.LOG_INFO, "bfdmon service stopped")


# Script entry point: start the monitor only when executed directly.
if __name__ == "__main__":
    main()

src/sonic-bgpcfgd/bgpcfgd/main.py

+6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from .managers_rm import RouteMapMgr
2323
from .managers_device_global import DeviceGlobalCfgMgr
2424
from .managers_chassis_app_db import ChassisAppDbMgr
25+
from .managers_bfd import BfdMgr
2526
from .managers_srv6 import SRv6Mgr
2627
from .static_rt_timer import StaticRouteTimer
2728
from .runner import Runner, signal_handler
@@ -84,6 +85,11 @@ def do_work():
8485
if device_info.is_chassis():
8586
managers.append(ChassisAppDbMgr(common_objs, "CHASSIS_APP_DB", "BGP_DEVICE_GLOBAL"))
8687

88+
switch_type = device_info.get_localhost_info("switch_type")
89+
if switch_type and switch_type == "dpu":
90+
log_notice("switch type is dpu, starting bfd manager")
91+
managers.append(BfdMgr(common_objs, "STATE_DB", swsscommon.STATE_BFD_SOFTWARE_SESSION_TABLE_NAME))
92+
8793
runner = Runner(common_objs['cfg_mgr'])
8894
for mgr in managers:
8995
runner.add_manager(mgr)

0 commit comments

Comments
 (0)