Skip to content

Commit e06d3ed

Browse files
authored
T2-Route-Conv: Process Crash Optimization (#603)
<!-- Please make sure you've read and understood our contributing guidelines: https://github.com/Azure/SONiC/blob/gh-pages/CONTRIBUTING.md failure_prs.log skip_prs.log Make sure all your commits include a signature generated with `git commit -s` ** If this is a bug fix, make sure your description includes "fixes #xxxx", or "closes #xxxx" or "resolves #xxxx" Please provide the following information: --> #### Why I did it Fixes issue: sonic-net/sonic-buildimage#21586 ##### Work item tracking - Microsoft ADO **31196012**: #### How I did it Run the TSA-TSB service upon swss/swss0/swss1/... startup. If the service is already running, reset the TSA-TSB timer. #### How to verify it Ran the T2 process crash sonic-mgmt snappi test to verify the convergence. Before fix: ~10 seconds. After fix: <10 ms. <!-- If PR needs to be backported, then the PR must be tested against the base branch and the earliest backport release branch and provide tested image version on these two branches. For example, if the PR is requested for master, 202211 and 202012, then the requester needs to provide test results on master and 202012. --> #### Which release branch to backport (provide reason below if selected) <!-- - Note we only backport fixes to a release branch, *not* features! - Please also provide a reason for the backporting below. - e.g. - [x] 202006 --> - [ ] 201811 - [ ] 201911 - [ ] 202006 - [ ] 202012 - [ ] 202106 - [ ] 202111 - [ ] 202205 - [ ] 202211 - [ ] 202305 #### Tested branch (Please provide the tested image version) SONiC.20240532.04 <!-- - Please provide tested image version - e.g. - [x] 20201231.100 --> - [ ] <!-- image version 1 --> - [ ] <!-- image version 2 --> #### Description for the changelog <!-- Write a short (one line) summary that describes the changes in this pull request for inclusion in the changelog: --> <!-- Ensure to add label/tag for the feature raised. example - PR#2174 under sonic-utilities repo. where, Generic Config and Update feature has been labelled as GCU. 
--> #### Link to config_db schema for YANG module changes <!-- Provide a link to config_db schema for the table for which YANG model is defined Link should point to correct section on https://github.com/Azure/sonic-buildimage/blob/master/src/sonic-yang-models/doc/Configuration.md --> #### A picture of a cute animal (not mandatory but encouraged)
1 parent a45d2c1 commit e06d3ed

File tree

4 files changed

+47
-8
lines changed

4 files changed

+47
-8
lines changed

dockers/docker-fpm-frr/base_image_files/TSA

+5
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ if [ -z "$STARTED_BY_TSA_TSB_SERVICE" ]; then
3838
[[ $(/bin/systemctl show $service --property SubState --value) == "running" ]]; then
3939
echo "Stopping $service before configuring TSA"
4040
systemctl stop $service
41+
if sonic-db-cli STATE_DB HDEL "ALL_SERVICE_STATUS|tsa_tsb_service" "running" >/dev/null; then
42+
echo "Successfully removed TSA-TSB service flag."
43+
else
44+
echo "Failed to remove TSA-TSB service flag!" >&2
45+
fi
4146
fi
4247
fi
4348

dockers/docker-fpm-frr/base_image_files/TSB

+5
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ if [ -z "$STARTED_BY_TSA_TSB_SERVICE" ]; then
3737
[[ $(/bin/systemctl show $service --property SubState --value) == "running" ]]; then
3838
echo "Stopping $service before configuring TSB"
3939
systemctl stop $service
40+
if sonic-db-cli STATE_DB HDEL "ALL_SERVICE_STATUS|tsa_tsb_service" "running" >/dev/null; then
41+
echo "Successfully removed TSA-TSB service flag."
42+
else
43+
echo "Failed to remove TSA-TSB service flag!" >&2
44+
fi
4045
fi
4146
fi
4247

files/scripts/startup_tsa_tsb.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -69,19 +69,42 @@ def config_tsa():
6969
if tsa_ena == True:
7070
logger.log_info("Configuring TSA")
7171
subprocess.check_output(['TSA']).strip()
72+
logger.log_info("Setting TSA-TSB service field in STATE_DB")
73+
subprocess.check_output([
74+
'sonic-db-cli', 'STATE_DB', 'HSET', 'ALL_SERVICE_STATUS|tsa_tsb_service', 'running', 'OK'
75+
]).strip()
7276
else:
73-
if num_asics > 1:
74-
logger.log_info("Either TSA is already configured or switch sub_role is not Frontend - not configuring TSA")
77+
# Check whether the tsa_tsb service is already running; if so, only restart the timer
78+
try:
79+
startup_tsa_tsb_service_status = subprocess.check_output([
80+
'sonic-db-cli', 'STATE_DB', 'HGET', 'ALL_SERVICE_STATUS|tsa_tsb_service', 'running'
81+
]).strip().decode('utf-8') # Convert bytes to string
82+
except subprocess.CalledProcessError:
83+
startup_tsa_tsb_service_status = None # Default if the field is missing
84+
85+
if startup_tsa_tsb_service_status == 'OK':
86+
logger.log_info("TSA-TSB service is already running, just restart the timer")
87+
return True
7588
else:
76-
logger.log_info("Either TSA is already configured - not configuring TSA")
89+
if num_asics > 1:
90+
logger.log_info("Either TSA is already configured or switch sub_role is not Frontend - not configuring TSA")
91+
else:
92+
logger.log_info("Either TSA is already configured - not configuring TSA")
7793
return tsa_ena
7894

7995
def config_tsb():
8096
logger.log_info("Configuring TSB")
8197
subprocess.check_output(['TSB']).strip()
98+
99+
logger.log_info("Removing the TSA-TSB service field from STATE_DB")
100+
subprocess.check_output([
101+
'sonic-db-cli', 'STATE_DB', 'HDEL', 'ALL_SERVICE_STATUS|tsa_tsb_service', 'running'
102+
]).strip()
103+
82104
tsb_issued = True
83105
return
84106

107+
85108
def start_tsb_timer(interval):
86109
global timer
87110
logger.log_info("Starting timer with interval {} seconds to configure TSB".format(interval))

files/scripts/swss.sh

+11-5
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ DEBUGLOG="/tmp/swss-syncd-debug$DEV.log"
88
LOCKFILE="/tmp/swss-syncd-lock$DEV"
99
NAMESPACE_PREFIX="asic"
1010
ETC_SONIC_PATH="/etc/sonic/"
11-
11+
TSA_TSB_SERVICE="startup_tsa_tsb.service"
1212

1313
. /usr/local/bin/asic_status.sh
1414

@@ -109,9 +109,9 @@ function clean_up_tables()
109109

110110
# This function cleans up the chassis db table entries created ONLY by this asic
111111
# This is used to do the clean up operation when the line card / asic reboots
112-
# When the asic/lc is RE-booting, the chassis db server is supposed to be running
113-
# in the supervisor. So the clean up is done when only the chassis db connectable.
114-
# Otherwise no need to do the clean up since both the supervisor and line card may be
112+
# When the asic/lc is RE-booting, the chassis db server is supposed to be running
113+
# in the supervisor. So the clean up is done when only the chassis db connectable.
114+
# Otherwise no need to do the clean up since both the supervisor and line card may be
115115
# rebooting (the whole chassis scenario)
116116
# The clean up operation is required to delete only those entries created by
117117
# the asic that is rebooted. Entries from the following tables are deleted in the order
@@ -212,7 +212,7 @@ function clean_up_chassis_db_tables()
212212
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"
213213

214214
# Wait for some time before deleting system lag so that all the members of the
215-
# system lag will be cleared.
215+
# system lag will be cleared.
216216
# This delay is needed only if some system lag members were deleted
217217

218218
if [[ $num_lag_mem > 0 ]]; then
@@ -258,6 +258,12 @@ start_peer_and_dependent_services() {
258258
check_warm_boot
259259

260260
if [[ x"$WARM_BOOT" != x"true" ]]; then
261+
SERVICES_CONF="/usr/share/sonic/device/$PLATFORM/services.conf"
262+
if [[ -f $SERVICES_CONF ]] && grep -q "^startup_tsa_tsb.service$" $SERVICES_CONF; then
263+
echo "${SERVICE}$DEV: starting TSA-TSB service"
264+
/bin/systemctl restart $TSA_TSB_SERVICE
265+
fi
266+
261267
for peer in ${PEER}; do
262268
if [[ ! -z $DEV ]]; then
263269
/bin/systemctl start ${peer}@$DEV

0 commit comments

Comments
 (0)