Skip to content

Commit 388c50c

Browse files
authored
[202012][warmboot] Add new preboot health check: verify db integrity (#1839)
Port of the changes from master PRs #1785 and #1828. The PR on master cannot be cherry-picked cleanly, hence this separate PR for 202012. Verify database integrity before proceeding with warm reboot or fast reboot. The integrity check validates the DBs against a JSON schema. To start with, only the presence of counters_db's COUNTERS_PORT_NAME_MAP table is verified, but this list can grow in the future. The test logic is designed to be generic: more databases, or more tables within them, can simply be added to the schema list, and the verification logic needs no change.
1 parent d73dc98 commit 388c50c

File tree

3 files changed

+114
-4
lines changed

3 files changed

+114
-4
lines changed

scripts/check_db_integrity.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This is to verify if Database has critical tables present before warmboot can proceed.
5+
If warmboot is allowed with missing critical tables, it can lead to issues in going
6+
down path or during the recovery path. This test detects such issues before proceeding.
7+
The verification procedure here uses JSON schemas to verify the DB entities.
8+
9+
In future, to verify new tables or their content, just the schema modification is needed.
10+
No modification may be needed to the integrity check logic.
11+
"""
12+
13+
import os, sys
14+
import json, jsonschema
15+
import syslog
16+
import subprocess
17+
import traceback
18+
19+
# JSON schema for the COUNTERS_DB dump: the dump must be an object that
# contains a COUNTERS_PORT_NAME_MAP entry, which itself must be an object.
_COUNTERS_DB_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "type": "object",
    "title": "Schema for COUNTERS DB's entities",
    "required": ["COUNTERS_PORT_NAME_MAP"],
    "properties": {
        "COUNTERS_PORT_NAME_MAP": {
            "$id": "#/properties/COUNTERS_PORT_NAME_MAP",
            "type": "object",
        },
    },
}

# Maps a database name to the schema its dump is validated against.
# To verify another database, add a "<DB name>": <schema> entry here.
DB_SCHEMA = {
    "COUNTERS_DB": _COUNTERS_DB_SCHEMA,
}
31+
32+
33+
def main():
    """Validate every database listed in DB_SCHEMA against its JSON schema.

    For each (db_name, schema) pair the database is dumped with
    ``sonic-db-dump -n <db_name> -y`` into ``/tmp/<db_name>.json``; the dump
    is then loaded as JSON and validated with ``jsonschema.validate``.

    Returns:
        0 if DB_SCHEMA is empty or all databases pass validation.
        1 if a dump cannot be produced, is not valid JSON, or fails
        schema validation.
    """
    if not DB_SCHEMA:
        return 0

    for db_name, schema in DB_SCHEMA.items():
        db_dump_file = "/tmp/{}.json".format(db_name)
        # Dump the database currently being checked (not a fixed one), and
        # pass an argument list so no shell is involved.
        dump_db_cmd = ["sonic-db-dump", "-n", db_name, "-y"]
        try:
            with open(db_dump_file, "w") as fp:
                p = subprocess.Popen(dump_db_cmd, text=True,
                                     stdout=fp, stderr=subprocess.PIPE)
                (_, err) = p.communicate()
            rc = p.returncode
        except OSError as os_err:
            # Dump command missing or dump file not writable.
            rc, err = -1, str(os_err)

        if rc != 0:
            msg = "Failed to dump db {}. Return code: {} with err: {}".format(db_name, rc, err)
            syslog.syslog(syslog.LOG_ERR, msg)
            # Flush explicitly: the script's entry point exits via os._exit(),
            # which does not flush stdio buffers.
            print(msg, flush=True)
            # Without a fresh dump there is nothing trustworthy to validate.
            return 1

        try:
            with open(db_dump_file) as fp:
                db_dump_data = json.load(fp)
        except ValueError as err:
            syslog.syslog(syslog.LOG_ERR, "DB json file is not a valid json file. " +
                          "Error: {}".format(str(err)))
            return 1

        # What: Validate if critical tables and entries are present in DB.
        # Why: This is needed to avoid warmbooting with a bad DB; which can
        #      potentially trigger failures in the reboot recovery path.
        # How: Validate DB against a schema which defines required tables.
        try:
            jsonschema.validate(instance=db_dump_data, schema=schema)
        except jsonschema.exceptions.ValidationError as err:
            syslog.syslog(syslog.LOG_ERR, "Database is missing tables/entries needed for reboot procedure. " +
                          "DB integrity check failed with:\n{}".format(str(err.message)))
            return 1

    # Reached only after every database in DB_SCHEMA has been validated.
    syslog.syslog(syslog.LOG_DEBUG, "Database integrity checks passed.")
    return 0
67+
68+
69+
if __name__ == '__main__':
    # Process exit status:
    #   0 - main() returned success
    #   1 - main() returned failure, or the run was interrupted (SIGINT)
    #   2 - an unexpected exception escaped main()
    res = 0
    try:
        res = main()
    except KeyboardInterrupt:
        syslog.syslog(syslog.LOG_NOTICE, "SIGINT received. Quitting")
        res = 1
    except Exception as exc:
        failure_msg = "Got an exception %s: Traceback: %s" % (str(exc), traceback.format_exc())
        syslog.syslog(syslog.LOG_ERR, failure_msg)
        res = 2
    finally:
        syslog.closelog()
        try:
            sys.exit(res)
        except SystemExit:
            # Terminate immediately with the computed status.
            os._exit(res)

scripts/fast-reboot

+28-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
1010
VERBOSE=no
1111
FORCE=no
1212
IGNORE_ASIC=no
13+
IGNORE_DB_CHECK=no
1314
STRICT=no
1415
REBOOT_METHOD="/sbin/kexec -e"
1516
ASSISTANT_IP_LIST=""
@@ -36,6 +37,7 @@ EXIT_ORCHAGENT_SHUTDOWN=10
3637
EXIT_SYNCD_SHUTDOWN=11
3738
EXIT_FAST_REBOOT_DUMP_FAILURE=12
3839
EXIT_FILTER_FDB_ENTRIES_FAILURE=13
40+
EXIT_DB_INTEGRITY_FAILURE=15
3941
EXIT_NO_CONTROL_PLANE_ASSISTANT=20
4042
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
4143

@@ -57,8 +59,9 @@ function showHelpAndExit()
5759
echo "Usage: ${REBOOT_SCRIPT_NAME} [options]"
5860
echo " -h,-? : get this help"
5961
echo " -v : turn on verbose"
60-
echo " -f : force execution"
61-
echo " -i : ignore MD5-checksum-verification of ASIC configuration files"
62+
echo " -f : force execution - ignore Orchagent RESTARTCHECK failure"
63+
echo " -i : force execution - ignore ASIC MD5-checksum-verification"
64+
echo " -d : force execution - ignore database integrity check"
6265
echo " -r : reboot with /sbin/reboot"
6366
echo " -k : reboot with /sbin/kexec -e [default]"
6467
echo " -x : execute script with -x flag"
@@ -72,7 +75,7 @@ function showHelpAndExit()
7275

7376
function parseOptions()
7477
{
75-
while getopts "vfih?rkxc:s" opt; do
78+
while getopts "vfidh?rkxc:s" opt; do
7679
case ${opt} in
7780
h|\? )
7881
showHelpAndExit
@@ -86,6 +89,9 @@ function parseOptions()
8689
i )
8790
IGNORE_ASIC=yes
8891
;;
92+
d )
93+
IGNORE_DB_CHECK=yes
94+
;;
8995
r )
9096
REBOOT_METHOD="/sbin/reboot"
9197
;;
@@ -325,6 +331,23 @@ function check_docker_exec()
325331
done
326332
}
327333
334+
# Run the database integrity check before a warm-reboot or fastfast-reboot.
# A non-zero exit code from check_db_integrity.py aborts the reboot with
# EXIT_DB_INTEGRITY_FAILURE unless IGNORE_DB_CHECK is "yes" (the -d option).
# Other reboot types skip the check entirely.
function check_db_integrity()
{
    case "${REBOOT_TYPE}" in
        warm-reboot|fastfast-reboot)
            CHECK_DB_INTEGRITY=0
            # Capture the checker's exit code without tripping on its failure
            /usr/local/bin/check_db_integrity.py || CHECK_DB_INTEGRITY=$?
            if [[ ${CHECK_DB_INTEGRITY} -ne 0 ]]; then
                if [[ "${IGNORE_DB_CHECK}" != "yes" ]]; then
                    error "Failed to validate DB's integrity. Exit code: ${CHECK_DB_INTEGRITY}. Use '-d' option to force ignore this check."
                    exit ${EXIT_DB_INTEGRITY_FAILURE}
                fi
                debug "Ignoring Database integrity checks..."
            fi
            ;;
    esac
}
350+
328351
function reboot_pre_check()
329352
{
330353
check_docker_exec
@@ -335,6 +358,8 @@ function reboot_pre_check()
335358
fi
336359
rm ${filename}
337360
361+
check_db_integrity
362+
338363
# Make sure /host has enough space for warm reboot temp files
339364
avail=$(df -k /host | tail -1 | awk '{ print $4 }')
340365
if [[ ${avail} -lt ${MIN_HD_SPACE_NEEDED} ]]; then

setup.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@
123123
'scripts/watermarkcfg',
124124
'scripts/sonic-kdump-config',
125125
'scripts/centralize_database',
126-
'scripts/null_route_helper'
126+
'scripts/null_route_helper',
127+
'scripts/check_db_integrity.py'
127128
],
128129
entry_points={
129130
'console_scripts': [

0 commit comments

Comments
 (0)