Skip to content

Commit 11c2716

Browse files
authored
Improve SONiC disk checker to handle disk full case and mount overlay fs to allow remote user login. (sonic-net#3700)
Improve the SONiC disk checker to handle the disk-full case and mount an overlay fs to allow remote user login. This PR depends on the DB schema change in sonic-net#21351. What I did: Currently the disk checker only handles the read-only (RO) disk case, but when the disk has no free space, remote users also cannot log in. How I did it: Check the disk's free space and mount an overlay fs to allow TACACS login. How to verify it: Create a big file on the device so that it has no free space; logging in as a remote user should then succeed.
1 parent 13619aa commit 11c2716

File tree

2 files changed

+159
-21
lines changed

2 files changed

+159
-21
lines changed

scripts/disk_check.py

+89-20
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,40 @@
33

44
"""
55
What:
6-
There have been cases, where disk turns Read-only due to kernel bug.
7-
In Read-only state, system blocks new remote user login via TACACS.
8-
This utility is to check & make transient recovery as needed.
6+
This utility is designed to address two specific issues:
7+
1. Disk becoming read-only due to kernel bugs.
8+
2. Disk running out of space.
9+
When either of these issues occurs, the system prevents new remote user logins via TACACS.
910
1011
How:
11-
check for Read-Write permission. If Read-only, create writable overlay using tmpfs.
12+
Checks for read-write permissions and available disk space.
13+
If an issue is detected, create writable overlay using tmpfs.
1214
13-
By default "/etc" & "/home" are checked and if in Read-only state, make them Read-Write
15+
By default "/etc" & "/home" are checked and, if an issue is detected, made Read-Write
1416
using overlay on top of tmpfs.
1517
1618
Making /etc & /home writable lets new remote users log in successfully.
1719
18-
If in Read-only state or in Read-Write state with the help of tmpfs overlay,
19-
syslog ERR messages are written, to help raise alerts.
20+
Write syslog ERR messages to help raise alerts in the following cases:
21+
1. Disk in read-only state.
22+
2. Disk out of space.
23+
3. Mounted tmpfs overlay.
2024
2125
Monit may be used to invoke it periodically, to help scan & fix and
2226
report via syslog.
2327
2428
Tidbit:
25-
If you would like to test this script, you could simulate a RO disk
26-
with the following command. Reboot will revert the effect.
29+
To test this script:
30+
1. Simulate a RO disk with the following command. Reboot will revert the effect.
2731
sudo bash -c "echo u > /proc/sysrq-trigger"
32+
2. Use up all disk space by creating a big file in /var/dump/:
33+
dd if=/dev/zero of=/var/dump/sonic_dump_devicename_20241126_204132.tar bs=1G count=50
2834
2935
"""
3036

3137
import argparse
3238
import os
39+
import shutil
3340
import sys
3441
import syslog
3542
import subprocess
@@ -40,10 +47,16 @@
4047
WORK_DIR = "/run/mount/work"
4148
MOUNTS_FILE = "/proc/mounts"
4249

50+
# Threshold of free block counts: On most file systems, the block size is 4096 bytes.
51+
FREE_SPACE_THRESHOLD = 1024
52+
4353
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
4454
EVENTS_PUBLISHER_TAG = "event-disk"
4555
events_handle = None
4656

57+
DISK_RO_EVENT = "read_only"
58+
DISK_FULL_EVENT = "disk_full"
59+
4760
chk_log_level = syslog.LOG_ERR
4861

4962
def _log_msg(lvl, pfx, msg):
@@ -64,21 +77,35 @@ def log_debug(m):
6477
_log_msg(syslog.LOG_DEBUG, "Debug", m)
6578

6679

67-
def event_pub():
80+
def event_pub(event):
6881
param_dict = FieldValueMap()
69-
param_dict["fail_type"] = "read_only"
82+
param_dict["fail_type"] = event
7083
event_publish(events_handle, EVENTS_PUBLISHER_TAG, param_dict)
7184

7285

86+
def test_disk_full(dirs):
87+
for d in dirs:
88+
space = os.statvfs(d)
89+
if space.f_bavail < FREE_SPACE_THRESHOLD:
90+
log_err("{} has no free disk space".format(d))
91+
event_pub(DISK_FULL_EVENT)
92+
return True
93+
else:
94+
log_debug("{} has enough disk space".format(d))
95+
96+
return False
97+
98+
7399
def test_writable(dirs):
74100
for d in dirs:
75101
rw = os.access(d, os.W_OK)
76102
if not rw:
77103
log_err("{} is not read-write".format(d))
78-
event_pub()
104+
event_pub(DISK_RO_EVENT)
79105
return False
80106
else:
81107
log_debug("{} is Read-Write".format(d))
108+
82109
return True
83110

84111

@@ -101,7 +128,7 @@ def get_dname(path_name):
101128
return os.path.basename(os.path.normpath(path_name))
102129

103130

104-
def do_mnt(dirs):
131+
def do_mnt(dirs, overlay_prefix):
105132
if os.path.exists(UPPER_DIR):
106133
log_err("Already mounted")
107134
return 1
@@ -110,7 +137,7 @@ def do_mnt(dirs):
110137
try:
111138
os.mkdir(i)
112139
except OSError as error:
113-
log_err("Failed to create {}".format(i))
140+
log_err("Failed to create {}, error: {}".format(i, error))
114141
return 1
115142

116143
for d in dirs:
@@ -120,7 +147,7 @@ def do_mnt(dirs):
120147
os.mkdir(d_upper)
121148
os.mkdir(d_work)
122149

123-
ret = run_cmd(["mount", "-t", "overlay", "overlay_{}".format(d_name),\
150+
ret = run_cmd(["mount", "-t", "overlay", "{}_{}".format(overlay_prefix, d_name),
124151
"-o", "lowerdir={},upperdir={},workdir={}".format(d, d_upper, d_work), d])
125152
if ret:
126153
break
@@ -132,13 +159,36 @@ def do_mnt(dirs):
132159
return ret
133160

134161

135-
def is_mounted(dirs):
162+
def do_unmnt(dirs, overlay_prefix):
163+
for d in dirs:
164+
d_name = get_dname(d)
165+
166+
ret = run_cmd(["umount", "-l", "{}_{}".format(overlay_prefix, d_name)])
167+
if ret:
168+
break
169+
170+
if ret:
171+
log_err("Failed to umount {}".format(dirs))
172+
else:
173+
log_info("{} are unmounted".format(dirs))
174+
175+
for i in (UPPER_DIR, WORK_DIR):
176+
try:
177+
shutil.rmtree(i)
178+
except OSError as error:
179+
log_err("Failed to delete {}, error: {}".format(i, error))
180+
return 1
181+
182+
return ret
183+
184+
185+
def is_mounted(dirs, overlay_prefix):
136186
if not os.path.exists(UPPER_DIR):
137187
return False
138188

139189
onames = set()
140190
for d in dirs:
141-
onames.add("overlay_{}".format(get_dname(d)))
191+
onames.add("{}_{}".format(overlay_prefix, get_dname(d)))
142192

143193
with open(MOUNTS_FILE, "r") as s:
144194
for ln in s.readlines():
@@ -153,12 +203,31 @@ def do_check(skip_mount, dirs):
153203
ret = 0
154204
if not test_writable(dirs):
155205
if not skip_mount:
156-
ret = do_mnt(dirs)
206+
ret = do_mnt(dirs, "overlay")
157207

158208
# Check if mounted
159-
if (not ret) and is_mounted(dirs):
209+
if (not ret) and is_mounted(dirs, "overlay"):
160210
log_err("READ-ONLY: Mounted {} to make Read-Write".format(dirs))
161-
event_pub()
211+
event_pub(DISK_RO_EVENT)
212+
213+
if ret:
214+
# When disk mounted, disk no free space issue also been fixed.
215+
return ret
216+
217+
# Handle disk no free space case
218+
if test_disk_full(dirs):
219+
if not skip_mount:
220+
ret = do_mnt(dirs, "overlay_disk_full")
221+
222+
# Check if mounted
223+
if (not ret) and is_mounted(dirs, "overlay_disk_full"):
224+
log_err("DISK-FULL: Mounted {} to make Read-Write".format(dirs))
225+
event_pub(DISK_FULL_EVENT)
226+
227+
# Unmount when disk space issue fixed
228+
if is_mounted(dirs, "overlay_disk_full") and not test_disk_full(["/"]):
229+
log_debug("umount for disk space issue fixed")
230+
do_unmnt(dirs, "overlay_disk_full")
162231

163232
return ret
164233

tests/disk_check_test.py

+70-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import sys
23
import syslog
34
from unittest.mock import patch
@@ -131,7 +132,9 @@ def setup(self):
131132

132133
@patch("disk_check.syslog.syslog")
133134
@patch("disk_check.subprocess.run")
134-
def test_readonly(self, mock_proc, mock_log):
135+
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 4096,
136+
971520, 883302, 883302, 4096, 255)))
137+
def test_readonly(self, mock_os_statvfs, mock_proc, mock_log):
135138
global err_data, cmds, max_log_lvl
136139

137140
mock_proc.side_effect = mock_subproc_run
@@ -177,6 +180,72 @@ def test_readonly(self, mock_proc, mock_log):
177180

178181
assert max_log_lvl == syslog.LOG_ERR
179182

183+
@patch("disk_check.syslog.syslog")
184+
@patch("disk_check.subprocess.run")
185+
@patch('os.access', return_value=True)
186+
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 0,
187+
971520, 883302, 883302, 4096, 255)))
188+
def test_mount_disk_full(self, mock_os_statvfs, mock_os_access, mock_proc, mock_log):
189+
global max_log_lvl
190+
max_log_lvl = -1
191+
mock_proc.side_effect = mock_subproc_run
192+
mock_log.side_effect = report_err_msg
193+
194+
tc = {
195+
"upperdir": "/tmp",
196+
}
197+
swap_upper(tc)
198+
199+
with patch('sys.argv', ["", "-d", "/tmpx"]):
200+
disk_check.main()
201+
202+
@patch("disk_check.syslog.syslog")
203+
@patch("disk_check.subprocess.run")
204+
@patch('shutil.rmtree')
205+
@patch('os.access', return_value=True)
206+
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 4096,
207+
971520, 883302, 883302, 4096, 255)))
208+
def test_unmount_disk_full(self, mock_os_statvfs, mock_os_access, mock_rmtree, mock_proc, mock_log):
209+
global max_log_lvl
210+
max_log_lvl = -1
211+
mock_proc.side_effect = mock_subproc_run
212+
mock_log.side_effect = report_err_msg
213+
214+
tc = {
215+
"upperdir": "/tmp/tmpx",
216+
"workdir": "/tmp/tmpy"
217+
}
218+
swap_upper(tc)
219+
swap_work(tc)
220+
221+
with patch('sys.argv', ["", "-d", "/tmpx"]):
222+
disk_check.main()
223+
224+
@patch("disk_check.syslog.syslog")
225+
@patch("disk_check.subprocess.run")
226+
@patch('os.access', return_value=True)
227+
@patch('os.statvfs', return_value=os.statvfs_result((4096, 4096, 1909350, 1491513, 0,
228+
971520, 883302, 883302, 4096, 255)))
229+
def test_diskfull(self, mock_os_statvfs, mock_os_access, mock_proc, mock_log):
230+
global max_log_lvl
231+
max_log_lvl = -1
232+
mock_proc.side_effect = mock_subproc_run
233+
mock_log.side_effect = report_err_msg
234+
235+
result = disk_check.test_disk_full(["/etc"])
236+
assert result is True
237+
238+
@patch("disk_check.syslog.syslog")
239+
@patch("disk_check.subprocess.run")
240+
def test_do_unmnt(self, mock_proc, mock_log):
241+
global max_log_lvl
242+
max_log_lvl = -1
243+
mock_proc.side_effect = mock_subproc_run
244+
mock_log.side_effect = report_err_msg
245+
246+
disk_check.do_unmnt(["/etc"], "overlay_prefix")
247+
248+
180249
@classmethod
181250
def teardown_class(cls):
182251
subprocess.run("rm -rf /tmp/tmp*", shell=True) # cleanup the temporary dirs

0 commit comments

Comments
 (0)