3
3
4
4
"""
5
5
What:
6
- There have been cases, where disk turns Read-only due to kernel bug.
7
- In Read-only state, system blocks new remote user login via TACACS.
8
- This utility is to check & make transient recovery as needed.
6
+ This utility is designed to address two specific issues:
7
+ 1. Disk becoming read-only due to kernel bugs.
8
+ 2. Disk running out of space.
9
+ When either of these issues occurs, the system prevents new remote user logins via TACACS.
9
10
10
11
How:
11
- check for Read-Write permission. If Read-only, create writable overlay using tmpfs.
12
+ Checks for read-write permissions and available disk space.
13
+ If an issue is detected, creates a writable overlay using tmpfs.
12
14
13
- By default "/etc" & "/home" are checked and if in Read-only state , make them Read-Write
15
+ By default "/etc" & "/home" are checked and, if an issue is detected, they are made Read-Write
14
16
using overlay on top of tmpfs.
15
17
16
18
Making /etc & /home writable lets new remote users log in successfully.
17
19
18
- If in Read-only state or in Read-Write state with the help of tmpfs overlay,
19
- syslog ERR messages are written, to help raise alerts.
20
+ Write syslog ERR messages to help raise alerts in the following cases:
21
+ 1. Disk in read-only state.
22
+ 2. Disk out of space.
23
+ 3. Mounted tmpfs overlay.
20
24
21
25
Monit may be used to invoke it periodically, to help scan & fix and
22
26
report via syslog.
23
27
24
28
Tidbit:
25
- If you would like to test this script, you could simulate a RO disk
26
- with the following command. Reboot will revert the effect.
29
+ To test this script:
30
+ 1. Simulate a RO disk with the following command. Reboot will revert the effect.
27
31
sudo bash -c "echo u > /proc/sysrq-trigger"
32
+ 2. Use up all disk space by creating a big file in /var/dump/:
33
+ dd if=/dev/zero of=/var/dump/sonic_dump_devicename_20241126_204132.tar bs=1G count=50
28
34
29
35
"""
30
36
31
37
import argparse
32
38
import os
39
+ import shutil
33
40
import sys
34
41
import syslog
35
42
import subprocess
40
47
WORK_DIR = "/run/mount/work"
41
48
MOUNTS_FILE = "/proc/mounts"
42
49
50
+ # Minimum number of free blocks (statvfs f_bavail) a checked directory must have. On most file systems the block size is 4096 bytes, so 1024 blocks is about 4 MiB.
51
+ FREE_SPACE_THRESHOLD = 1024
52
+
43
53
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
44
54
EVENTS_PUBLISHER_TAG = "event-disk"
45
55
events_handle = None
46
56
57
+ DISK_RO_EVENT = "read_only"
58
+ DISK_FULL_EVENT = "disk_full"
59
+
47
60
chk_log_level = syslog .LOG_ERR
48
61
49
62
def _log_msg (lvl , pfx , msg ):
@@ -64,21 +77,35 @@ def log_debug(m):
64
77
_log_msg (syslog .LOG_DEBUG , "Debug" , m )
65
78
66
79
67
def event_pub(event):
    """
    Publish a disk event through the events handle.

    event: value stored under the "fail_type" key of the published
           parameters (callers in this file pass DISK_RO_EVENT or
           DISK_FULL_EVENT).
    """
    payload = FieldValueMap()
    payload["fail_type"] = event
    event_publish(events_handle, EVENTS_PUBLISHER_TAG, payload)
71
84
72
85
86
def test_disk_full(dirs):
    """
    Report whether any of the given directories is short on free space.

    Directories are examined in order with os.statvfs(). The first one
    whose f_bavail block count is below FREE_SPACE_THRESHOLD is logged as
    an error, a DISK_FULL_EVENT is published, and True is returned without
    examining the remaining directories.

    Returns False when every directory passes (or dirs is empty).
    """
    for path in dirs:
        free_blocks = os.statvfs(path).f_bavail
        if free_blocks >= FREE_SPACE_THRESHOLD:
            log_debug("{} has enough disk space".format(path))
            continue

        # First directory below the threshold: report it and stop.
        log_err("{} has no free disk space".format(path))
        event_pub(DISK_FULL_EVENT)
        return True

    return False
97
+
98
+
73
99
def test_writable(dirs):
    """
    Report whether every given directory is writable.

    Each directory is probed in order with os.access(path, os.W_OK). On
    the first one that is not writable, an error is logged, a
    DISK_RO_EVENT is published and False is returned without probing the
    remaining directories.

    Returns True when every directory passes (or dirs is empty).
    """
    for path in dirs:
        if os.access(path, os.W_OK):
            log_debug("{} is Read-Write".format(path))
            continue

        # First non-writable directory: report it and stop.
        log_err("{} is not read-write".format(path))
        event_pub(DISK_RO_EVENT)
        return False

    return True
83
110
84
111
@@ -101,7 +128,7 @@ def get_dname(path_name):
101
128
return os .path .basename (os .path .normpath (path_name ))
102
129
103
130
104
- def do_mnt (dirs ):
131
+ def do_mnt (dirs , overlay_prefix ):
105
132
if os .path .exists (UPPER_DIR ):
106
133
log_err ("Already mounted" )
107
134
return 1
@@ -110,7 +137,7 @@ def do_mnt(dirs):
110
137
try :
111
138
os .mkdir (i )
112
139
except OSError as error :
113
- log_err ("Failed to create {}" .format (i ))
140
+ log_err ("Failed to create {}, error: {} " .format (i , error ))
114
141
return 1
115
142
116
143
for d in dirs :
@@ -120,7 +147,7 @@ def do_mnt(dirs):
120
147
os .mkdir (d_upper )
121
148
os .mkdir (d_work )
122
149
123
- ret = run_cmd (["mount" , "-t" , "overlay" , "overlay_{} " .format (d_name ),\
150
+ ret = run_cmd (["mount" , "-t" , "overlay" , "{}_{} " .format (overlay_prefix , d_name ),
124
151
"-o" , "lowerdir={},upperdir={},workdir={}" .format (d , d_upper , d_work ), d ])
125
152
if ret :
126
153
break
@@ -132,13 +159,36 @@ def do_mnt(dirs):
132
159
return ret
133
160
134
161
135
- def is_mounted (dirs ):
162
def do_unmnt(dirs, overlay_prefix):
    """
    Lazily unmount the overlays of the given directories and clean up.

    For each directory, runs "umount -l" on the overlay named
    "<overlay_prefix>_<basename of the directory>", stopping at the first
    failure. UPPER_DIR and WORK_DIR are removed only when every unmount
    succeeded.

    Returns 0 on success, the truthy run_cmd() result if an unmount
    failed, or 1 if UPPER_DIR / WORK_DIR could not be deleted.
    """
    # Start from success so an empty dirs list does not leave ret unbound.
    ret = 0
    for d in dirs:
        d_name = get_dname(d)

        ret = run_cmd(["umount", "-l", "{}_{}".format(overlay_prefix, d_name)])
        if ret:
            break

    if ret:
        log_err("Failed to umount {}".format(dirs))
        # An overlay may still be mounted on top of UPPER_DIR / WORK_DIR.
        # Deleting them now would remove the backing directories of a live
        # overlay mount, so leave them in place and report the failure.
        return ret

    log_info("{} are unmounted".format(dirs))

    for i in (UPPER_DIR, WORK_DIR):
        try:
            shutil.rmtree(i)
        except OSError as error:
            log_err("Failed to delete {}, error: {}".format(i, error))
            return 1

    return 0
183
+
184
+
185
+ def is_mounted (dirs , overlay_prefix ):
136
186
if not os .path .exists (UPPER_DIR ):
137
187
return False
138
188
139
189
onames = set ()
140
190
for d in dirs :
141
- onames .add ("overlay_{} " .format (get_dname (d )))
191
+ onames .add ("{}_{} " .format (overlay_prefix , get_dname (d )))
142
192
143
193
with open (MOUNTS_FILE , "r" ) as s :
144
194
for ln in s .readlines ():
@@ -153,12 +203,31 @@ def do_check(skip_mount, dirs):
153
203
ret = 0
154
204
if not test_writable (dirs ):
155
205
if not skip_mount :
156
- ret = do_mnt (dirs )
206
+ ret = do_mnt (dirs , "overlay" )
157
207
158
208
# Check if mounted
159
- if (not ret ) and is_mounted (dirs ):
209
+ if (not ret ) and is_mounted (dirs , "overlay" ):
160
210
log_err ("READ-ONLY: Mounted {} to make Read-Write" .format (dirs ))
161
- event_pub ()
211
+ event_pub (DISK_RO_EVENT )
212
+
213
+ if ret :
214
+ # When disk mounted, disk no free space issue also been fixed.
215
+ return ret
216
+
217
+ # Handle disk no free space case
218
+ if test_disk_full (dirs ):
219
+ if not skip_mount :
220
+ ret = do_mnt (dirs , "overlay_disk_full" )
221
+
222
+ # Check if mounted
223
+ if (not ret ) and is_mounted (dirs , "overlay_disk_full" ):
224
+ log_err ("DISK-FULL: Mounted {} to make Read-Write" .format (dirs ))
225
+ event_pub (DISK_FULL_EVENT )
226
+
227
+ # Unmount when disk space issue fixed
228
+ if is_mounted (dirs , "overlay_disk_full" ) and not test_disk_full (["/" ]):
229
+ log_debug ("umount for disk space issue fixed" )
230
+ do_unmnt (dirs , "overlay_disk_full" )
162
231
163
232
return ret
164
233
0 commit comments