5
5
PCIe device monitoring daemon for SONiC
6
6
"""
7
7
8
- try :
9
- import os
10
- import signal
11
- import sys
12
- import threading
8
+ import os
9
+ import signal
10
+ import sys
11
+ import threading
13
12
14
- from sonic_py_common import daemon_base , device_info
15
- from swsscommon import swsscommon
16
- except ImportError as e :
17
- raise ImportError (str (e ) + " - required module not found" )
13
+ from sonic_py_common import daemon_base , device_info
14
+ from swsscommon import swsscommon
18
15
19
16
#
20
17
# Constants ====================================================================
21
18
#
19
+
20
# TODO: Once we no longer support Python 2, we can eliminate this and get the
# name using the 'name' field (e.g., `signal.SIGINT.name`) starting with Python 3.5
#
# Maps each signal number exposed by the `signal` module to its symbolic name.
# Names containing an underscore (e.g. SIG_DFL, SIG_IGN) are excluded.
SIGNALS_TO_NAMES_DICT = {
    getattr(signal, sig_name): sig_name
    for sig_name in dir(signal)
    if sig_name.startswith('SIG') and '_' not in sig_name
}
24
+
22
25
# Identifier handed to the daemon's logger (see main()).
SYSLOG_IDENTIFIER = "pcied"

PCIE_RESULT_REGEX = "PCIe Device Checking All Test"

# STATE_DB table names: per-device id/AER entries and the overall check status.
PCIE_DEVICE_TABLE_NAME = "PCIE_DEVICE"
PCIE_STATUS_TABLE_NAME = "PCIE_DEVICES"

# Timeout, in seconds, of the stop-event wait between two device checks.
PCIED_MAIN_THREAD_SLEEP_SECS = 60

# Process exit codes. PCIEUTIL_LOAD_ERROR is used when no PCIe utility
# could be loaded.
PCIEUTIL_CONF_FILE_ERROR = 1
PCIEUTIL_LOAD_ERROR = 2

# PCIe utility object shared by the daemon's methods; assigned in
# DaemonPcied.__init__().
platform_pcieutil = None

# Exit status returned by main(); the signal handler overwrites it when a
# fatal signal arrives.
exit_code = 0
39
+
40
+ # wrapper functions to call the platform api
41
def load_platform_pcieutil():
    """Load the PCIe utility object for the current platform.

    The platform-specific ``sonic_platform.pcie.Pcie`` class is tried first.
    If it cannot be imported, the generic
    ``sonic_platform_base.sonic_pcie.pcie_common.PcieUtil`` is used instead.

    Returns:
        The instantiated utility, or None when neither module could be
        imported (both failures are logged).
    """
    # This is a module-level function, so there is no `self` to log through.
    # The previous `self.log_error(...)` calls raised NameError as soon as the
    # first import failed, which also prevented the fallback from running.
    from sonic_py_common import logger
    log = logger.Logger(SYSLOG_IDENTIFIER)

    _platform_pcieutil = None
    (platform_path, _) = device_info.get_paths_to_platform_and_hwsku_dirs()
    try:
        from sonic_platform.pcie import Pcie
        _platform_pcieutil = Pcie(platform_path)
    except ImportError as e:
        log.log_error("Failed to load platform Pcie module. Error : {}".format(str(e)), True)
        try:
            from sonic_platform_base.sonic_pcie.pcie_common import PcieUtil
            _platform_pcieutil = PcieUtil(platform_path)
        except ImportError as fallback_err:
            log.log_error("Failed to load default PcieUtil module. Error : {}".format(str(fallback_err)), True)
    return _platform_pcieutil
55
+
56
def read_id_file(device_name):
    """Return the contents of a PCI device's sysfs ``device`` file.

    Args:
        device_name: Device address without the domain part; it is inserted
            into ``/sys/bus/pci/devices/0000:<device_name>/device``.

    Returns:
        The file contents with surrounding whitespace stripped, or None when
        the file does not exist.
    """
    dev_id_path = '/sys/bus/pci/devices/0000:%s/device' % device_name

    if not os.path.exists(dev_id_path):
        return None

    with open(dev_id_path, 'r') as id_file:
        return id_file.read().strip()
32
64
33
65
#
34
66
# Daemon =======================================================================
@@ -39,142 +71,145 @@ class DaemonPcied(daemon_base.DaemonBase):
39
71
def __init__(self, log_identifier):
    """Set up daemon state, load the PCIe utility and open the STATE_DB tables.

    Args:
        log_identifier: Identifier forwarded to the base class constructor.

    Exits the process with PCIEUTIL_LOAD_ERROR when no PCIe utility can be
    loaded.
    """
    super(DaemonPcied, self).__init__(log_identifier)

    self.timeout = PCIED_MAIN_THREAD_SLEEP_SECS
    self.stop_event = threading.Event()
    self.state_db = None
    self.device_table = None
    # Defined before any early exit so that __del__ can always test it;
    # previously it only came into existence at the very end of __init__.
    self.status_table = None
    self.table = None
    self.resultInfo = []
    self.device_name = None
    self.aer_stats = {}

    global platform_pcieutil

    platform_pcieutil = load_platform_pcieutil()
    if platform_pcieutil is None:
        sys.exit(PCIEUTIL_LOAD_ERROR)

    # Connect to STATE_DB and create pcie device table
    self.state_db = daemon_base.db_connect("STATE_DB")
    self.device_table = swsscommon.Table(self.state_db, PCIE_DEVICE_TABLE_NAME)
    self.status_table = swsscommon.Table(self.state_db, PCIE_STATUS_TABLE_NAME)
93
+
94
def __del__(self):
    """Delete every key from the device table and the status table."""
    # __del__ also runs on instances whose __init__ did not finish, so the
    # attributes may be missing altogether; look them up defensively.
    for table_attr in ('device_table', 'status_table'):
        table = getattr(self, table_attr, None)
        if table:
            for key in table.getKeys():
                table._del(key)
103
+
104
+ # load aer-fields into statedb
105
def update_aer_to_statedb(self):
    """Write the AER statistics in ``self.aer_stats`` to the device table.

    ``self.aer_stats`` is read as a dict of dicts. Every inner ``field`` of
    outer key ``key`` is stored as ``"<key>|<field>"`` under the table key
    ``self.device_name``. Nothing is written when the stats are None or
    contain no fields; a debug message is logged instead.
    """
    if self.aer_stats is None:
        # Was `.format(device_name)`: an undefined name that raised NameError.
        self.log_debug("PCIe device {} has no AER Stats".format(self.device_name))
        return

    aer_fields = {
        "{}|{}".format(key, field): value
        for key, fv in self.aer_stats.items()
        for field, value in fv.items()
    }

    if aer_fields:
        formatted_fields = swsscommon.FieldValuePairs(list(aer_fields.items()))
        self.device_table.set(self.device_name, formatted_fields)
    else:
        self.log_debug("PCIe device {} has no AER attriutes".format(self.device_name))
78
122
79
- # Check the PCIe devices
80
- def check_pcie_devices (self ):
81
- try :
82
- platform_path , _ = device_info .get_paths_to_platform_and_hwsku_dirs ()
83
- from sonic_platform_base .sonic_pcie .pcie_common import PcieUtil
84
- platform_pcieutil = PcieUtil (platform_path )
85
- except ImportError as e :
86
- self .log_error ("Failed to load default PcieUtil module. Error : {}" .format (str (e )), True )
87
- raise e
88
123
89
- resultInfo = platform_pcieutil .get_pcie_check ()
90
- err = 0
124
+ # Check the PCIe AER Stats
125
def check_n_update_pcie_aer_stats(self, Bus, Dev, Fn):
    """Record the id and AER statistics of one PCIe function in STATE_DB.

    Args:
        Bus: Bus number (integer).
        Dev: Device number (integer).
        Fn: Function number (integer).

    The three numbers are formatted into ``self.device_name``. When the
    device's id file cannot be read, nothing is written.
    """
    self.device_name = "%02x:%02x.%d" % (Bus, Dev, Fn)

    dev_id = read_id_file(self.device_name)

    # Start from empty stats so nothing from a previous device lingers.
    self.aer_stats = {}
    if dev_id is None:
        return

    self.device_table.set(self.device_name, [('id', dev_id)])
    self.aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, dev=Dev, func=Fn)
    self.update_aer_to_statedb()
135
+
136
+
137
+ # Update the PCIe devices status to DB
138
def update_pcie_devices_status_db(self, err):
    """Log the overall PCIe check result and store it in the status table.

    Args:
        err: Truthy when at least one device failed the check.

    The result ("FAILED" or "PASSED") is written as field ``status`` under
    the table key ``status``.
    """
    pcie_status = "FAILED" if err else "PASSED"
    message = "PCIe device status check : {}".format(pcie_status)

    # A failed check is logged as an error, a passed one as info.
    if err:
        self.log_error(message)
    else:
        self.log_info(message)

    self.status_table.set("status", swsscommon.FieldValuePairs([('status', pcie_status)]))
108
150
109
- Bus = int (item ["bus" ], 16 )
110
- Dev = int (item ["dev" ], 16 )
111
- Fn = int (item ["fn" ], 16 )
151
+ # Check the PCIe devices
152
def check_pcie_devices(self):
    """Run the platform PCIe check and publish the results.

    Each entry whose ``result`` is "Failed" is logged and counted. For every
    other entry the hexadecimal ``bus``/``dev``/``fn`` strings are parsed and
    the device's AER statistics are updated. The overall status is written
    last. Returns without writing anything when the check yields None.
    """
    self.resultInfo = platform_pcieutil.get_pcie_check()
    if self.resultInfo is None:
        return

    failed_count = 0
    for entry in self.resultInfo:
        if entry["result"] == "Failed":
            self.log_warning("PCIe Device: " + entry["name"] + " Not Found")
            failed_count += 1
            continue

        bus_no = int(entry["bus"], 16)
        dev_no = int(entry["dev"], 16)
        fn_no = int(entry["fn"], 16)
        # update AER-attributes to DB
        self.check_n_update_pcie_aer_stats(bus_no, dev_no, fn_no)

    # update PCIe Device Status to DB
    self.update_pcie_devices_status_db(failed_count)
121
171
122
- def read_state_db (self , key1 , key2 ):
123
- return self .state_db .get ('STATE_DB' , key1 , key2 )
172
+ # Override signal handler from DaemonBase
173
def signal_handler(self, sig, frame):
    """Handle a signal delivered to the daemon.

    Args:
        sig: Number of the signal that was received.
        frame: Current stack frame (unused).

    SIGINT and SIGTERM set the stop event and set the global ``exit_code``
    to ``128 + sig``. SIGHUP and any other signal are only logged.
    """
    FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM]
    NONFATAL_SIGNALS = [signal.SIGHUP]

    global exit_code

    # Fall back to the raw number: a signal without an entry in the name map
    # must not raise KeyError from inside the handler.
    sig_name = SIGNALS_TO_NAMES_DICT.get(sig, sig)

    if sig in FATAL_SIGNALS:
        self.log_info("Caught signal '{}' - exiting...".format(sig_name))
        exit_code = 128 + sig  # Make sure we exit with a non-zero code so that supervisor will try to restart us
        self.stop_event.set()
    elif sig in NONFATAL_SIGNALS:
        self.log_info("Caught signal '{}' - ignoring...".format(sig_name))
    else:
        self.log_warning("Caught unhandled signal '{}' - ignoring...".format(sig_name))
140
187
141
- # Initialize daemon
142
- def init (self ):
143
- self .log_info ("Start daemon init..." )
144
-
145
- # Deinitialize daemon
146
- def deinit (self ):
147
- self .log_info ("Start daemon deinit..." )
148
-
149
- # Run daemon
188
+ # Main daemon logic
150
189
def run(self):
    """Perform one iteration of the daemon's main loop.

    Waits up to ``self.timeout`` seconds on the stop event, then checks the
    PCIe devices unless the event was set in the meantime.

    Returns:
        False when the stop event was set (the caller should stop looping),
        True after a device check has been performed.
    """
    stop_requested = self.stop_event.wait(self.timeout)
    if not stop_requested:
        self.check_pcie_devices()

    # A set stop event means we received a fatal signal.
    return not stop_requested
170
197
#
171
198
# Main =========================================================================
172
199
#
173
200
174
201
175
202
def main():
    """Create the daemon and run it until run() reports it should stop.

    Returns:
        The module-level ``exit_code``.
    """
    pcied = DaemonPcied(SYSLOG_IDENTIFIER)

    pcied.log_info("Starting up...")

    keep_running = True
    while keep_running:
        keep_running = pcied.run()

    pcied.log_info("Shutting down...")

    return exit_code
178
213
179
214
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    status = main()
    sys.exit(status)
0 commit comments