5
5
PCIe device monitoring daemon for SONiC
6
6
"""
7
7
8
- try :
9
- import os
10
- import signal
11
- import sys
12
- import threading
13
-
14
- import swsssdk
15
- from sonic_py_common import daemon_base , device_info
16
- from swsscommon import swsscommon
17
- except ImportError as e :
18
- raise ImportError (str (e ) + " - required module not found" )
8
+ import os
9
+ import signal
10
+ import sys
11
+ import threading
12
+
13
+ from sonic_py_common import daemon_base , device_info , logger
14
+ from swsscommon import swsscommon
19
15
20
16
#
21
17
# Constants ====================================================================
22
18
#
19
+
20
# TODO: Once Python 2 support is dropped this lookup table can go away; from
# Python 3.5 onwards the name is available directly (e.g. `signal.SIGINT.name`).
# Maps each signal number to its symbolic name, skipping the `SIG_*` handler
# constants (SIG_DFL, SIG_IGN, ...), which are not signals.
SIGNALS_TO_NAMES_DICT = {
    getattr(signal, sig_name): sig_name
    for sig_name in dir(signal)
    if sig_name.startswith('SIG') and '_' not in sig_name
}
24
+
23
25
SYSLOG_IDENTIFIER = "pcied"
24
26
25
27
PCIE_RESULT_REGEX = "PCIe Device Checking All Test"
26
- PCIE_TABLE_NAME = "PCIE_STATUS"
27
28
PCIE_DEVICE_TABLE_NAME = "PCIE_DEVICE"
28
-
29
- PCIE_CONF_FILE = 'pcie.yaml'
29
+ PCIE_STATUS_TABLE_NAME = "PCIE_DEVICES"
30
30
31
31
PCIED_MAIN_THREAD_SLEEP_SECS = 60
32
- REDIS_HOSTIP = "127.0.0.1"
32
+
33
+ PCIEUTIL_CONF_FILE_ERROR = 1
34
+ PCIEUTIL_LOAD_ERROR = 2
35
+
36
+ platform_pcieutil = None
37
+
38
+ log = logger .Logger (SYSLOG_IDENTIFIER )
39
+
40
+ exit_code = 0
41
+
42
+ # wrapper functions to call the platform api
43
def load_platform_pcieutil():
    """Instantiate the PCIe utility object for the running platform.

    The platform-specific ``sonic_platform.pcie.Pcie`` class is tried first.
    If it cannot be imported, the generic
    ``sonic_platform_base.sonic_pcie.pcie_common.PcieUtil`` is used instead.

    Returns:
        The PCIe utility instance, or None when neither module can be imported.
    """
    (platform_path, _) = device_info.get_paths_to_platform_and_hwsku_dirs()

    try:
        from sonic_platform.pcie import Pcie
        return Pcie(platform_path)
    except ImportError as e:
        # Not fatal: the platform may simply not ship its own Pcie class.
        log.log_notice("Failed to load platform Pcie module. Error : {}, Fallback to default module".format(str(e)), True)

    try:
        from sonic_platform_base.sonic_pcie.pcie_common import PcieUtil
        return PcieUtil(platform_path)
    except ImportError as e:
        log.log_error("Failed to load default PcieUtil module. Error : {}".format(str(e)), True)

    return None
57
+
58
def read_id_file(device_name):
    """Read the PCI device ID of a device from sysfs.

    Args:
        device_name: Device address string that is appended to the ``0000:``
            domain prefix to build the sysfs path (the caller in this file
            passes a ``bus:dev.fn`` string).

    Returns:
        The stripped contents of ``/sys/bus/pci/devices/0000:<device_name>/device``,
        or None when the file is missing or cannot be read.
    """
    dev_id_path = '/sys/bus/pci/devices/0000:%s/device' % device_name

    # Open directly instead of checking os.path.exists() first: the device can
    # vanish between the check and the open, and that race must not crash the
    # daemon. IOError is listed explicitly because on Python 2 it is not an
    # alias of OSError.
    try:
        with open(dev_id_path, 'r') as fd:
            return fd.read().strip()
    except (IOError, OSError):
        return None
33
66
34
67
#
35
68
# Daemon =======================================================================
@@ -40,142 +73,145 @@ class DaemonPcied(daemon_base.DaemonBase):
40
73
def __init__(self, log_identifier):
    """Set up daemon state, load the PCIe utility and open the STATE_DB tables.

    Args:
        log_identifier: Identifier forwarded to the DaemonBase constructor.

    Exits the process with PCIEUTIL_LOAD_ERROR when no PCIe utility module
    can be loaded.
    """
    global platform_pcieutil

    super(DaemonPcied, self).__init__(log_identifier)

    self.timeout = PCIED_MAIN_THREAD_SLEEP_SECS
    self.stop_event = threading.Event()
    self.state_db = None
    self.device_table = None
    # Must exist before any early exit below: __del__ reads it, and an
    # instance that never reached the table creation would otherwise raise
    # AttributeError during teardown.
    self.status_table = None
    self.table = None  # kept for backward compatibility; not used in this file section
    self.resultInfo = []
    self.device_name = None
    self.aer_stats = {}

    platform_pcieutil = load_platform_pcieutil()
    if platform_pcieutil is None:
        sys.exit(PCIEUTIL_LOAD_ERROR)

    # Connect to STATE_DB and create the PCIe device and status tables
    self.state_db = daemon_base.db_connect("STATE_DB")
    self.device_table = swsscommon.Table(self.state_db, PCIE_DEVICE_TABLE_NAME)
    self.status_table = swsscommon.Table(self.state_db, PCIE_STATUS_TABLE_NAME)
95
+
96
def __del__(self):
    """Remove every key this daemon wrote to its STATE_DB tables.

    The tables are looked up with getattr() because the finalizer also runs
    on instances whose __init__ exited before the tables were created.
    """
    for attr_name in ('device_table', 'status_table'):
        table = getattr(self, attr_name, None)
        if not table:
            continue
        for key in table.getKeys():
            table._del(key)
105
+
106
+ # load aer-fields into statedb
107
def update_aer_to_statedb(self):
    """Write the AER statistics in self.aer_stats to the PCIe device table.

    self.aer_stats is expected to be a dict of dicts. Each inner field is
    flattened to an "<outer key>|<field>" column and stored under the row
    named by self.device_name. Nothing is written when there are no
    statistics.
    """
    if self.aer_stats is None:
        # The device name lives on the instance; there is no local
        # `device_name` in this method.
        self.log_debug("PCIe device {} has no AER Stats".format(self.device_name))
        return

    aer_fields = {
        "{}|{}".format(key, field): value
        for key, fv in self.aer_stats.items()
        for field, value in fv.items()
    }

    if aer_fields:
        formatted_fields = swsscommon.FieldValuePairs(list(aer_fields.items()))
        self.device_table.set(self.device_name, formatted_fields)
    else:
        self.log_debug("PCIe device {} has no AER attributes".format(self.device_name))
79
124
80
- # Check the PCIe devices
81
- def check_pcie_devices (self ):
82
- try :
83
- platform_path , _ = device_info .get_paths_to_platform_and_hwsku_dirs ()
84
- from sonic_platform_base .sonic_pcie .pcie_common import PcieUtil
85
- platform_pcieutil = PcieUtil (platform_path )
86
- except ImportError as e :
87
- self .log_error ("Failed to load default PcieUtil module. Error : {}" .format (str (e )), True )
88
- raise e
89
125
90
- resultInfo = platform_pcieutil .get_pcie_check ()
91
- err = 0
126
+ # Check the PCIe AER Stats
127
def check_n_update_pcie_aer_stats(self, Bus, Dev, Fn):
    """Refresh the ID and AER statistics of one PCIe device in STATE_DB.

    Args:
        Bus: PCI bus number (integer).
        Dev: PCI device number (integer).
        Fn: PCI function number (integer).

    Nothing is written when the device has no readable ID file.
    """
    self.device_name = "%02x:%02x.%d" % (Bus, Dev, Fn)

    device_id = read_id_file(self.device_name)

    # Always start from an empty result so stale statistics from the
    # previously processed device are never reused.
    self.aer_stats = {}
    if device_id is None:
        return

    self.device_table.set(self.device_name, [('id', device_id)])
    self.aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, dev=Dev, func=Fn)
    self.update_aer_to_statedb()
137
+
138
+
139
+ # Update the PCIe devices status to DB
140
def update_pcie_devices_status_db(self, err):
    """Log the overall PCIe check result and store it in the status table.

    Args:
        err: Truthy when at least one device failed the check.
    """
    failed = bool(err)
    pcie_status = "FAILED" if failed else "PASSED"

    # A failure is reported at error level, a pass at info level.
    report = self.log_error if failed else self.log_info
    report("PCIe device status check : {}".format(pcie_status))

    status_fields = swsscommon.FieldValuePairs([('status', pcie_status)])
    self.status_table.set("status", status_fields)
109
152
110
- Bus = int (item ["bus" ], 16 )
111
- Dev = int (item ["dev" ], 16 )
112
- Fn = int (item ["fn" ], 16 )
153
+ # Check the PCIe devices
154
def check_pcie_devices(self):
    """Run the platform PCIe check and update STATE_DB with the outcome.

    Devices reported as "Failed" are logged and counted. For every other
    device the AER statistics are refreshed. The failure count is then
    passed on to update the overall status. Returns early, without
    updating the status, when the platform check yields None.
    """
    self.resultInfo = platform_pcieutil.get_pcie_check()
    if self.resultInfo is None:
        return

    failed_count = 0
    for entry in self.resultInfo:
        if entry["result"] != "Failed":
            # bus/dev/fn arrive as hexadecimal strings
            bus_dev_fn = [int(entry[part], 16) for part in ("bus", "dev", "fn")]
            # update AER-attributes to DB
            self.check_n_update_pcie_aer_stats(*bus_dev_fn)
            continue

        self.log_warning("PCIe Device: " + entry["name"] + " Not Found")
        failed_count += 1

    # update PCIe Device Status to DB
    self.update_pcie_devices_status_db(failed_count)
122
173
123
- def read_state_db (self , key1 , key2 ):
124
- return self .state_db .get ('STATE_DB' , key1 , key2 )
174
+ # Override signal handler from DaemonBase
175
def signal_handler(self, sig, frame):
    """Handle a delivered signal (overrides the DaemonBase handler).

    SIGINT and SIGTERM record a non-zero exit code and set the stop event.
    SIGHUP and any other signal are logged and ignored.

    Args:
        sig: Number of the signal that was delivered.
        frame: Current stack frame (unused).
    """
    global exit_code

    FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM]
    NONFATAL_SIGNALS = [signal.SIGHUP]

    # Not every deliverable signal has a SIG* constant in the signal module
    # (e.g. real-time signals between SIGRTMIN and SIGRTMAX), so fall back to
    # the raw number instead of raising KeyError inside the handler.
    sig_name = SIGNALS_TO_NAMES_DICT.get(sig, str(sig))

    if sig in FATAL_SIGNALS:
        self.log_info("Caught signal '{}' - exiting...".format(sig_name))
        exit_code = 128 + sig  # Make sure we exit with a non-zero code so that supervisor will try to restart us
        self.stop_event.set()
    elif sig in NONFATAL_SIGNALS:
        self.log_info("Caught signal '{}' - ignoring...".format(sig_name))
    else:
        self.log_warning("Caught unhandled signal '{}' - ignoring...".format(sig_name))
141
189
142
- # Initialize daemon
143
- def init (self ):
144
- self .log_info ("Start daemon init..." )
145
-
146
- # Deinitialize daemon
147
- def deinit (self ):
148
- self .log_info ("Start daemon deinit..." )
149
-
150
- # Run daemon
190
+ # Main daemon logic
151
191
def run(self):
    """Execute one iteration of the daemon loop.

    Waits up to self.timeout seconds on the stop event, then checks the
    PCIe devices unless a stop was requested in the meantime.

    Returns:
        False when the stop event was set (a fatal signal was received),
        True when the caller should invoke run() again.
    """
    stop_requested = self.stop_event.wait(self.timeout)
    if not stop_requested:
        self.check_pcie_devices()
    return not stop_requested
171
199
#
172
200
# Main =========================================================================
173
201
#
174
202
175
203
176
204
def main():
    """Create the pcied daemon and drive it until it asks to stop.

    Returns:
        The module-level exit_code, which the signal handler sets to a
        non-zero value when a fatal signal is received.
    """
    daemon = DaemonPcied(SYSLOG_IDENTIFIER)

    daemon.log_info("Starting up...")

    # Each run() call is one wait-and-check cycle; it returns False once
    # the daemon has been told to stop.
    while True:
        if not daemon.run():
            break

    daemon.log_info("Shutting down...")

    return exit_code


if __name__ == '__main__':
    sys.exit(main())
0 commit comments