@@ -52,9 +52,24 @@ TIME_FOR_SFP_READY_SECS = 1
52
52
RETRIES_FOR_SPF_READY = 5
53
53
XCVRD_MAIN_THREAD_SLEEP_MSECS = 60000
54
54
55
# Budget for waiting on the platform to become ready: the main loop gives up after
# RETRY_TIMES_FOR_SYSTEM_READY retries, and uses RETRY_PERIOD_FOR_SYSTEM_READY_MSECS
# as the change-event timeout while it is still in the INIT state.
+ RETRY_TIMES_FOR_SYSTEM_READY = 24
56
+ RETRY_PERIOD_FOR_SYSTEM_READY_MSECS = 5000
57
+
55
58
SFP_STATUS_INSERTED = '1'
56
59
SFP_STATUS_REMOVED = '0'
57
60
61
# Key in port_dict under which a system-level event (as opposed to a per-port
# inserted/removed status) is stored.
+ EVENT_ON_ALL_SFP = '-1'
62
# Event names returned by mapping_event_from_change_event and dispatched on by the main loop.
+ # events definition
63
+ SYSTEM_NOT_READY = 'system_not_ready'
64
+ SYSTEM_BECOME_READY = 'system_become_ready'
65
+ SYSTEM_FAIL = 'system_fail'
66
+ NORMAL_EVENT = 'normal'
67
# States of the main-loop state machine; the loop starts in STATE_INIT and stops on STATE_EXIT.
+ # states definition
68
+ STATE_INIT = 0
69
+ STATE_NORMAL = 1
70
+ STATE_EXIT = 2
71
+
72
# Return code of post_port_sfp_info_to_db when the SFP info was handled successfully.
+ SFP_EEPROM_HANDLE_SUCCESS = 0
58
73
PHYSICAL_PORT_NOT_EXIST = - 1
59
74
SFP_EEPROM_NOT_READY = - 2
60
75
@@ -63,6 +78,8 @@ VOLT_UNIT = 'Volts'
63
78
POWER_UNIT = 'dBm'
64
79
BIAS_UNIT = 'mA'
65
80
81
# Run flag for the daemon: signal_handler sets it to False on SIGINT/SIGTERM, and the
# `while XCVRD_MAIN_TASK_RUNNING_FLAG` loops poll it to know when to stop.
+ XCVRD_MAIN_TASK_RUNNING_FLAG = True
82
+
66
83
#========================== Syslog wrappers ==========================
67
84
68
85
def log_info (msg , also_print_to_console = False ):
@@ -92,15 +109,16 @@ def log_error(msg, also_print_to_console=False):
92
109
#========================== Signal Handling ==========================
93
110
94
111
def signal_handler(sig, frame):
    """Handle a process signal delivered to the daemon.

    SIGHUP is logged and ignored. SIGINT and SIGTERM are logged and clear the
    module-level XCVRD_MAIN_TASK_RUNNING_FLAG rather than exiting directly, so
    the loops that poll the flag can stop on their own. Any other signal is
    logged as a warning and otherwise ignored.

    :param sig: number of the signal that was received
    :param frame: current stack frame passed in by the signal module (unused)
    """
    global XCVRD_MAIN_TASK_RUNNING_FLAG
    if sig == signal.SIGHUP:
        log_info("Caught SIGHUP - ignoring...")
        return
    elif sig == signal.SIGINT:
        log_info("Caught SIGINT - exiting...")
        XCVRD_MAIN_TASK_RUNNING_FLAG = False
    elif sig == signal.SIGTERM:
        log_info("Caught SIGTERM - exiting...")
        XCVRD_MAIN_TASK_RUNNING_FLAG = False
    else:
        # sig is an integer: concatenating it to a str with '+' raises
        # TypeError inside the handler, so format it instead.
        log_warning("Caught unhandled signal '{}'".format(sig))
        return
@@ -256,6 +274,8 @@ def post_port_sfp_info_to_db(logical_port_name, table):
256
274
log_error ("This functionality is currently not implemented for this platform" )
257
275
sys .exit (3 )
258
276
277
+ return SFP_EEPROM_HANDLE_SUCCESS
278
+
259
279
# update dom sensor info to db
260
280
def post_port_dom_info_to_db (logical_port_name , table ):
261
281
ganged_port = False
@@ -333,8 +353,32 @@ def recover_missing_sfp_table_entries(sfp_util, int_tbl):
333
353
logical_port_list = sfp_util .logical
334
354
for logical_port_name in logical_port_list :
335
355
if logical_port_name not in keys :
336
- post_port_sfp_info_to_db (logical_port_name , int_tbl )
337
- log_info ("Port {} has been recovered" .format (logical_port_name ))
356
+ rc = post_port_sfp_info_to_db (logical_port_name , int_tbl )
357
+ if rc == SFP_EEPROM_HANDLE_SUCCESS :
358
+ log_info ("Port {} has been recovered" .format (logical_port_name ))
359
+
360
def mapping_event_from_change_event(status, port_dict):
    """Map the result of get_transceiver_change_event to a state-machine event.

    :param status: boolean status returned by get_transceiver_change_event
    :param port_dict: dict returned alongside status; it may carry a
        system-level event under the EVENT_ON_ALL_SFP key. When this function
        synthesizes a system-level event it records it in port_dict under that
        key. None is tolerated and treated like an empty dict, except that
        nothing is written back.
    :return: one of NORMAL_EVENT, SYSTEM_BECOME_READY, SYSTEM_FAIL, or whatever
        event the platform stored under EVENT_ON_ALL_SFP
    """
    if port_dict is None:
        # Nothing to annotate: classify from status alone instead of crashing
        # on the item assignment / membership test below.
        return SYSTEM_BECOME_READY if status else SYSTEM_FAIL

    if status:
        if port_dict:
            return NORMAL_EVENT
        # A plain timeout (success with an empty port_dict) is mapped to
        # SYSTEM_BECOME_READY so that the state machine can handle it.
        port_dict[EVENT_ON_ALL_SFP] = SYSTEM_BECOME_READY
        return SYSTEM_BECOME_READY

    if EVENT_ON_ALL_SFP in port_dict:
        return port_dict[EVENT_ON_ALL_SFP]

    # A failure without a system-level event should not happen; treat it as fatal.
    port_dict[EVENT_ON_ALL_SFP] = SYSTEM_FAIL
    return SYSTEM_FAIL
338
382
339
383
# Timer thread wrapper class to update dom info to DB periodically
340
384
class dom_info_update_task :
@@ -403,7 +447,7 @@ def main():
403
447
sel .addSelectable (sst )
404
448
405
449
# Make sure this daemon started after all port configured.
406
- while True :
450
+ while XCVRD_MAIN_TASK_RUNNING_FLAG :
407
451
(state , c ) = sel .select (SELECT_TIMEOUT_MSECS )
408
452
if state == swsscommon .Select .TIMEOUT :
409
453
continue
@@ -426,41 +470,161 @@ def main():
426
470
dom_info_update .task_run ()
427
471
428
472
# Start main loop to listen to the SFP change event.
473
+ # The state migrating sequence:
474
+ # 1. When the system starts, it is in "INIT" state, calling get_transceiver_change_event
475
+ # with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY
476
+ # times
477
+ # 2. Once 'system_become_ready' is returned, the system enters the "NORMAL" state and starts to monitor
478
+ # the insertion/removal event of all the SFP modules.
479
+ # In this state, receiving any system level event will be treated as an unrecoverable error and cause
480
+ # the daemon exit
481
+
482
+ # states definition
483
+ # - Initial state: INIT, before received system ready or a normal event
484
+ # - Final state: EXIT
485
+ # - other state: NORMAL, after has received system-ready or a normal event
486
+
487
+ # events definition
488
+ # - SYSTEM_NOT_READY
489
+ # - SYSTEM_BECOME_READY
490
+ # -
491
+ # - NORMAL_EVENT
492
+ # - sfp insertion/removal
493
+ # - timeout returned by sfputil.get_change_event with status = true
494
+ # - SYSTEM_FAIL
495
+
496
+ # State transitions:
497
+ # 1. SYSTEM_NOT_READY
498
+ # - INIT
499
+ # - retry < RETRY_TIMES_FOR_SYSTEM_READY
500
+ # retry ++
501
+ # - else
502
+ # max retry reached, treat as fatal, exit
503
+ # - NORMAL
504
+ # Treat as a fatal error, exit
505
+ # 2. SYSTEM_BECOME_READY
506
+ # - INIT
507
+ # transmit to NORMAL
508
+ # - NORMAL
509
+ # log the event
510
+ # nop
511
+ # 3. NORMAL_EVENT
512
+ # - INIT (for the vendors who don't implement SYSTEM_BECOME_READY)
513
+ # transmit to NORMAL
514
+ # handle the event normally
515
+ # - NORMAL
516
+ # handle the event normally
517
+ # 4. SYSTEM_FAIL
518
+ # treat as a fatal error
519
+
520
+ # State event next state
521
+ # INIT SYSTEM NOT READY INIT / EXIT
522
+ # INIT SYSTEM BECOME READY NORMAL
523
+ # NORMAL SYSTEM BECOME READY NORMAL
524
+ # INIT/NORMAL SYSTEM FAIL EXIT
525
+ # INIT/NORMAL NORMAL EVENT NORMAL
526
+ # NORMAL SYSTEM NOT READY EXIT
527
+ # EXIT -
528
+
429
529
log_info ("Start main loop" )
430
530
time_last_recovery_run = time .time ()
431
- while True :
432
- status , port_dict = platform_sfputil .get_transceiver_change_event (XCVRD_MAIN_THREAD_SLEEP_MSECS )
433
- if status :
434
- for key , value in port_dict .iteritems ():
435
- logical_port_list = platform_sfputil .get_physical_to_logical (int (key ))
436
- for logical_port in logical_port_list :
437
- if value == SFP_STATUS_INSERTED :
438
- rc = post_port_sfp_info_to_db (logical_port , int_tbl )
439
- # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
440
- if rc == SFP_EEPROM_NOT_READY :
441
- log_info ("Port {} isn't present when got SFP insert event" .format (logical_port ))
442
- retry = 0
443
- while retry <= RETRIES_FOR_SPF_READY :
444
- time .sleep (TIME_FOR_SFP_READY_SECS )
445
- rc = post_port_sfp_info_to_db (logical_port , int_tbl )
446
- if rc == SFP_EEPROM_NOT_READY :
447
- log_info ("Port {} isn't present when got SFP insert event, retry {}" .format (logical_port , retry ))
448
- retry = retry + 1
449
- else :
450
- break
531
+ retry = 0
532
+ timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS
533
+ state = STATE_INIT
534
+ while XCVRD_MAIN_TASK_RUNNING_FLAG :
535
+ next_state = state
536
+ status , port_dict = platform_sfputil .get_transceiver_change_event (timeout )
537
+ event = mapping_event_from_change_event (status , port_dict )
538
+
539
+ if event == SYSTEM_NOT_READY :
540
+ if state == STATE_INIT :
541
+ # system not ready, wait and retry
542
+ if retry >= RETRY_TIMES_FOR_SYSTEM_READY :
543
+ log_error ("System failed to get ready in {} secs or received system error. Exiting..." .format ((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS / 1000 )* RETRY_TIMES_FOR_SYSTEM_READY ))
544
+ next_state = STATE_EXIT
545
+ else :
546
+ retry = retry + 1
547
+
548
+ # get_transceiver_change_event may return immediately,
549
+ # we want the retry expired in expected time period,
550
+ # So need to calc the time diff,
551
+ # if the time diff is less than the pre-defined waiting time,
552
+ # use sleep() to complete the time.
553
+ time_now = time .time ()
554
+ time_diff = time_now - time_start
555
+ if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS / 1000 :
556
+ time .sleep (RETRY_PERIOD_FOR_SYSTEM_READY_MSECS / 1000 - time_diff )
557
+ elif state == STATE_NORMAL :
558
+ log_error ("Got system_not_ready in normal state, treat as fatal. Exiting..." )
559
+ next_state = STATE_EXIT
560
+ else :
561
+ next_state = STATE_EXIT
562
+ elif event == SYSTEM_BECOME_READY :
563
+ if state == STATE_INIT :
564
+ next_state = STATE_NORMAL
565
+ log_info ("Got system_become_ready in init state, transmit to normal state" )
566
+ elif state == STATE_NORMAL :
567
+ next_state = STATE_NORMAL
568
+ else :
569
+ next_state = STATE_EXIT
570
+
571
+
572
+ elif event == NORMAL_EVENT :
573
+ if state == STATE_NORMAL or state == STATE_INIT :
574
+ if state == STATE_INIT :
575
+ next_state = STATE_NORMAL
576
+ # this is the original logic that handled the transceiver change event
577
+ # this can be reached in two cases:
578
+ # 1. the state has been normal before got the event
579
+ # 2. the state was init and is transmitted to normal after got the event.
580
+ # this is for the vendors who don't implement "system_not_ready/system_become_ready" logic
581
+ for key , value in port_dict .iteritems ():
582
+ logical_port_list = platform_sfputil .get_physical_to_logical (int (key ))
583
+ for logical_port in logical_port_list :
584
+ if value == SFP_STATUS_INSERTED :
585
+ log_info ("Got SFP inserted event" )
586
+ rc = post_port_sfp_info_to_db (logical_port , int_tbl )
587
+ # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
588
+ if rc == SFP_EEPROM_NOT_READY :
589
+ log_info ("Port {} isn't present when got SFP insert event" .format (logical_port ))
590
+ retry = 0
591
+ while retry <= RETRIES_FOR_SPF_READY :
592
+ time .sleep (TIME_FOR_SFP_READY_SECS )
593
+ rc = post_port_sfp_info_to_db (logical_port , int_tbl )
594
+ if rc == SFP_EEPROM_NOT_READY :
595
+ log_info ("Port {} isn't present when got SFP insert event, retry {}" .format (logical_port , retry ))
596
+ retry = retry + 1
597
+ else :
598
+ break
599
+ else :
600
+ log_info ("get sfp info successfully {}, push to db" .format (logical_port ))
601
+ post_port_dom_info_to_db (logical_port , dom_tbl )
602
+
603
+ elif value == SFP_STATUS_REMOVED :
604
+ log_info ("Got SFP removed event" )
605
+ del_port_sfp_dom_info_to_db (logical_port , int_tbl , dom_tbl )
451
606
else :
452
- log_info ("get sfp info successfully {}, push to db" .format (logical_port ))
453
- post_port_dom_info_to_db (logical_port , dom_tbl )
454
-
455
- elif value == SFP_STATUS_REMOVED :
456
- del_port_sfp_dom_info_to_db (logical_port , int_tbl , dom_tbl )
457
- else :
458
- # TODO, SFP return error code, need handle accordingly.
459
- continue
607
+ # TODO, SFP return error code, need handle accordingly.
608
+ log_warning ("Got unknown event {}, ignored" .format (value ))
609
+ continue
610
+ else :
611
+ next_state = STATE_EXIT
612
+ elif event == SYSTEM_FAIL :
613
+ # no matter which state current it is, it's fatal
614
+ next_state = STATE_EXIT
615
+ log_error ("Got system_fail event on state {}, exiting" .format (state ))
460
616
else :
461
- # If get_transceiver_change_event() return error, will clean up the DB and then exit
462
- # TODO: next step need to define more error types to handle accordingly.
617
+ log_warning ("Got unknown event {} on state {}." .format (event , state ))
618
+
619
+ if next_state != state :
620
+ log_info ("State transmitted from {} to {}" .format (state , next_state ))
621
+ state = next_state
622
+
623
+ if next_state == STATE_EXIT :
463
624
break
625
+ elif next_state == STATE_NORMAL :
626
+ # When transit to normal state time out will be changed
627
+ timeout = XCVRD_MAIN_THREAD_SLEEP_MSECS
464
628
465
629
time_now = time .time ()
466
630
time_diff = time_now - time_last_recovery_run
@@ -476,7 +640,7 @@ def main():
476
640
logical_port_list = platform_sfputil .logical
477
641
for logical_port_name in logical_port_list :
478
642
del_port_sfp_dom_info_to_db (logical_port_name , int_tbl , dom_tbl )
479
- log_error ("Error: return error from get_transceiver_change_event() , exiting..." )
643
+ log_error ("Xcvrd main task stopped , exiting..." )
480
644
return 1
481
645
482
646
if __name__ == '__main__' :
0 commit comments