Skip to content

Commit 1039764

Browse files
authored
Merge pull request #42 from keboliu/backport-state-machine
[xcvrd] backport PR(#39) "Enhance xcvrd to handle new system level event/error" to 201811
2 parents 42f64d8 + 7ab9888 commit 1039764

File tree

1 file changed

+200
-36
lines changed

1 file changed

+200
-36
lines changed

sonic-xcvrd/scripts/xcvrd

+200-36
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,24 @@ TIME_FOR_SFP_READY_SECS = 1
5252
RETRIES_FOR_SPF_READY = 5
5353
XCVRD_MAIN_THREAD_SLEEP_MSECS = 60000
5454

55+
RETRY_TIMES_FOR_SYSTEM_READY = 24
56+
RETRY_PERIOD_FOR_SYSTEM_READY_MSECS = 5000
57+
5558
SFP_STATUS_INSERTED = '1'
5659
SFP_STATUS_REMOVED = '0'
5760

61+
EVENT_ON_ALL_SFP = '-1'
62+
# events definition
63+
SYSTEM_NOT_READY = 'system_not_ready'
64+
SYSTEM_BECOME_READY = 'system_become_ready'
65+
SYSTEM_FAIL = 'system_fail'
66+
NORMAL_EVENT = 'normal'
67+
# states definition
68+
STATE_INIT = 0
69+
STATE_NORMAL = 1
70+
STATE_EXIT = 2
71+
72+
SFP_EEPROM_HANDLE_SUCCESS = 0
5873
PHYSICAL_PORT_NOT_EXIST = -1
5974
SFP_EEPROM_NOT_READY = -2
6075

@@ -63,6 +78,8 @@ VOLT_UNIT = 'Volts'
6378
POWER_UNIT = 'dBm'
6479
BIAS_UNIT = 'mA'
6580

81+
XCVRD_MAIN_TASK_RUNNING_FLAG = True
82+
6683
#========================== Syslog wrappers ==========================
6784

6885
def log_info(msg, also_print_to_console=False):
@@ -92,15 +109,16 @@ def log_error(msg, also_print_to_console=False):
92109
#========================== Signal Handling ==========================
93110

94111
def signal_handler(sig, frame):
    """
    Handle the signals delivered to the daemon.

    SIGHUP is logged and ignored. SIGINT and SIGTERM clear the module level
    XCVRD_MAIN_TASK_RUNNING_FLAG so that the loops polling that flag can
    finish, instead of exiting from inside the handler. Any other signal is
    only logged as a warning.

    :param sig: number of the signal that was caught
    :param frame: stack frame that was interrupted (unused)
    """
    global XCVRD_MAIN_TASK_RUNNING_FLAG
    if sig == signal.SIGHUP:
        log_info("Caught SIGHUP - ignoring...")
        return
    elif sig == signal.SIGINT:
        log_info("Caught SIGINT - exiting...")
        XCVRD_MAIN_TASK_RUNNING_FLAG = False
    elif sig == signal.SIGTERM:
        log_info("Caught SIGTERM - exiting...")
        XCVRD_MAIN_TASK_RUNNING_FLAG = False
    else:
        # sig is an integer: concatenating it to a str would raise a
        # TypeError inside the handler, so format it instead.
        log_warning("Caught unhandled signal '{}'".format(sig))
        return
@@ -256,6 +274,8 @@ def post_port_sfp_info_to_db(logical_port_name, table):
256274
log_error("This functionality is currently not implemented for this platform")
257275
sys.exit(3)
258276

277+
return SFP_EEPROM_HANDLE_SUCCESS
278+
259279
# update dom sensor info to db
260280
def post_port_dom_info_to_db(logical_port_name, table):
261281
ganged_port = False
@@ -333,8 +353,32 @@ def recover_missing_sfp_table_entries(sfp_util, int_tbl):
333353
logical_port_list = sfp_util.logical
334354
for logical_port_name in logical_port_list:
335355
if logical_port_name not in keys:
336-
post_port_sfp_info_to_db(logical_port_name, int_tbl)
337-
log_info("Port {} has been recovered".format(logical_port_name))
356+
rc = post_port_sfp_info_to_db(logical_port_name, int_tbl)
357+
if rc == SFP_EEPROM_HANDLE_SUCCESS:
358+
log_info("Port {} has been recovered".format(logical_port_name))
359+
360+
def mapping_event_from_change_event(status, port_dict):
    """
    Map what get_transceiver_change_event returns to an event defined in
    the state machine. The logic is pretty straightforward.

    :param status: first element returned by get_transceiver_change_event
    :param port_dict: second element returned by
                      get_transceiver_change_event. When it is a dict it is
                      updated in place: the EVENT_ON_ALL_SFP key is set
                      whenever the event has to be synthesized here.
                      None is tolerated and handled like an empty dict.
    :return: NORMAL_EVENT when status is truthy and port_dict is not empty,
             SYSTEM_BECOME_READY when status is truthy and port_dict is
             empty, the value stored under EVENT_ON_ALL_SFP when status is
             falsy and that key is present, SYSTEM_FAIL otherwise.
    """
    if port_dict is None:
        # Nothing to annotate for the caller; use a scratch dict so that
        # the lookups and assignments below cannot raise.
        port_dict = {}

    if status:
        if port_dict:
            return NORMAL_EVENT
        # here, a simple timeout event whose port_dict is empty is mapped
        # into a SYSTEM_BECOME_READY event so that it can be handled
        port_dict[EVENT_ON_ALL_SFP] = SYSTEM_BECOME_READY
        return SYSTEM_BECOME_READY

    if EVENT_ON_ALL_SFP in port_dict:
        return port_dict[EVENT_ON_ALL_SFP]

    # this should not happen. just for protection
    port_dict[EVENT_ON_ALL_SFP] = SYSTEM_FAIL
    return SYSTEM_FAIL
338382

339383
# Timer thread wrapper class to update dom info to DB periodically
340384
class dom_info_update_task:
@@ -403,7 +447,7 @@ def main():
403447
sel.addSelectable(sst)
404448

405449
# Make sure this daemon started after all port configured.
406-
while True:
450+
while XCVRD_MAIN_TASK_RUNNING_FLAG:
407451
(state, c) = sel.select(SELECT_TIMEOUT_MSECS)
408452
if state == swsscommon.Select.TIMEOUT:
409453
continue
@@ -426,41 +470,161 @@ def main():
426470
dom_info_update.task_run()
427471

428472
# Start main loop to listen to the SFP change event.
473+
# The state migrating sequence:
474+
# 1. When the system starts, it is in "INIT" state, calling get_transceiver_change_event
475+
# with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY
476+
# times
477+
# 2. Once 'system_become_ready' is returned, the system enters "NORMAL" state and starts to monitor
478+
# the insertion/removal event of all the SFP modules.
479+
# In this state, receiving any system level event will be treated as an unrecoverable error and cause
480+
# the daemon to exit
481+
482+
# states definition
483+
# - Initial state: INIT, before received system ready or a normal event
484+
# - Final state: EXIT
485+
# - other state: NORMAL, after has received system-ready or a normal event
486+
487+
# events definition
488+
# - SYSTEM_NOT_READY
489+
# - SYSTEM_BECOME_READY
490+
# -
491+
# - NORMAL_EVENT
492+
# - sfp insertion/removal
493+
# - timeout returned by sfputil.get_change_event with status = true
494+
# - SYSTEM_FAIL
495+
496+
# State transitions:
497+
# 1. SYSTEM_NOT_READY
498+
# - INIT
499+
# - retry < RETRY_TIMES_FOR_SYSTEM_READY
500+
# retry ++
501+
# - else
502+
# max retry reached, treat as fatal, exit
503+
# - NORMAL
504+
# Treat as a fatal error, exit
505+
# 2. SYSTEM_BECOME_READY
506+
# - INIT
507+
# transition to NORMAL
508+
# - NORMAL
509+
# log the event
510+
# nop
511+
# 3. NORMAL_EVENT
512+
# - INIT (for the vendors who don't implement SYSTEM_BECOME_READY)
513+
# transition to NORMAL
514+
# handle the event normally
515+
# - NORMAL
516+
# handle the event normally
517+
# 4. SYSTEM_FAIL
518+
# treat as a fatal error
519+
520+
# State event next state
521+
# INIT SYSTEM NOT READY INIT / EXIT
522+
# INIT SYSTEM BECOME READY NORMAL
523+
# NORMAL SYSTEM BECOME READY NORMAL
524+
# INIT/NORMAL SYSTEM FAIL EXIT
525+
# INIT/NORMAL NORMAL EVENT NORMAL
526+
# NORMAL SYSTEM NOT READY EXIT
527+
# EXIT -
528+
429529
log_info("Start main loop")
430530
time_last_recovery_run = time.time()
431-
while True:
432-
status, port_dict = platform_sfputil.get_transceiver_change_event(XCVRD_MAIN_THREAD_SLEEP_MSECS)
433-
if status:
434-
for key, value in port_dict.iteritems():
435-
logical_port_list = platform_sfputil.get_physical_to_logical(int(key))
436-
for logical_port in logical_port_list:
437-
if value == SFP_STATUS_INSERTED:
438-
rc = post_port_sfp_info_to_db(logical_port, int_tbl)
439-
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
440-
if rc == SFP_EEPROM_NOT_READY:
441-
log_info("Port {} isn't present when got SFP insert event".format(logical_port))
442-
retry = 0
443-
while retry <= RETRIES_FOR_SPF_READY:
444-
time.sleep(TIME_FOR_SFP_READY_SECS)
445-
rc = post_port_sfp_info_to_db(logical_port, int_tbl)
446-
if rc == SFP_EEPROM_NOT_READY:
447-
log_info("Port {} isn't present when got SFP insert event, retry {}".format(logical_port, retry))
448-
retry = retry + 1
449-
else:
450-
break
531+
retry = 0
532+
timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS
533+
state = STATE_INIT
534+
while XCVRD_MAIN_TASK_RUNNING_FLAG:
535+
next_state = state
536+
status, port_dict = platform_sfputil.get_transceiver_change_event(timeout)
537+
event = mapping_event_from_change_event(status, port_dict)
538+
539+
if event == SYSTEM_NOT_READY:
540+
if state == STATE_INIT:
541+
# system not ready, wait and retry
542+
if retry >= RETRY_TIMES_FOR_SYSTEM_READY:
543+
log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY))
544+
next_state = STATE_EXIT
545+
else:
546+
retry = retry + 1
547+
548+
# get_transceiver_change_event may return immediately,
549+
# we want the retry expired in expected time period,
550+
# So need to calc the time diff,
551+
# if the time diff is less than the pre-defined waiting time,
552+
# use sleep() to complete the time.
553+
time_now = time.time()
554+
time_diff = time_now - time_start
555+
if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000:
556+
time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff)
557+
elif state == STATE_NORMAL:
558+
log_error("Got system_not_ready in normal state, treat as fatal. Exiting...")
559+
next_state = STATE_EXIT
560+
else:
561+
next_state = STATE_EXIT
562+
elif event == SYSTEM_BECOME_READY:
563+
if state == STATE_INIT:
564+
next_state = STATE_NORMAL
565+
log_info("Got system_become_ready in init state, transmit to normal state")
566+
elif state == STATE_NORMAL:
567+
next_state = STATE_NORMAL
568+
else:
569+
next_state = STATE_EXIT
570+
571+
572+
elif event == NORMAL_EVENT:
573+
if state == STATE_NORMAL or state == STATE_INIT:
574+
if state == STATE_INIT:
575+
next_state = STATE_NORMAL
576+
# this is the original logic that handled the transceiver change event
577+
# this can be reached in two cases:
578+
# 1. the state has been normal before got the event
579+
# 2. the state was init and transitions to normal after getting the event.
580+
# this is for the vendors who don't implement "system_not_ready/system_become_ready" logic
581+
for key, value in port_dict.iteritems():
582+
logical_port_list = platform_sfputil.get_physical_to_logical(int(key))
583+
for logical_port in logical_port_list:
584+
if value == SFP_STATUS_INSERTED:
585+
log_info("Got SFP inserted event")
586+
rc = post_port_sfp_info_to_db(logical_port, int_tbl)
587+
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
588+
if rc == SFP_EEPROM_NOT_READY:
589+
log_info("Port {} isn't present when got SFP insert event".format(logical_port))
590+
retry = 0
591+
while retry <= RETRIES_FOR_SPF_READY:
592+
time.sleep(TIME_FOR_SFP_READY_SECS)
593+
rc = post_port_sfp_info_to_db(logical_port, int_tbl)
594+
if rc == SFP_EEPROM_NOT_READY:
595+
log_info("Port {} isn't present when got SFP insert event, retry {}".format(logical_port, retry))
596+
retry = retry + 1
597+
else:
598+
break
599+
else:
600+
log_info("get sfp info successfully {}, push to db".format(logical_port))
601+
post_port_dom_info_to_db(logical_port, dom_tbl)
602+
603+
elif value == SFP_STATUS_REMOVED:
604+
log_info("Got SFP removed event")
605+
del_port_sfp_dom_info_to_db(logical_port, int_tbl, dom_tbl)
451606
else:
452-
log_info("get sfp info successfully {}, push to db".format(logical_port))
453-
post_port_dom_info_to_db(logical_port, dom_tbl)
454-
455-
elif value == SFP_STATUS_REMOVED:
456-
del_port_sfp_dom_info_to_db(logical_port, int_tbl, dom_tbl)
457-
else:
458-
# TODO, SFP return error code, need handle accordingly.
459-
continue
607+
# TODO, SFP return error code, need handle accordingly.
608+
log_warning("Got unknown event {}, ignored".format(value))
609+
continue
610+
else:
611+
next_state = STATE_EXIT
612+
elif event == SYSTEM_FAIL:
613+
# no matter which state it is currently in, it's fatal
614+
next_state = STATE_EXIT
615+
log_error("Got system_fail event on state {}, exiting".format(state))
460616
else:
461-
# If get_transceiver_change_event() return error, will clean up the DB and then exit
462-
# TODO: next step need to define more error types to handle accordingly.
617+
log_warning("Got unknown event {} on state {}.".format(event, state))
618+
619+
if next_state != state:
620+
log_info("State transmitted from {} to {}".format(state, next_state))
621+
state = next_state
622+
623+
if next_state == STATE_EXIT:
463624
break
625+
elif next_state == STATE_NORMAL:
626+
# When transitioning to normal state the timeout is changed
627+
timeout = XCVRD_MAIN_THREAD_SLEEP_MSECS
464628

465629
time_now = time.time()
466630
time_diff = time_now - time_last_recovery_run
@@ -476,7 +640,7 @@ def main():
476640
logical_port_list = platform_sfputil.logical
477641
for logical_port_name in logical_port_list:
478642
del_port_sfp_dom_info_to_db(logical_port_name, int_tbl, dom_tbl)
479-
log_error("Error: return error from get_transceiver_change_event(), exiting...")
643+
log_error("Xcvrd main task stopped, exiting...")
480644
return 1
481645

482646
if __name__ == '__main__':

0 commit comments

Comments
 (0)