@@ -538,11 +538,17 @@ class TemperatureUpdater(logger.Logger):
538
538
539
539
self .is_chassis_system = chassis .is_modular_chassis ()
540
540
if self .is_chassis_system :
541
+ self .module_thermals = set ()
541
542
my_slot = try_get (chassis .get_my_slot , INVALID_SLOT )
542
543
if my_slot != INVALID_SLOT :
543
- table_name = TemperatureUpdater .TEMPER_INFO_TABLE_NAME + '_' + str (my_slot )
544
- chassis_state_db = daemon_base .db_connect ("CHASSIS_STATE_DB" )
545
- self .chassis_table = swsscommon .Table (chassis_state_db , table_name )
544
+ try :
545
+ # A modular chassis is not guaranteed to provide CHASSIS_STATE_DB,
546
+ # so catch any exception raised while connecting and continue without a chassis table.
547
+ table_name = TemperatureUpdater .TEMPER_INFO_TABLE_NAME + '_' + str (my_slot )
548
+ chassis_state_db = daemon_base .db_connect ("CHASSIS_STATE_DB" )
549
+ self .chassis_table = swsscommon .Table (chassis_state_db , table_name )
550
+ except Exception as e :
551
+ self .chassis_table = None
546
552
547
553
def deinit (self ):
548
554
"""
@@ -576,31 +582,61 @@ class TemperatureUpdater(logger.Logger):
576
582
for index , thermal in enumerate (self .chassis .get_all_thermals ()):
577
583
if self .task_stopping_event .is_set ():
578
584
return
579
- try :
580
- self ._refresh_temperature_status (CHASSIS_INFO_KEY , thermal , index )
581
- except Exception as e :
582
- self .log_warning ('Failed to update thermal status - {}' .format (repr (e )))
585
+
586
+ self ._refresh_temperature_status (CHASSIS_INFO_KEY , thermal , index )
583
587
584
588
for psu_index , psu in enumerate (self .chassis .get_all_psus ()):
585
589
parent_name = 'PSU {}' .format (psu_index + 1 )
586
590
for thermal_index , thermal in enumerate (psu .get_all_thermals ()):
587
591
if self .task_stopping_event .is_set ():
588
592
return
589
- try :
590
- self ._refresh_temperature_status (parent_name , thermal , thermal_index )
591
- except Exception as e :
592
- self .log_warning ('Failed to update thermal status - {}' .format (repr (e )))
593
+
594
+ self ._refresh_temperature_status (parent_name , thermal , thermal_index )
593
595
594
596
for sfp_index , sfp in enumerate (self .chassis .get_all_sfps ()):
595
597
parent_name = 'SFP {}' .format (sfp_index + 1 )
596
598
for thermal_index , thermal in enumerate (sfp .get_all_thermals ()):
597
599
if self .task_stopping_event .is_set ():
598
600
return
599
- try :
600
- self ._refresh_temperature_status (parent_name , thermal , thermal_index )
601
- except Exception as e :
602
- self .log_warning ('Failed to update thermal status - {}' .format (repr (e )))
603
601
602
+ self ._refresh_temperature_status (parent_name , thermal , thermal_index )
603
+
604
+ if self .is_chassis_system :
605
+ available_thermals = set ()
606
+ for module_index , module in enumerate (self .chassis .get_all_modules ()):
607
+ module_name = try_get (module .get_name , 'Module {}' .format (module_index + 1 ))
608
+
609
+ for thermal_index , thermal in enumerate (module .get_all_thermals ()):
610
+ if self .task_stopping_event .is_set ():
611
+ return
612
+
613
+ available_thermals .add ((thermal , module_name , thermal_index ))
614
+ self ._refresh_temperature_status (module_name , thermal , thermal_index )
615
+
616
+ for sfp_index , sfp in enumerate (module .get_all_sfps ()):
617
+ sfp_name = '{} SFP {}' .format (module_name , sfp_index + 1 )
618
+ for thermal_index , thermal in enumerate (sfp .get_all_thermals ()):
619
+ if self .task_stopping_event .is_set ():
620
+ return
621
+
622
+ available_thermals .add ((thermal , sfp_name , thermal_index ))
623
+ self ._refresh_temperature_status (sfp_name , thermal , thermal_index )
624
+
625
+ for psu_index , psu in enumerate (module .get_all_psus ()):
626
+ psu_name = '{} PSU {}' .format (module_name , psu_index + 1 )
627
+ for thermal_index , thermal in enumerate (psu .get_all_thermals ()):
628
+ if self .task_stopping_event .is_set ():
629
+ return
630
+
631
+ available_thermals .add ((thermal , psu_name , thermal_index ))
632
+ self ._refresh_temperature_status (psu_name , thermal , thermal_index )
633
+
634
+
635
+ thermals_to_remove = self .module_thermals - available_thermals
636
+ self .module_thermals = available_thermals
637
+ for thermal , parent_name , thermal_index in thermals_to_remove :
638
+ self ._remove_thermal_from_db (thermal , parent_name , thermal_index )
639
+
604
640
self .log_debug ("End temperature updating" )
605
641
606
642
def _refresh_temperature_status(self, parent_name, thermal, thermal_index):
    """
    Get temperature status by platform API and write to database

    Any exception raised while collecting or publishing the data is logged
    as a warning and is not propagated to the caller.

    :param parent_name: Name of the parent device of the thermal object
    :param thermal: Thermal object to query (get_name, get_temperature, thresholds, ...)
    :param thermal_index: Index of the thermal object in platform chassis
    :return:
    """
    # Bind the fallback name before entering the try block so the warning
    # emitted by the except handler can always reference `name`.
    name = '{} Thermal {}'.format(parent_name, thermal_index + 1)
    try:
        name = try_get(thermal.get_name, name)

        # Entity info is only saved for thermals that belong to the chassis or a PSU.
        # SFP thermals do not need it because snmp can deduce the relation from
        # TRANSCEIVER_DOM_SENSOR; also, since logical ports are saved in the
        # TRANSCEIVER_INFO table, an SFP thermal on a split cable might have
        # multiple parent logical ports.
        if 'SFP' not in parent_name:
            update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

        if name not in self.temperature_status_dict:
            self.temperature_status_dict[name] = TemperatureStatus()

        temperature_status = self.temperature_status_dict[name]

        high_threshold = NOT_AVAILABLE
        low_threshold = NOT_AVAILABLE
        high_critical_threshold = NOT_AVAILABLE
        low_critical_threshold = NOT_AVAILABLE
        maximum_temperature = NOT_AVAILABLE
        minimum_temperature = NOT_AVAILABLE
        temperature = try_get(thermal.get_temperature)
        is_replaceable = try_get(thermal.is_replaceable, False)

        # Thresholds and recorded extremes are only queried when a current
        # reading exists; otherwise they are published as NOT_AVAILABLE.
        has_reading = temperature != NOT_AVAILABLE
        if has_reading:
            temperature_status.set_temperature(name, temperature)
            minimum_temperature = try_get(thermal.get_minimum_recorded)
            maximum_temperature = try_get(thermal.get_maximum_recorded)
            high_threshold = try_get(thermal.get_high_threshold)
            low_threshold = try_get(thermal.get_low_threshold)
            high_critical_threshold = try_get(thermal.get_high_critical_threshold)
            low_critical_threshold = try_get(thermal.get_low_critical_threshold)

        warning = False
        if has_reading and temperature_status.set_over_temperature(temperature, high_threshold):
            self._log_on_status_changed(
                not temperature_status.over_temperature,
                'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
                format(name, temperature, high_threshold),
                'High temperature warning: {} current temperature {}C, high threshold {}C'.
                format(name, temperature, high_threshold)
            )
        warning = warning | temperature_status.over_temperature

        if has_reading and temperature_status.set_under_temperature(temperature, low_threshold):
            self._log_on_status_changed(
                not temperature_status.under_temperature,
                'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
                format(name, temperature, low_threshold),
                'Low temperature warning: {} current temperature {}C, low threshold {}C'.
                format(name, temperature, low_threshold)
            )
        warning = warning | temperature_status.under_temperature

        fvs = swsscommon.FieldValuePairs(
            [('temperature', str(temperature)),
             ('minimum_temperature', str(minimum_temperature)),
             ('maximum_temperature', str(maximum_temperature)),
             ('high_threshold', str(high_threshold)),
             ('low_threshold', str(low_threshold)),
             ('warning_status', str(warning)),
             ('critical_high_threshold', str(high_critical_threshold)),
             ('critical_low_threshold', str(low_critical_threshold)),
             ('is_replaceable', str(is_replaceable)),
             ('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
             ])

        self.table.set(name, fvs)
        # Mirror the entry into the chassis table only when one was opened.
        if self.is_chassis_system and self.chassis_table is not None:
            self.chassis_table.set(name, fvs)
    except Exception as e:
        self.log_warning('Failed to update thermal status for {} - {}'.format(name, repr(e)))
720
def _remove_thermal_from_db(self, thermal, parent_name, thermal_index):
    """
    Delete the database entry of a thermal object

    The entry is removed from the thermal table and, when a chassis table
    is available, from the chassis table as well.

    :param parent_name: Name of the parent device of the thermal object
    :param thermal: Thermal object whose entry should be deleted
    :param thermal_index: Index of the thermal object in its parent
    :return:
    """
    # The key is resolved the same way as when the entry is written:
    # the thermal's own name, or a positional fallback name.
    fallback_key = '{} Thermal {}'.format(parent_name, thermal_index + 1)
    db_key = try_get(thermal.get_name, fallback_key)

    self.table._del(db_key)

    chassis_table = self.chassis_table
    if chassis_table is None:
        return
    chassis_table._del(db_key)
680
726
681
727
682
728
class ThermalMonitor (ProcessTaskBase ):
0 commit comments