From 80c212cd1584290132bf5657e42375c012ed2889 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Karri Date: Fri, 4 Jun 2021 17:06:59 +0000 Subject: [PATCH 01/71] Crash Capture Tech Support Enhancement Signed-off-by: Vivek Reddy Karri --- doc/techsupport/crash_capture.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 doc/techsupport/crash_capture.md diff --git a/doc/techsupport/crash_capture.md b/doc/techsupport/crash_capture.md new file mode 100644 index 00000000000..e69de29bb2d From 6bde847cf889eeba7a479c21ffb9ca360af5486d Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 4 Jun 2021 17:07:22 -0400 Subject: [PATCH 02/71] Update crash_capture.md --- doc/techsupport/crash_capture.md | 35 ++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/doc/techsupport/crash_capture.md b/doc/techsupport/crash_capture.md index e69de29bb2d..9efe41b4ba6 100644 --- a/doc/techsupport/crash_capture.md +++ b/doc/techsupport/crash_capture.md @@ -0,0 +1,35 @@ +# Crash Capture Support # +#### Rev 1.0 + + +### Revision +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:-------------------------|:----------------------| +| 1.0 | 06/15/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | + + +## About this Manual +This document describes the details of the system in place which facilitates the crash capture support in SONiC when the NOS generates a core dump. + +## 1. Overview +Currently, techsupport invocation is done by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. + +However if the techsupport invocation can be made event-driven based on core dump generation, that would definitely improve the debuggability. That is the overall All the high-level enhancements are summarized in the next section + +## 2. 
High Level Requirements +* Techsupport invocation should also be made event-driven based on core dump generation +* This capability should be made optional and is disabled by default +* Users should have the abiliity to turn this capability on and off. + +## 3. Core Dump Generation in SONiC +In SONiC, the core dumps generated from any process crash across the dockers and the base host are directed to the location `/var/core` and will have the name `/var/core/*.core.gz`. +The naming format and compression is governed by the script `/usr/local/bin/coredump-compress`. + +## 4. Design + +### 4.1 Script to invoke the techsupport CLI +A new python script `/usr/local/bin/crash-capture` will be added for this purpose and when invoked, it checks if a core-dump file has been generated within the last 20 sec and if yes, will invoke the techsupport dump. +Additionally, the CLI invocation will also have a `since` flag indicating the last time the tech support was run, if any. + +### 4.1 Event-trigger for Core-dump generation +To Monitor and respond for the file-change events in `/var/core/`, systemd path unit will be used. This unit will start a corresponding systemd unit, which inturn invokes the crash-capture python script From 854bc22c8b9dff89040d69a0ccde9bd4f01373ca Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 4 Jun 2021 19:16:18 -0400 Subject: [PATCH 03/71] Update crash_capture.md --- doc/techsupport/crash_capture.md | 101 +++++++++++++++++++++++++++++-- 1 file changed, 97 insertions(+), 4 deletions(-) diff --git a/doc/techsupport/crash_capture.md b/doc/techsupport/crash_capture.md index 9efe41b4ba6..1fac41599a5 100644 --- a/doc/techsupport/crash_capture.md +++ b/doc/techsupport/crash_capture.md @@ -1,6 +1,18 @@ # Crash Capture Support # #### Rev 1.0 +## Table of Contents + * [Revision](#revision) + * [About this Manual](#about-this-manual) + * [1. Overview](#1-overview) + * [2. High Level Requirements](#2-high-level-requirements) + * [3. 
Core-Dump-Generation-in-SONiC](#3-core-dump-generation-in-sonic) + * [4. Design](#4-design) + * [4.1 Script to invoke the techsupport CLI](#41-Script-to-invoke-the-techsupport-CLI) + * [4.2 Event trigger for Core-dump generation](#42-Event-trigger-for-Core-dump-generation) + * [4.3 Crash Capture ecosystem should be configurable](#43-crash-capture-ecosystem-should-be-configurable) + * [4.4 CLI Enhancements](#44-CLI-Enhancements) + ### Revision | Rev | Date | Author | Change Description | @@ -12,14 +24,14 @@ This document describes the details of the system in place which facilitates the crash capture support in SONiC when the NOS generates a core dump. ## 1. Overview -Currently, techsupport invocation is done by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. +Currently, techsupport is run by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. -However if the techsupport invocation can be made event-driven based on core dump generation, that would definitely improve the debuggability. That is the overall All the high-level enhancements are summarized in the next section +However if the techsupport invocation can be made event-driven based on core dump generation, that would definitely improve the debuggability. That is the overall idea behind this HLD. All the high-level requirements are summarized in the next section ## 2. High Level Requirements * Techsupport invocation should also be made event-driven based on core dump generation * This capability should be made optional and is disabled by default -* Users should have the abiliity to turn this capability on and off. +* Users should have the abiliity to configre the capability. ## 3. 
Core Dump Generation in SONiC In SONiC, the core dumps generated from any process crash across the dockers and the base host are directed to the location `/var/core` and will have the name `/var/core/*.core.gz`. @@ -31,5 +43,86 @@ The naming format and compression is governed by the script `/usr/local/bin/core A new python script `/usr/local/bin/crash-capture` will be added for this purpose and when invoked, it checks if a core-dump file has been generated within the last 20 sec and if yes, will invoke the techsupport dump. Additionally, the CLI invocation will also have a `since` flag indicating the last time the tech support was run, if any. -### 4.1 Event-trigger for Core-dump generation +### 4.2 Event-trigger for Core-dump generation To Monitor and respond for the file-change events in `/var/core/`, systemd path unit will be used. This unit will start a corresponding systemd unit, which inturn invokes the crash-capture python script + +#### crash-capture.path +``` +[Unit] +Description=Triggers the Unit when a core is dumped +After=database.service, crash-capture-configure.service +Requires=database.service + +[Path] +PathExists=/var/core/ +Unit=crash-capture.service + +[Install] +WantedBy=multi-user.target +``` + +#### crash-capture.service +``` +[Unit] +Description=Executes the crash-capture script when triggered +After=database.service, crash-capture-configure.service +Requires=database.service + +[Service] +Type=simple +ExecStart=/usr/local/bin/crash-capture + +[Install] +WantedBy=multi-user.target +``` + +Note: Both of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. + +### 4.3 crash-capture ecosystem should be configurable. 
+ +Turning on this feature would just mean unmasking or enabling both of crash-capture.{path, service} and starting crash-capture.path +Similarly, turning this off would be masking or disabling these two. + +A new schema is added to Cfg DB which is defined below. + +#### Schema additions to Config DB +``` +key = "CRASH_CAPTURE|global" +state = enabled; +``` + +To monitor the config changes pushed by the user, a crash-capture-daemon will be started. +This'll be started using crash-capture-configure.service. + +#### crash-capture-configure.service +``` +[Unit] +Description=Starts the daemon which monitors the crash-capture config changes +After=database.service +Requires=database.service + +[Service] +Type=simple +Restart=always +ExecStart=/usr/local/bin/crash-capture-daemon + +[Install] +WantedBy=multi-user.target +``` + +This service starts the crash-capture-daemon. The crash-capture-daemon script subscribes to Config DB and listens for changes made to `CRASH_CAPTURE|global` key. +It then enables or disables the service accordingly + +### 4.4 CLI Enhancements. 
+ +### config cli + +`config crash-capture ` + +### show cli + +`show crash-capture status` + +Both of them would work as you've expected + + From 833c1343cf2382a48858edd3eb42aa3f616bf3f9 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 4 Jun 2021 19:20:19 -0400 Subject: [PATCH 04/71] Update crash_capture.md --- doc/techsupport/crash_capture.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/techsupport/crash_capture.md b/doc/techsupport/crash_capture.md index 1fac41599a5..07cc0819582 100644 --- a/doc/techsupport/crash_capture.md +++ b/doc/techsupport/crash_capture.md @@ -17,7 +17,7 @@ ### Revision | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| -| 1.0 | 06/15/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | +| 1.0 | 06/04/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | ## About this Manual @@ -31,7 +31,7 @@ However if the techsupport invocation can be made event-driven based on core dum ## 2. High Level Requirements * Techsupport invocation should also be made event-driven based on core dump generation * This capability should be made optional and is disabled by default -* Users should have the abiliity to configre the capability. +* Users should have the abiliity to configure this capability. ## 3. Core Dump Generation in SONiC In SONiC, the core dumps generated from any process crash across the dockers and the base host are directed to the location `/var/core` and will have the name `/var/core/*.core.gz`. 
@@ -51,7 +51,7 @@ To Monitor and respond for the file-change events in `/var/core/`, systemd path [Unit] Description=Triggers the Unit when a core is dumped After=database.service, crash-capture-configure.service -Requires=database.service +Requires=database.service, crash-capture-configure.service [Path] PathExists=/var/core/ @@ -66,7 +66,7 @@ WantedBy=multi-user.target [Unit] Description=Executes the crash-capture script when triggered After=database.service, crash-capture-configure.service -Requires=database.service +Requires=database.service, crash-capture-configure.service [Service] Type=simple @@ -76,7 +76,7 @@ ExecStart=/usr/local/bin/crash-capture WantedBy=multi-user.target ``` -Note: Both of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. +Note: Both of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. The other dependency is `crash-capture-configure.service` which will be explained in the next section. ### 4.3 crash-capture ecosystem should be configurable. 
@@ -123,6 +123,5 @@ It then enables or disables the service accordingly `show crash-capture status` -Both of them would work as you've expected From 6a3c9531f3f72ce485e69297ec0ba19253de6753 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 4 Jun 2021 19:20:44 -0400 Subject: [PATCH 05/71] Update crash_capture.md --- doc/techsupport/crash_capture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/crash_capture.md b/doc/techsupport/crash_capture.md index 07cc0819582..fc34399750e 100644 --- a/doc/techsupport/crash_capture.md +++ b/doc/techsupport/crash_capture.md @@ -88,7 +88,7 @@ A new schema is added to Cfg DB which is defined below. #### Schema additions to Config DB ``` key = "CRASH_CAPTURE|global" -state = enabled; +state = enabled|disabled; ``` To monitor the config changes pushed by the user, a crash-capture-daemon will be started. From 9755a92ca98cd0b68073531ad61ff774825d4f8d Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Sun, 6 Jun 2021 12:05:54 -0400 Subject: [PATCH 06/71] Update crash_capture.md --- doc/techsupport/crash_capture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/crash_capture.md b/doc/techsupport/crash_capture.md index fc34399750e..95b975efec4 100644 --- a/doc/techsupport/crash_capture.md +++ b/doc/techsupport/crash_capture.md @@ -54,7 +54,7 @@ After=database.service, crash-capture-configure.service Requires=database.service, crash-capture-configure.service [Path] -PathExists=/var/core/ +PathChanged=/var/core/ Unit=crash-capture.service [Install] From 2b86bd0226f0d9ad7f2a78fc6520468facd782b6 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Karri Date: Mon, 14 Jun 2021 21:31:05 +0000 Subject: [PATCH 07/71] Renamed the Techsupport doc Signed-off-by: Vivek Reddy Karri --- doc/techsupport/{crash_capture.md => auto_techsupport_gen.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/techsupport/{crash_capture.md => auto_techsupport_gen.md} (100%) diff --git 
a/doc/techsupport/crash_capture.md b/doc/techsupport/auto_techsupport_gen.md similarity index 100% rename from doc/techsupport/crash_capture.md rename to doc/techsupport/auto_techsupport_gen.md From 2c4679bcb2058830d0347d1aaed4b7d1ee0beb8e Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Mon, 14 Jun 2021 20:39:03 -0400 Subject: [PATCH 08/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 121 ++++++++++++++---------- 1 file changed, 71 insertions(+), 50 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 95b975efec4..2bdd3f1682b 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -1,4 +1,4 @@ -# Crash Capture Support # +# Auto Techsupport Enhancement # #### Rev 1.0 ## Table of Contents @@ -6,22 +6,23 @@ * [About this Manual](#about-this-manual) * [1. Overview](#1-overview) * [2. High Level Requirements](#2-high-level-requirements) - * [3. Core-Dump-Generation-in-SONiC](#3-core-dump-generation-in-sonic) - * [4. Design](#4-design) - * [4.1 Script to invoke the techsupport CLI](#41-Script-to-invoke-the-techsupport-CLI) - * [4.2 Event trigger for Core-dump generation](#42-Event-trigger-for-Core-dump-generation) - * [4.3 Crash Capture ecosystem should be configurable](#43-crash-capture-ecosystem-should-be-configurable) - * [4.4 CLI Enhancements](#44-CLI-Enhancements) + * [3. Core Dump Generation in SONiC](#3-core-dump-generation-in-sonic) + * [4. Schema Additions](#4-schema-additions) + * [5. CLI Enhancements](#5-cli-enhancements) + * [6. 
Design](#6-design) + * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) + * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) + * [6.3 Adding these services to SONiC](#63-Adding-these-services-to-sonic) ### Revision | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| -| 1.0 | 06/04/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | +| 1.0 | 06/14/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | ## About this Manual -This document describes the details of the system in place which facilitates the crash capture support in SONiC when the NOS generates a core dump. +This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC when the NOS throws a core dump. ## 1. Overview Currently, techsupport is run by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. @@ -34,94 +35,114 @@ However if the techsupport invocation can be made event-driven based on core dum * Users should have the abiliity to configure this capability. ## 3. Core Dump Generation in SONiC -In SONiC, the core dumps generated from any process crash across the dockers and the base host are directed to the location `/var/core` and will have the name `/var/core/*.core.gz`. +In SONiC, the core dumps generated from any process crashes across the dockers and the base host are directed to the location `/var/core` and will have the naming format `/var/core/*.core.gz`. The naming format and compression is governed by the script `/usr/local/bin/coredump-compress`. -## 4. Design +## 4. 
Schema Additions -### 4.1 Script to invoke the techsupport CLI -A new python script `/usr/local/bin/crash-capture` will be added for this purpose and when invoked, it checks if a core-dump file has been generated within the last 20 sec and if yes, will invoke the techsupport dump. -Additionally, the CLI invocation will also have a `since` flag indicating the last time the tech support was run, if any. +#### Config DB +``` +key = "AUTO_TECHSUPPORT|global" +state = enabled|disabled; +cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. + # Should be greater than 120 seconds as a techsupport run would take around that time. +``` + +#### State DB +``` +key = "AUTO_TECHSUPPORT|global" +last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run +enabled = yes|no; +``` + +## 5. CLI Enhancements. + +### config cli -### 4.2 Event-trigger for Core-dump generation -To Monitor and respond for the file-change events in `/var/core/`, systemd path unit will be used. This unit will start a corresponding systemd unit, which inturn invokes the crash-capture python script +`config auto-techsupport state ` -#### crash-capture.path +`config auto-techsupport cooloff ` + +### show cli + +`show auto-techsupport` + +## 6. Design + +### 6.1 Event-trigger for Core-dump generation +To Monitor and respond for the file-change events in `/var/core/`, a systemd path unit ([systemd path unit](https://www.freedesktop.org/software/systemd/man/systemd.path.html)) will be used. This unit will start a corresponding systemd service, which inturn invokes the python script and it handles the heavylifting of invoking techsupport and other config tasks. 
+ +#### coredump-monit.path ``` [Unit] -Description=Triggers the Unit when a core is dumped -After=database.service, crash-capture-configure.service -Requires=database.service, crash-capture-configure.service +Description=Triggers the coredump-monit services accordingly when a coredump is found. +After=database.service +Requires=database.service [Path] PathChanged=/var/core/ -Unit=crash-capture.service +Unit=coredump-monit.service [Install] WantedBy=multi-user.target ``` -#### crash-capture.service +#### coredump-monit.service ``` [Unit] -Description=Executes the crash-capture script when triggered -After=database.service, crash-capture-configure.service -Requires=database.service, crash-capture-configure.service +Description=Invokes the auto_techsupport_gen script when triggered by the coredump-monit.path +After=database.service +Requires=database.service [Service] Type=simple -ExecStart=/usr/local/bin/crash-capture +ExecStart=/usr/local/bin/auto_techsupport_gen core [Install] WantedBy=multi-user.target ``` -Note: Both of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. The other dependency is `crash-capture-configure.service` which will be explained in the next section. +### 6.2 Monitor Techsupport creation +As seen in the schema, the script will be using the last_techsupport_run field in the State DB to determine whether to run techsupport based on the cooloff period configured by the user. To have the last_techsupport_run upto date, techsupport-monit.{path, service} is used. -### 4.3 crash-capture ecosystem should be configurable. -Turning on this feature would just mean unmasking or enabling both of crash-capture.{path, service} and starting crash-capture.path -Similarly, turning this off would be masking or disabling these two. 
+#### techsupport-monit.path +``` +[Unit] +Description=Triggers the auto_techsupport_gen services when a techsupport dump is found. +After=database.service +Requires=database.service -A new schema is added to Cfg DB which is defined below. +[Path] +PathChanged=/var/dump/ +Unit=techsupport-monit.service -#### Schema additions to Config DB -``` -key = "CRASH_CAPTURE|global" -state = enabled|disabled; +[Install] +WantedBy=multi-user.target ``` -To monitor the config changes pushed by the user, a crash-capture-daemon will be started. -This'll be started using crash-capture-configure.service. - -#### crash-capture-configure.service +#### techsupport-monit.service ``` [Unit] -Description=Starts the daemon which monitors the crash-capture config changes +Description=Invokes the auto_techsupport_gen script when triggered by the techsupport-monit.path After=database.service Requires=database.service [Service] Type=simple -Restart=always -ExecStart=/usr/local/bin/crash-capture-daemon +ExecStart=/usr/local/bin/auto_techsupport_gen techsupport [Install] WantedBy=multi-user.target ``` -This service starts the crash-capture-daemon. The crash-capture-daemon script subscribes to Config DB and listens for changes made to `CRASH_CAPTURE|global` key. -It then enables or disables the service accordingly +Note: All of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. -### 4.4 CLI Enhancements. +### 6.3 Adding these services to SONiC -### config cli - -`config crash-capture ` +These will be added to sonic-host-services directory under sonic-buildimage/src directory. 
-### show cli -`show crash-capture status` From 1b232e5a5594389d4164d6237a10307470bddb10 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Mon, 14 Jun 2021 20:57:22 -0400 Subject: [PATCH 09/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 2bdd3f1682b..d9b10da0867 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -70,7 +70,7 @@ enabled = yes|no; ## 6. Design ### 6.1 Event-trigger for Core-dump generation -To Monitor and respond for the file-change events in `/var/core/`, a systemd path unit ([systemd path unit](https://www.freedesktop.org/software/systemd/man/systemd.path.html)) will be used. This unit will start a corresponding systemd service, which inturn invokes the python script and it handles the heavylifting of invoking techsupport and other config tasks. +To Monitor and respond for the file-change events in `/var/core/`, a systemd path unit ([systemd path unit](https://www.freedesktop.org/software/systemd/man/systemd.path.html)) will be used. This unit will start a corresponding systemd service, which inturn invokes the python script `/usr/local/bin/auto_techsupport_gen` and it handles the heavylifting of invoking techsupport and other config tasks. #### coredump-monit.path ``` @@ -103,7 +103,7 @@ WantedBy=multi-user.target ``` ### 6.2 Monitor Techsupport creation -As seen in the schema, the script will be using the last_techsupport_run field in the State DB to determine whether to run techsupport based on the cooloff period configured by the user. To have the last_techsupport_run upto date, techsupport-monit.{path, service} is used. +The script will use the last_techsupport_run field in the State DB to determine whether to run techsupport based on the cooloff period configured by the user. 
To have the last_techsupport_run up to date, techsupport-monit.{path, service} units are used. #### techsupport-monit.path @@ -140,7 +140,7 @@ Note: All of these will have strict ordering dependency on database.service and ### 6.3 Adding these services to SONiC -These will be added to sonic-host-services directory under sonic-buildimage/src directory. +These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb`. From b948f227cd2658b0eda1ae41df89970334099c86 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 15 Jun 2021 16:28:25 -0400 Subject: [PATCH 10/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index d9b10da0867..460b94bffb6 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -22,7 +22,7 @@ ## About this Manual -This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC when the NOS throws a core dump. +This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC. The auto invocation is triggered when any process across the dockers or the host crashes and a core dump is generated. ## 1. Overview Currently, techsupport is run by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. @@ -45,7 +45,6 @@ The naming format and compression is governed by the script `/usr/local/bin/core key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. - # Should be greater than 120 seconds as a techsupport run would take around that time. 
``` #### State DB @@ -53,6 +52,8 @@ cooloff = 300; # Minimum Time in seconds, between two successive techsupport key = "AUTO_TECHSUPPORT|global" last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run enabled = yes|no; +core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /var/core/ folder as a list + Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." ``` ## 5. CLI Enhancements. @@ -65,12 +66,19 @@ enabled = yes|no; ### show cli -`show auto-techsupport` +``` +admin@sonic:~$ show auto-techsupport ++----------------+----------------+-----------------------------------+ +| Enabled | Cooloff (sec) | Last TechSupport Run | ++================+================+===================================+ +| Yes | 300 | Tue 15 Jun 2021 08:09:59 PM UTC | ++----------------+----------------+-----------------------------------+ +``` ## 6. Design ### 6.1 Event-trigger for Core-dump generation -To Monitor and respond for the file-change events in `/var/core/`, a systemd path unit ([systemd path unit](https://www.freedesktop.org/software/systemd/man/systemd.path.html)) will be used. This unit will start a corresponding systemd service, which inturn invokes the python script `/usr/local/bin/auto_techsupport_gen` and it handles the heavylifting of invoking techsupport and other config tasks. +To monitor and respond to the file-change events in `/var/core/`, a systemd path unit ([systemd path unit](https://www.freedesktop.org/software/systemd/man/systemd.path.html)) will be used. This unit will start a corresponding systemd service, which in turn invokes the python script `/usr/local/bin/auto_techsupport_gen` and it handles the heavy lifting of invoking techsupport and other tasks. 
More on the script in section 6.3 #### coredump-monit.path ``` @@ -138,6 +146,13 @@ WantedBy=multi-user.target Note: All of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. +### 6.3 auto_techsupport_gen Script + +As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script updates the `last_techsupport_run` field in the State DB. + +On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then checks for any diff between `core_file_list` field in the State DB and the file system. If any diff is found, it updates the State Db entry and moves forward. The script finally checks the `last_techsupport_run` field in the State DB and only when the cooloff period has passed, the script invokes the techsupport. + + ### 6.3 Adding these services to SONiC These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb`. From 4b4b7a8e2860bddf29f5ab476b95a4eeb67696cd Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 15 Jun 2021 16:33:59 -0400 Subject: [PATCH 11/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 460b94bffb6..575ad2344a4 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -12,7 +12,8 @@ * [6. 
Design](#6-design) * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) - * [6.3 Adding these services to SONiC](#63-Adding-these-services-to-sonic) + * [6.3 auto_techsupport_gen script](#63-auto-techsupport-gen-script) + * [6.4 Adding these services to SONiC](#64-Adding-these-services-to-sonic) ### Revision @@ -146,14 +147,15 @@ WantedBy=multi-user.target Note: All of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. -### 6.3 auto_techsupport_gen Script +### 6.3 auto_techsupport_gen script As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script updates the `last_techsupport_run` field in the State DB. On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then checks for any diff between `core_file_list` field in the State DB and the file system. If any diff is found, it updates the State Db entry and moves forward. The script finally checks the `last_techsupport_run` field in the State DB and only when the cooloff period has passed, the script invokes the techsupport. +**Note: The last_techsupport_run value doesn't persist across reboots, since we're keeping track of monotonic time. The field will be empty after reboot (including warm-boot)** -### 6.3 Adding these services to SONiC +### 6.4 Adding these services to SONiC These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb`. 
From ea7f9262d479d1faffe145abe91dd0978dcc28e1 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 15 Jun 2021 16:34:19 -0400 Subject: [PATCH 12/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 575ad2344a4..a605252a83a 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -12,7 +12,7 @@ * [6. Design](#6-design) * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) - * [6.3 auto_techsupport_gen script](#63-auto-techsupport-gen-script) + * [6.3 auto_techsupport_gen script](#63-auto_techsupport_gen-script) * [6.4 Adding these services to SONiC](#64-Adding-these-services-to-sonic) From 47890a9eb861ae640905536e7b7ef95f31b8a289 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 15 Jun 2021 16:35:49 -0400 Subject: [PATCH 13/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index a605252a83a..4984290ba62 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -153,7 +153,7 @@ As seen in the techsupport-monit.service & coredump-monit.service Unit descripti On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then checks for any diff between `core_file_list` field in the State DB and the file system. If any diff is found, it updates the State Db entry and moves forward. The script finally checks the `last_techsupport_run` field in the State DB and only when the cooloff period has passed, the script invokes the techsupport. 
-**Note: The last_techsupport_run value doesn't persist across reboots, since we're keeping track of monotonic time. The field will be empty after reboot (including warm-boot)** +**Note: The last_techsupport_run value doesn't persist across reboots, since monotonic time is used. The field will be empty after reboot (including warm-boot)**. ### 6.4 Adding these services to SONiC From 498a52cc9502abc983ca331060921a30d0c4a98d Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 15 Jun 2021 16:37:32 -0400 Subject: [PATCH 14/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 4984290ba62..b9dd4c1c2e1 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -53,7 +53,7 @@ cooloff = 300; # Minimum Time in seconds, between two successive techsupport key = "AUTO_TECHSUPPORT|global" last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run enabled = yes|no; -core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /var/core/ folder as a list +core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /var/core/ folder Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." 
``` From 42f9b2dfa53e825dd3cd2601a36556ef4a93b00b Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 10:27:12 -0400 Subject: [PATCH 15/71] Updated Config DB Schema --- doc/techsupport/auto_techsupport_gen.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index b9dd4c1c2e1..fee2757a545 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -46,6 +46,8 @@ The naming format and compression is governed by the script `/usr/local/bin/core key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. +max_dumps = 3; # Maximum number of Techsupport dumps, which can be present on the switch. + If a new request to create a techsupport comes in, the oldest one will be deleted. ``` #### State DB @@ -65,6 +67,8 @@ core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /v `config auto-techsupport cooloff ` +`config auto-techsupport max_dumps ` + ### show cli ``` From 79208a93842fd24ded56d9fde0321d5ef408137a Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 10:28:22 -0400 Subject: [PATCH 16/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index fee2757a545..9290e0a47f6 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -54,7 +54,6 @@ max_dumps = 3; # Maximum number of Techsupport dumps, which can be present on ``` key = "AUTO_TECHSUPPORT|global" last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run -enabled = yes|no; core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /var/core/ folder Eg: 
"python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." ``` From 5ac6d658ff5da0788aa951b62e46aa396c12edda Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 13:55:39 -0400 Subject: [PATCH 17/71] Updated Schema, Yang Model and CLI --- doc/techsupport/auto_techsupport_gen.md | 96 ++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 9 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 9290e0a47f6..1af430157a7 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -45,9 +45,11 @@ The naming format and compression is governed by the script `/usr/local/bin/core ``` key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; -cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. -max_dumps = 3; # Maximum number of Techsupport dumps, which can be present on the switch. - If a new request to create a techsupport comes in, the oldest one will be deleted. +cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. +max_ts_dumps = 3; # Maximum number of Techsupport dumps, which can be present on the switch. + The oldest one will be deleted, when the the limit has already crossed this. +max_core_dump_size = 100; # Maximum Size to which /var/core directory can go till in MB; + When the limit is crossed, the older core files are deleted. ``` #### State DB @@ -58,6 +60,82 @@ core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /v Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." 
``` +### 4.1 YANG Model + +``` +module sonic-auto_techsupport { + + yang-version 1.1; + + namespace "http://github.com/Azure/sonic-auto_techsupport"; + prefix auto_techsupport; + + description "Auto Techsupport Capability in SONiC OS"; + + revision 2021-06-17 { + description "First Revision"; + } + + container sonic-auto_techsupport { + + container AUTO_TECHSUPPORT { + + description "AUTO_TECHSUPPORT part of config_db.json"; + + leaf status { + description "AUTO_TECHSUPPORT status"; + type enumeration { + enum disable; + enum enable; + } + default disable; + } + + leaf cooloff { + description "Minimum Time in seconds, between two successive techsupport invocations by the script."; + type uint16 { + range "0..3600" { + error-message "Should be between 0 to 3600 seconds"; + error-app-tag cooloff-invalid; + } + } + default "300"; + } + + leaf max_ts_dumps { + description "Maximum number of Techsupport dumps, which can be present on the switch. + The oldest one will be deleted, when the the limit has already crossed this. "; + type uint8 { + range "1..10" { + error-message "Should be between 1 to 10"; + error-app-tag max_ts_dumps-invalid; + } + } + default "3"; + } + + leaf max_core_dump_size { + description "Maximum Size to which /var/core directory can go till in MB; + When the limit is crossed, the older core files are deleted."; + type uint16 { + range "10..500" { + error-message "Should be between 10 to 500 MB"; + error-app-tag max_core_dump_size-invalid; + } + } + default "200"; + } + } + /* end of container AUTO_TECHSUPPORT */ + } + /* end of top level container */ +} + + +``` + + + ## 5. CLI Enhancements. 
### config cli @@ -66,17 +144,17 @@ core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /v `config auto-techsupport cooloff ` -`config auto-techsupport max_dumps ` +`config auto-techsupport max_ts_dumps ` + +`config auto-techsupport max_core_dump_size ` ### show cli ``` admin@sonic:~$ show auto-techsupport -+----------------+----------------+-----------------------------------+ -| Enabled | Cooloff (sec) | Last TechSupport Run | -+================+================+===================================+ -| Yes | 300 | Tue 15 Jun 2021 08:09:59 PM UTC | -+----------------+----------------+-----------------------------------+ +STATUS COOLOFF MAX_TS_DUMPS MAX_CORE_DUMP_SIZE LAST_TECHSUPPORT_RUN +------- ------- ------------ ------------------ ------------------------------- +Enabled 300 sec 3 200 MB Tue 15 Jun 2021 08:09:59 PM UTC ``` ## 6. Design From 3521be35f57af7cce4e9cd544525c65fa281aec2 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 14:14:27 -0400 Subject: [PATCH 18/71] Minor Fixes --- doc/techsupport/auto_techsupport_gen.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 1af430157a7..f784869935b 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -8,6 +8,7 @@ * [2. High Level Requirements](#2-high-level-requirements) * [3. Core Dump Generation in SONiC](#3-core-dump-generation-in-sonic) * [4. Schema Additions](#4-schema-additions) + * [4.1 YANG Model](#61-YANG-Model) * [5. CLI Enhancements](#5-cli-enhancements) * [6. 
Design](#6-design) * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) @@ -55,9 +56,9 @@ max_core_dump_size = 100; # Maximum Size to which /var/core directory can go t #### State DB ``` key = "AUTO_TECHSUPPORT|global" -last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run -core_file_list = "<*.core.gz>;<*.core.gz>"; List of the core files inside the /var/core/ folder - Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." +last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run +core_file_list = "<*.core.gz>;<*.core.gz>"; # List of the core files inside the /var/core/ folder + Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." ``` ### 4.1 YANG Model @@ -123,7 +124,7 @@ module sonic-auto_techsupport { error-app-tag max_core_dump_size-invalid; } } - default "200"; + default "100"; } } /* end of container AUTO_TECHSUPPORT */ @@ -230,11 +231,13 @@ Note: All of these will have strict ordering dependency on database.service and ### 6.3 auto_techsupport_gen script -As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script updates the `last_techsupport_run` field in the State DB. +As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script updates the `last_techsupport_run` field in the State DB. It then deletes any old Techsupport dumps, if the limit configured by the user has crossed. -On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then checks for any diff between `core_file_list` field in the State DB and the file system. 
If any diff is found, it updates the State Db entry and moves forward. The script finally checks the `last_techsupport_run` field in the State DB and only when the cooloff period has passed, the script invokes the techsupport. +On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then checks for any diff between `core_file_list` field in the State DB and the file system. If any diff is found, it updates the State Db entry and moves forward. The script finally checks the `last_techsupport_run` field in the State DB and only when the cooloff period has passed, the script invokes the techsupport. The script will also independently check if the Max Size configured by the user has already exceeded -**Note: The last_techsupport_run value doesn't persist across reboots, since monotonic time is used. The field will be empty after reboot (including warm-boot)**. +### 6.4 Warmboot Considerations + +The last_techsupport_run value is meaningless across reboots since monotonic time is used. This field will be empty after reboot type. Other Relavant Entries in the State DB will be added to the db_migrator and are persisted across warm-reboots. 
and uf yes, deletes the old core files ### 6.4 Adding these services to SONiC From 59a4739cef25ee5564d4fbdf1520d2ddb70a13a4 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 14:33:22 -0400 Subject: [PATCH 19/71] max_cdd-Dump changed to percentage based --- doc/techsupport/auto_techsupport_gen.md | 41 ++++++++++++------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index f784869935b..0b6db5fb940 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -48,9 +48,12 @@ key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. max_ts_dumps = 3; # Maximum number of Techsupport dumps, which can be present on the switch. - The oldest one will be deleted, when the the limit has already crossed this. -max_core_dump_size = 100; # Maximum Size to which /var/core directory can go till in MB; - When the limit is crossed, the older core files are deleted. + The oldest one will be deleted, when the the limit has already crossed this. +max_cdd_size = 1; # Maximum Size to which /var/core directory can go; + A perentage value should be specified. The actual value in bytes is calculate dbased on available disk size + When the limit is crossed, the older core files are deleted. + Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf + https://www.freedesktop.org/software/systemd/man/coredump.conf.html ``` #### State DB @@ -115,24 +118,23 @@ module sonic-auto_techsupport { default "3"; } - leaf max_core_dump_size { - description "Maximum Size to which /var/core directory can go till in MB; + leaf max_cdd_size { + description "Maximum Size to which /var/core directory can go; + A perentage value should be specified. 
The actual value in bytes is calculate based on the available disk size When the limit is crossed, the older core files are deleted."; - type uint16 { - range "10..500" { - error-message "Should be between 10 to 500 MB"; - error-app-tag max_core_dump_size-invalid; + type uint8 { + range "1..20" { + error-message "Should be between 1 to 20% of the total disk space"; + error-app-tag max_cdd_size_size-invalid; } } - default "100"; + default "1"; } } /* end of container AUTO_TECHSUPPORT */ } /* end of top level container */ } - - ``` @@ -142,20 +144,17 @@ module sonic-auto_techsupport { ### config cli `config auto-techsupport state ` - -`config auto-techsupport cooloff ` - -`config auto-techsupport max_ts_dumps ` - -`config auto-techsupport max_core_dump_size ` +`config auto-techsupport cooloff <0..3600>` +`config auto-techsupport max_ts_dumps <1..10>` +`config auto-techsupport max_cdd_size <1..20>` ### show cli ``` admin@sonic:~$ show auto-techsupport -STATUS COOLOFF MAX_TS_DUMPS MAX_CORE_DUMP_SIZE LAST_TECHSUPPORT_RUN -------- ------- ------------ ------------------ ------------------------------- -Enabled 300 sec 3 200 MB Tue 15 Jun 2021 08:09:59 PM UTC +STATUS COOLOFF MAX_TS_DUMPS MAX_CDD_SIZE LAST_TECHSUPPORT_RUN +------- ------- ------------ ------------------- ------------------------------- +Enabled 300 sec 3 200000 KB / 1% Tue 15 Jun 2021 08:09:59 PM UTC ``` ## 6. 
Design From 473087982dbd7828d7be9f464ee7e8dfb5cd6280 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 15:02:45 -0400 Subject: [PATCH 20/71] Updated the size-based cleanup --- doc/techsupport/auto_techsupport_gen.md | 55 ++++++++++++++++++------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 0b6db5fb940..db2056870b9 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -14,7 +14,9 @@ * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) * [6.3 auto_techsupport_gen script](#63-auto_techsupport_gen-script) - * [6.4 Adding these services to SONiC](#64-Adding-these-services-to-sonic) + * [6.4 Warmboot Considerations](#64-Warmboot-Considerations) + * [6.5 Adding these services to SONiC](#66-Adding-these-services-to-sonic) + * [6.6 Design choices for max_cdd_size argument ](#66-Design-choices-for-max_cdd_size-argument ) ### Revision @@ -49,11 +51,9 @@ state = enabled|disabled; cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. max_ts_dumps = 3; # Maximum number of Techsupport dumps, which can be present on the switch. The oldest one will be deleted, when the the limit has already crossed this. -max_cdd_size = 1; # Maximum Size to which /var/core directory can go; - A perentage value should be specified. The actual value in bytes is calculate dbased on available disk size - When the limit is crossed, the older core files are deleted. - Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf - https://www.freedesktop.org/software/systemd/man/coredump.conf.html +max_cdd_size = 2; # Maximum Size to which /var/core directory can go. A perentage value should be specified. 
+ The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core + When the limit is crossed, the older core files are incrementally deleted ``` #### State DB @@ -120,15 +120,16 @@ module sonic-auto_techsupport { leaf max_cdd_size { description "Maximum Size to which /var/core directory can go; - A perentage value should be specified. The actual value in bytes is calculate based on the available disk size + A perentage value should be specified. + The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core When the limit is crossed, the older core files are deleted."; type uint8 { range "1..20" { - error-message "Should be between 1 to 20% of the total disk space"; + error-message "Can be between 1 to 20%"; error-app-tag max_cdd_size_size-invalid; } } - default "1"; + default "2"; } } /* end of container AUTO_TECHSUPPORT */ @@ -142,11 +143,12 @@ module sonic-auto_techsupport { ## 5. CLI Enhancements. ### config cli - -`config auto-techsupport state ` -`config auto-techsupport cooloff <0..3600>` -`config auto-techsupport max_ts_dumps <1..10>` -`config auto-techsupport max_cdd_size <1..20>` +``` +config auto-techsupport state +config auto-techsupport cooloff <0..3600> +config auto-techsupport max_ts_dumps <1..10> +config auto-techsupport max_cdd_size <1..20> +``` ### show cli @@ -154,7 +156,7 @@ module sonic-auto_techsupport { admin@sonic:~$ show auto-techsupport STATUS COOLOFF MAX_TS_DUMPS MAX_CDD_SIZE LAST_TECHSUPPORT_RUN ------- ------- ------------ ------------------- ------------------------------- -Enabled 300 sec 3 200000 KB / 1% Tue 15 Jun 2021 08:09:59 PM UTC +Enabled 300 sec 3 200000 KB / 2% Tue 15 Jun 2021 08:09:59 PM UTC ``` ## 6. Design @@ -238,11 +240,32 @@ On the other hand, when invoked with `core` argument, the script first checks if The last_techsupport_run value is meaningless across reboots since monotonic time is used. This field will be empty after reboot type. 
Other Relavant Entries in the State DB will be added to the db_migrator and are persisted across warm-reboots. and uf yes, deletes the old core files -### 6.4 Adding these services to SONiC +### 6.5 Adding these services to SONiC These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb`. +### 6.6 Design choices for max_cdd_size argument + +Firstly, Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf https://www.freedesktop.org/software/systemd/man/coredump.conf.html + +``` +admin@sonic-nvda-spc:/var/core$ df . +Filesystem 1K-blocks Used Available Use% Mounted on +root-overlay 14928328 3106572 11040396 22% / + +admin@sonic-nvda-spc2:/var/core$ df . +Filesystem 1K-blocks Used Available Use% Mounted on +root-overlay 28589288 2922160 24191796 11% / + +admin@sonic-nvda-spc3:/var/core$ df . +Filesystem 1K-blocks Used Available Use% Mounted on +root-overlay 32896880 5460768 25742008 18% / +``` + +/var/core directory is hosted on root-overlay filesystem and i've seen this ranging from 10G to 25G. +Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistc default value of 2% is choosen. A typical 2% would amount to 200 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. +Although if the admin feels otherwise, this value is configurable upto 20% i.e almost 2G. From 848cb00b53987358d5c36f3353591793da713aba Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 15:13:35 -0400 Subject: [PATCH 21/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index db2056870b9..10aca14494b 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -8,7 +8,7 @@ * [2. 
High Level Requirements](#2-high-level-requirements) * [3. Core Dump Generation in SONiC](#3-core-dump-generation-in-sonic) * [4. Schema Additions](#4-schema-additions) - * [4.1 YANG Model](#61-YANG-Model) + * [4.1 YANG Model](#41-YANG-Model) * [5. CLI Enhancements](#5-cli-enhancements) * [6. Design](#6-design) * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) @@ -22,7 +22,7 @@ ### Revision | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| -| 1.0 | 06/14/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | +| 1.0 | 06/17/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | ## About this Manual @@ -49,7 +49,8 @@ The naming format and compression is governed by the script `/usr/local/bin/core key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. -max_ts_dumps = 3; # Maximum number of Techsupport dumps, which can be present on the switch. +max_ts_dumps = 3; # Maximum number of Techsupport dumps (Doesn't matter if it's manually or auto invoked), + which are allowed to be present on the device. The oldest one will be deleted, when the the limit has already crossed this. max_cdd_size = 2; # Maximum Size to which /var/core directory can go. A perentage value should be specified. 
The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core From d82e2c70646ff14c65322a8d6c7df7d41313cad6 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 17 Jun 2021 19:19:34 -0400 Subject: [PATCH 22/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 10aca14494b..fa63ec3714e 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -13,9 +13,9 @@ * [6. Design](#6-design) * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) - * [6.3 auto_techsupport_gen script](#63-auto_techsupport_gen-script) + * [6.3 auto-techsupport script](#63-auto-techsupport-script) * [6.4 Warmboot Considerations](#64-Warmboot-Considerations) - * [6.5 Adding these services to SONiC](#66-Adding-these-services-to-sonic) + * [6.5 Adding these services to SONiC](#65-Adding-these-services-to-sonic) * [6.6 Design choices for max_cdd_size argument ](#66-Design-choices-for-max_cdd_size-argument ) @@ -183,13 +183,13 @@ WantedBy=multi-user.target #### coredump-monit.service ``` [Unit] -Description=Invokes the auto_techsupport_gen script when triggered by the coredump-monit.path +Description=Invokes the auto-techsupport script when triggered by the coredump-monit.path After=database.service Requires=database.service [Service] Type=simple -ExecStart=/usr/local/bin/auto_techsupport_gen core +ExecStart=/usr/local/bin/auto-techsupport core [Install] WantedBy=multi-user.target @@ -202,7 +202,7 @@ The script will use the last_techsupport_run field in the State DB to determine #### techsupport-monit.path ``` [Unit] -Description=Triggers the auto_techsupport_gen services when a techsupport dump is found. 
+Description=Triggers the auto-techsupport services when a techsupport dump is found. After=database.service Requires=database.service @@ -217,13 +217,13 @@ WantedBy=multi-user.target #### techsupport-monit.service ``` [Unit] -Description=Invokes the auto_techsupport_gen script when triggered by the techsupport-monit.path +Description=Invokes the auto-techsupport script when triggered by the techsupport-monit.path After=database.service Requires=database.service [Service] Type=simple -ExecStart=/usr/local/bin/auto_techsupport_gen techsupport +ExecStart=/usr/local/bin/auto-techsupport techsupport [Install] WantedBy=multi-user.target @@ -231,7 +231,7 @@ WantedBy=multi-user.target Note: All of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. -### 6.3 auto_techsupport_gen script +### 6.3 auto-techsupport script As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script updates the `last_techsupport_run` field in the State DB. It then deletes any old Techsupport dumps, if the limit configured by the user has crossed. 
From 2a325bfd502f4b49fb4a61b01c98c81c92711835 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 18 Jun 2021 00:46:27 -0400 Subject: [PATCH 23/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 108 +++++++++++------------- 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index fa63ec3714e..2e4797e7462 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -48,13 +48,14 @@ The naming format and compression is governed by the script `/usr/local/bin/core ``` key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; -cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. -max_ts_dumps = 3; # Maximum number of Techsupport dumps (Doesn't matter if it's manually or auto invoked), - which are allowed to be present on the device. - The oldest one will be deleted, when the the limit has already crossed this. -max_cdd_size = 2; # Maximum Size to which /var/core directory can go. A perentage value should be specified. - The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core - When the limit is crossed, the older core files are incrementally deleted +cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by the script. +max-techsupports = 5; # Maximum number of Techsupport dumps (Doesn't matter if it's manually or auto invoked), + which are allowed to be present on the device. + The oldest one will be deleted once this limit has been exceeded. +core-usage = 5; # A percentage value should be specified. + This signifies the maximum size to which the /var/core directory can grow. 
+ The actual value in bytes is calculated based on the available space in the filesystem hosting /var/core. + When the limit is crossed, the older core files are incrementally deleted ``` #### State DB @@ -87,51 +88,44 @@ module sonic-auto_techsupport { description "AUTO_TECHSUPPORT part of config_db.json"; - leaf status { - description "AUTO_TECHSUPPORT status"; - type enumeration { - enum disable; - enum enable; + container global { + + leaf status { + description "AUTO_TECHSUPPORT status"; + type enumeration { + enum disable; + enum enable; + } + default disable; } - default disable; - } - - leaf cooloff { - description "Minimum Time in seconds, between two successive techsupport invocations by the script."; - type uint16 { - range "0..3600" { - error-message "Should be between 0 to 3600 seconds"; - error-app-tag cooloff-invalid; - } - } - default "300"; - } - - leaf max_ts_dumps { - description "Maximum number of Techsupport dumps, which can be present on the switch. - The oldest one will be deleted, when the the limit has already crossed this. "; - type uint8 { - range "1..10" { - error-message "Should be between 1 to 10"; - error-app-tag max_ts_dumps-invalid; - } + + leaf cooloff { + description "Minimum Time in seconds, between two successive techsupport invocations by the script."; + type uint16; + default "300"; } - default "3"; - } - - leaf max_cdd_size { - description "Maximum Size to which /var/core directory can go; - A perentage value should be specified. - The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core - When the limit is crossed, the older core files are deleted."; - type uint8 { - range "1..20" { - error-message "Can be between 1 to 20%"; - error-app-tag max_cdd_size_size-invalid; - } + + leaf max-techsupports { + description "Maximum number of Techsupport dumps, which can be present on the switch. + The oldest one will be deleted once this limit has been exceeded. 
"; + type uint8; + default "5"; } - default "2"; - } + + leaf core-usage { + description "A percentage value should be specified. + This signifies the maximum size to which the /var/core directory can grow. + The actual value in bytes is calculated based on the available space in the filesystem hosting /var/core. + When the limit is crossed, the older core files are deleted."; + type uint8 { + range "1..100" { + error-message "Can only be between 1 to 100"; + } + } + default "5"; + } + } + /* end of container global */ } /* end of container AUTO_TECHSUPPORT */ } @@ -146,18 +140,18 @@ module sonic-auto_techsupport { ### config cli ``` config auto-techsupport state -config auto-techsupport cooloff <0..3600> -config auto-techsupport max_ts_dumps <1..10> -config auto-techsupport max_cdd_size <1..20> +config auto-techsupport cooloff +config auto-techsupport max-techsupport +config auto-techsupport core-usage <1..100> ``` ### show cli ``` admin@sonic:~$ show auto-techsupport -STATUS COOLOFF MAX_TS_DUMPS MAX_CDD_SIZE LAST_TECHSUPPORT_RUN -------- ------- ------------ ------------------- ------------------------------- -Enabled 300 sec 3 200000 KB / 2% Tue 15 Jun 2021 08:09:59 PM UTC +STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_DIR_SIZE LAST_TECHSUPPORT_RUN +------- ------- --------------------- ---------------------- ------------------------------- +Enabled 300 sec 3 200000 KB / 2% Tue 15 Jun 2021 08:09:59 PM UTC ``` ## 6. Design @@ -264,9 +258,9 @@ root-overlay 32896880 5460768 25742008 18% / ``` /var/core directory is hosted on root-overlay filesystem and i've seen this ranging from 10G to 25G. -Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistc default value of 2% is choosen. A typical 2% would amount to 200 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. 
+Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistc default value of 2% is choosen. A typical 5% would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. -Although if the admin feels otherwise, this value is configurable upto 20% i.e almost 2G. +Although if the admin feels otherwise, this value is configurable. From 584c46589cf2712cb4d2c5bf4567aff965dad062 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 18 Jun 2021 00:50:40 -0400 Subject: [PATCH 24/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 2e4797e7462..76ee9b8049d 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -258,7 +258,7 @@ root-overlay 32896880 5460768 25742008 18% / ``` /var/core directory is hosted on root-overlay filesystem and i've seen this ranging from 10G to 25G. -Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistc default value of 2% is choosen. A typical 5% would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. +Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistic default value of 2% is chosen. A typical 5% would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. Although if the admin feels otherwise, this value is configurable. 
From e667ed36efb05d98d625975ca533de295776658a Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 18 Jun 2021 01:02:57 -0400 Subject: [PATCH 25/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 76ee9b8049d..8269cbe8d33 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -149,9 +149,9 @@ config auto-techsupport core-usage <1..100> ``` admin@sonic:~$ show auto-techsupport -STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_DIR_SIZE LAST_TECHSUPPORT_RUN -------- ------- --------------------- ---------------------- ------------------------------- -Enabled 300 sec 3 200000 KB / 2% Tue 15 Jun 2021 08:09:59 PM UTC +STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE LAST_TECHSUPPORT_RUN +------- ------- --------------------- ------------------------ ------------------------------- +Enabled 300 sec 3 200000 KB / 2% Tue 15 Jun 2021 08:09:59 PM UTC ``` ## 6. Design From b8c9bad737629b981720794ac53fe181b8dd42ce Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 18 Jun 2021 10:25:40 -0400 Subject: [PATCH 26/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 8269cbe8d33..e75a47fa881 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -61,7 +61,7 @@ core-usage = 5; # A perentage value should be specified. #### State DB ``` key = "AUTO_TECHSUPPORT|global" -last_techsupport_run = 0; # Monotonic time in seconds relative to the latest techsupport run +last_techsupport_run = 0; # The last techsupport run, represented by the Monotonic time in seconds. 
core_file_list = "<*.core.gz>;<*.core.gz>"; # List of the core files inside the /var/core/ folder Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." ``` From 406c3b1d9356d1227049f4517eda5562a155e58f Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 18 Jun 2021 13:24:22 -0400 Subject: [PATCH 27/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index e75a47fa881..14501eae983 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -62,8 +62,10 @@ core-usage = 5; # A perentage value should be specified. ``` key = "AUTO_TECHSUPPORT|global" last_techsupport_run = 0; # The last techsupport run, represented by the Monotonic time in seconds. +num_techsupports = 0; # Number of TS Dumps already present. core_file_list = "<*.core.gz>;<*.core.gz>"; # List of the core files inside the /var/core/ folder Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." + ``` ### 4.1 YANG Model From a3523b3652f58cb12f8a879877c9ad020f80c2f3 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Mon, 21 Jun 2021 15:19:47 -0400 Subject: [PATCH 28/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 14501eae983..c7a567e2a63 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -239,7 +239,7 @@ The last_techsupport_run value is meaningless across reboots since monotonic tim ### 6.5 Adding these services to SONiC -These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb`. 
+These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb` & `target/python-wheels/sonic_host_services-1.0-py3-none-any.whl` accordingly. ### 6.6 Design choices for max_cdd_size argument From 4f07a40146d9353a05493d189490cdade3995a5f Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:43:47 -0400 Subject: [PATCH 29/71] Simplified the Design --- doc/techsupport/auto_techsupport_gen.md | 33 +++++++++---------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index c7a567e2a63..3a49bd444ce 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -14,15 +14,14 @@ * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) * [6.3 auto-techsupport script](#63-auto-techsupport-script) - * [6.4 Warmboot Considerations](#64-Warmboot-Considerations) - * [6.5 Adding these services to SONiC](#65-Adding-these-services-to-sonic) - * [6.6 Design choices for max_cdd_size argument ](#66-Design-choices-for-max_cdd_size-argument ) + * [6.4 Adding these services to SONiC](#64-Adding-these-services-to-sonic) + * [6.5 Design choices for max_cdd_size argument ](#65-Design-choices-for-max_cdd_size-argument ) ### Revision | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| -| 1.0 | 06/17/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | +| 1.0 | 06/22/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | ## About this Manual @@ -48,7 +47,8 @@ The naming format and compression is governed by the script `/usr/local/bin/core ``` key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; -cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations by 
the script. +cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations. + Manual Invocations will be considered as well in the cooloff calculation max-techsupports = 5; # Maximum number of Techsupport dumps (Doesn't matter if it's manually or auto invoked), which are allowed to be present on the device. The oldest one will be deleted, when the the limit has already crossed this. @@ -58,15 +58,6 @@ core-usage = 5; # A perentage value should be specified. When the limit is crossed, the older core files are incrementally deleted ``` -#### State DB -``` -key = "AUTO_TECHSUPPORT|global" -last_techsupport_run = 0; # The last techsupport run, represented by the Monotonic time in seconds. -num_techsupports = 0; # Number of TS Dumps already present. -core_file_list = "<*.core.gz>;<*.core.gz>"; # List of the core files inside the /var/core/ folder - Eg: "python3.15678876.168.core.gz;orchagent.145678765.182.core.gz;...." - -``` ### 4.1 YANG Model @@ -192,7 +183,7 @@ WantedBy=multi-user.target ``` ### 6.2 Monitor Techsupport creation -The script will use the last_techsupport_run field in the State DB to determine whether to run techsupport based on the cooloff period configured by the user. To have the last_techsupport_run upto date, techsupport-monit.{path, service} units is used. +These units are used to cleanup the old Techsupport dumps, when the limit configured by the user is crossed. #### techsupport-monit.path @@ -229,19 +220,17 @@ Note: All of these will have strict ordering dependency on database.service and ### 6.3 auto-techsupport script -As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script updates the `last_techsupport_run` field in the State DB. It then deletes any old Techsupport dumps, if the limit configured by the user has crossed. 
- -On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then checks for any diff between `core_file_list` field in the State DB and the file system. If any diff is found, it updates the State Db entry and moves forward. The script finally checks the `last_techsupport_run` field in the State DB and only when the cooloff period has passed, the script invokes the techsupport. The script will also independently check if the Max Size configured by the user has already exceeded +As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. -### 6.4 Warmboot Considerations +On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script first verifies if a file is created within the last 20 sec and if yes, it moves forward. -The last_techsupport_run value is meaningless across reboots since monotonic time is used. This field will be empty after reboot type. Other Relavant Entries in the State DB will be added to the db_migrator and are persisted across warm-reboots. and uf yes, deletes the old core files +The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files independently. -### 6.5 Adding these services to SONiC +### 6.4 Adding these services to SONiC These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb` & `target/python-wheels/sonic_host_services-1.0-py3-none-any.whl` accordingly. 
-### 6.6 Design choices for max_cdd_size argument +### 6.5 Design choices for max_cdd_size argument Firstly, Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf https://www.freedesktop.org/software/systemd/man/coredump.conf.html From 1149eb4e573022a011b8dd7abe1af235fb7a20dc Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:47:08 -0400 Subject: [PATCH 30/71] Updated 6.5 section --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 3a49bd444ce..d23e2dedcb7 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -249,7 +249,7 @@ root-overlay 32896880 5460768 25742008 18% / ``` /var/core directory is hosted on root-overlay filesystem and i've seen this ranging from 10G to 25G. -Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistic default value of 2% is chosen. A typical 5% would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. +Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistic default value of 5% is chosen.This would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. Although if the admin feels otherwise, this value is configurable. 
From 4f50e238f5e6987748d27431a5e8d3ed6e8df042 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:51:47 -0400 Subject: [PATCH 31/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index d23e2dedcb7..c9f102cdc69 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -14,7 +14,7 @@ * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) * [6.3 auto-techsupport script](#63-auto-techsupport-script) - * [6.4 Adding these services to SONiC](#64-Adding-these-services-to-sonic) + * [6.4 Warmboot/Fastboot consideration](#63-Warmboot/Fastboot-consideration) * [6.5 Design choices for max_cdd_size argument ](#65-Design-choices-for-max_cdd_size-argument ) @@ -226,9 +226,9 @@ On the other hand, when invoked with `core` argument, the script first checks if The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files independently. -### 6.4 Adding these services to SONiC +### 6.4 Warmboot/Fastboot consideration -These will be added to `target/debs/buster/sonic-host-services-data_1.0-1_all.deb` & `target/python-wheels/sonic_host_services-1.0-py3-none-any.whl` accordingly. +No impact for warmboot/fastboot flows. 
### 6.5 Design choices for max_cdd_size argument From 860f1143ed09fbfae8dfae1c5e054e8e5692869d Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:52:17 -0400 Subject: [PATCH 32/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index c9f102cdc69..dcf2915ca2f 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -14,8 +14,8 @@ * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) * [6.3 auto-techsupport script](#63-auto-techsupport-script) - * [6.4 Warmboot/Fastboot consideration](#63-Warmboot/Fastboot-consideration) - * [6.5 Design choices for max_cdd_size argument ](#65-Design-choices-for-max_cdd_size-argument ) + * [6.4 Warmboot/Fastboot consideration](#64-Warmboot/Fastboot-consideration) + * [6.5 Design choices for max_cdd_size argument](#65-Design-choices-for-max_cdd_size-argument ) ### Revision From 85ce0dfcb3ce7736969bf5832107db8eabf5064e Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:53:14 -0400 Subject: [PATCH 33/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index dcf2915ca2f..966425a0937 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -224,7 +224,7 @@ As seen in the techsupport-monit.service & coredump-monit.service Unit descripti On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script first verifies if a file is created within the last 20 sec and if yes, it moves forward. 
-The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files independently. +The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. ### 6.4 Warmboot/Fastboot consideration From 0013ab104d785533fe5f20b8b9bed2926d396a7e Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:57:32 -0400 Subject: [PATCH 34/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 966425a0937..71e56119a9c 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -15,7 +15,7 @@ * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) * [6.3 auto-techsupport script](#63-auto-techsupport-script) * [6.4 Warmboot/Fastboot consideration](#64-Warmboot/Fastboot-consideration) - * [6.5 Design choices for max_cdd_size argument](#65-Design-choices-for-max_cdd_size-argument ) + * [6.5 Design choices for max_cdd_size argument](#65-Design-choices-for-core-usage-argument ) ### Revision @@ -230,7 +230,7 @@ The script then checks if the cooloff period has passed, and it invokes the tech No impact for warmboot/fastboot flows. 
-### 6.5 Design choices for max_cdd_size argument +### 6.5 Design choices for core-usage argument Firstly, Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf https://www.freedesktop.org/software/systemd/man/coredump.conf.html @@ -248,8 +248,8 @@ Filesystem 1K-blocks Used Available Use% Mounted on root-overlay 32896880 5460768 25742008 18% / ``` -/var/core directory is hosted on root-overlay filesystem and i've seen this ranging from 10G to 25G. -Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistic default value of 5% is chosen.This would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. +/var/core directory is hosted on root-overlay filesystem and this usually ranges from 10G to 25G+. +Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistic default value of 5% is chosen. This would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. Although if the admin feels otherwise, this value is configurable. 
From 79a70477a0b2a838fc1a5f95cace794e881893ab Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 22 Jun 2021 00:57:49 -0400 Subject: [PATCH 35/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 71e56119a9c..2386c508e95 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -15,7 +15,7 @@ * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) * [6.3 auto-techsupport script](#63-auto-techsupport-script) * [6.4 Warmboot/Fastboot consideration](#64-Warmboot/Fastboot-consideration) - * [6.5 Design choices for max_cdd_size argument](#65-Design-choices-for-core-usage-argument ) + * [6.5 Design choices for core-usage argument](#65-Design-choices-for-core-usage-argument ) ### Revision From 986d31a760d41035845b55e68c60e85b7aef71fd Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 20 Jul 2021 15:22:17 -0700 Subject: [PATCH 36/71] Simplified design by removing systemd units --- doc/techsupport/auto_techsupport_gen.md | 85 ++++--------------------- 1 file changed, 12 insertions(+), 73 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 2386c508e95..252a39ed3bd 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -11,9 +11,9 @@ * [4.1 YANG Model](#41-YANG-Model) * [5. CLI Enhancements](#5-cli-enhancements) * [6. 
Design](#6-design) - * [6.1 Event trigger for Core-dump generation](#61-Event-trigger-for-Core-dump-generation) - * [6.2 Monitor Techsupport creation](#62-Monitor-Techsupport-Creation) - * [6.3 auto-techsupport script](#63-auto-techsupport-script) + * [6.1 auto-techsupport script](#61-auto-techsupport-script) + * [6.2 Modifications to coredump-compress script](#62-Modifications-to-coredump-compress-script) + * [6.3 Modifications to generate_dump script](#63-Modifications-to-generate-dump-script) * [6.4 Warmboot/Fastboot consideration](#64-Warmboot/Fastboot-consideration) * [6.5 Design choices for core-usage argument](#65-Design-choices-for-core-usage-argument ) @@ -38,7 +38,7 @@ However if the techsupport invocation can be made event-driven based on core dum * Users should have the abiliity to configure this capability. ## 3. Core Dump Generation in SONiC -In SONiC, the core dumps generated from any process crashes across the dockers and the base host are directed to the location `/var/core` and will have the naming format `/var/core/*.core.gz`. +In SONiC, the core dumps generated from any process crashes are directed to the location `/var/core` and will have the naming format `/var/core/*.core.gz`. The naming format and compression is governed by the script `/usr/local/bin/coredump-compress`. ## 4. Schema Additions @@ -149,82 +149,21 @@ Enabled 300 sec 3 200000 KB / 2% Tue 15 ## 6. Design -### 6.1 Event-trigger for Core-dump generation -To Monitor and respond for the file-change events in `/var/core/`, a systemd path unit ([systemd path unit](https://www.freedesktop.org/software/systemd/man/systemd.path.html)) will be used. This unit will start a corresponding systemd service, which inturn invokes the python script `/usr/local/bin/auto_techsupport_gen` and it handles the heavylifting of invoking techsupport and other tasks. 
More on the script in section 6.3 +### 6.1 auto-techsupport script -#### coredump-monit.path -``` -[Unit] -Description=Triggers the coredump-monit services accordingly when a coredump is found. -After=database.service -Requires=database.service - -[Path] -PathChanged=/var/core/ -Unit=coredump-monit.service - -[Install] -WantedBy=multi-user.target -``` - -#### coredump-monit.service -``` -[Unit] -Description=Invokes the auto-techsupport script when triggered by the coredump-monit.path -After=database.service -Requires=database.service - -[Service] -Type=simple -ExecStart=/usr/local/bin/auto-techsupport core - -[Install] -WantedBy=multi-user.target -``` - -### 6.2 Monitor Techsupport creation -These units are used to cleanup the old Techsupport dumps, when the limit configured by the user is crossed. - - -#### techsupport-monit.path -``` -[Unit] -Description=Triggers the auto-techsupport services when a techsupport dump is found. -After=database.service -Requires=database.service - -[Path] -PathChanged=/var/dump/ -Unit=techsupport-monit.service - -[Install] -WantedBy=multi-user.target -``` - -#### techsupport-monit.service -``` -[Unit] -Description=Invokes the auto-techsupport script when triggered by the techsupport-monit.path -After=database.service -Requires=database.service +A script under the name `auto-techsupport` will be added to `/usr/local/bin/` directory which has the logic on the auto-invocation & auto-cleanup. This script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. -[Service] -Type=simple -ExecStart=/usr/local/bin/auto-techsupport techsupport - -[Install] -WantedBy=multi-user.target -``` +On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. 
The Script first verifies if a file is created within the last 20 sec and if yes, it moves forward. -Note: All of these will have strict ordering dependency on database.service and not swss or sonic.target, because the crashes might occur during the swss/syncd bringup etc. And for this to be captured the service should be active before the start of these services. +The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. -### 6.3 auto-techsupport script +### 6.2 Modifications to coredump-compress script -As seen in the techsupport-monit.service & coredump-monit.service Unit descriptions, the script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. +The coredump-compress script is modified to invoke the auto-techsupport script with `core` argument once it is done writing the core file to /var/core. -On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script first verifies if a file is created within the last 20 sec and if yes, it moves forward. +### 6.3 Modifications to generate_dump script -The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. 
+The generate_dump script will invoke the auto-techsupport script with `techsupport` argument to handle the cleanup of techsupport files, if configured ### 6.4 Warmboot/Fastboot consideration From 1cd7b2adc9c27f8326a7352e68928f5eb71c7d84 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 20 Jul 2021 15:25:18 -0700 Subject: [PATCH 37/71] Minor Change --- doc/techsupport/auto_techsupport_gen.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 252a39ed3bd..7e5d9cc9ffb 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -151,9 +151,9 @@ Enabled 300 sec 3 200000 KB / 2% Tue 15 ### 6.1 auto-techsupport script -A script under the name `auto-techsupport` will be added to `/usr/local/bin/` directory which has the logic on the auto-invocation & auto-cleanup. This script follows two separate flows based on the argument provided. When invoked with `techsupport` argument, the script checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. +A script under the name `auto-techsupport` will be added to `/usr/local/bin/` directory which has the logic to handle the auto-invocation & auto-cleanup. When invoked with `techsupport` argument, the script checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. -On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script first verifies if a file is created within the last 20 sec and if yes, it moves forward. +On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. 
The Script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. From eabb0549f0f2ba7bd7223f1e9ea714222afa3267 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 14:36:29 -0700 Subject: [PATCH 38/71] since option added --- doc/techsupport/auto_techsupport_gen.md | 55 +++++++++++++++++-------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 7e5d9cc9ffb..c4d903630cf 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -11,11 +11,12 @@ * [4.1 YANG Model](#41-YANG-Model) * [5. CLI Enhancements](#5-cli-enhancements) * [6. Design](#6-design) - * [6.1 auto-techsupport script](#61-auto-techsupport-script) - * [6.2 Modifications to coredump-compress script](#62-Modifications-to-coredump-compress-script) - * [6.3 Modifications to generate_dump script](#63-Modifications-to-generate-dump-script) - * [6.4 Warmboot/Fastboot consideration](#64-Warmboot/Fastboot-consideration) - * [6.5 Design choices for core-usage argument](#65-Design-choices-for-core-usage-argument ) + * [6.1 coredump_gen_handler script](#61-coredump_gen_handler-script) + * [6.2 techsupport_cleanup script](#62-techsupport_cleanup-script) + * [6.3 Modifications to coredump-compress script](#63-Modifications-to-coredump-compress-script) + * [6.4 Modifications to generate_dump script](#64-modifications-to-generate_dump-script) + * [6.5 Warmboot/Fastboot consideration](#65-warmbootfastboot-consideration) + * [6.6 Design choices for core-usage argument](#66-Design-choices-for-core-usage-argument ) ### Revision @@ -34,7 +35,7 @@ However if the techsupport invocation
can be made event-driven based on core dum ## 2. High Level Requirements * Techsupport invocation should also be made event-driven based on core dump generation -* This capability should be made optional and is disabled by default +* This capability should be made optional and is enabled by default * Users should have the abiliity to configure this capability. ## 3. Core Dump Generation in SONiC @@ -56,6 +57,10 @@ core-usage = 5; # A perentage value should be specified. This signifies maximum Size to which /var/core directory can be grown until. The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core When the limit is crossed, the older core files are incrementally deleted +since = "2 days ago"; # This limits the auto-invoked techsupport to only collect the logs & core-dumps generated since the time provided. + Any valid date string of the formats specified here (https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html) + can be used. + If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used. ``` @@ -116,7 +121,18 @@ module sonic-auto_techsupport { } } default "5"; - } + } + + leaf since { + description "This limits the auto-invoked techsupport to only collect the logs & core-dumps generated since the time provided. + Any valid date string of the formats specified here (https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html) + can be used. 
+ If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used"; + type string { + length 1..255; + } + default "2 days ago"; + } } /* end of container global */ } @@ -136,40 +152,43 @@ config auto-techsupport state config auto-techsupport cooloff config auto-techsupport max-techsupport config auto-techsupport core-usage <1..100> +config auto-techsupport since ``` ### show cli ``` admin@sonic:~$ show auto-techsupport -STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE LAST_TECHSUPPORT_RUN -------- ------- --------------------- ------------------------ ------------------------------- -Enabled 300 sec 3 200000 KB / 2% Tue 15 Jun 2021 08:09:59 PM UTC +STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE SINCE LAST_TECHSUPPORT_RUN +------- ------- --------------------- ------------------------ ---------- ------------------------------- +Enabled 300 sec 3 200000 KB / 2% 2 days ago Tue 15 Jun 2021 08:09:59 PM UTC ``` ## 6. Design -### 6.1 auto-techsupport script +### 6.1 coredump_gen_handler script + +A script under the name `coredump_gen_handler` will be added to `/usr/local/bin/` directory which will be invoked after a coredump is generated. The script first checks if this feature is enabled by the user. The script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. -A script under the name `auto-techsupport` will be added to `/usr/local/bin/` directory which has the logic to handle the auto-invocation & auto-cleanup. When invoked with `techsupport` argument, the script checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. +The script invokes the show techsupport command, if the cooloff period configured by the user has passed. 
The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. -On the other hand, when invoked with `core` argument, the script first checks if this feature is enabled by the user. The Script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. +### 6.2 techsupport_cleanup script -The script then checks if the cooloff period has passed, and it invokes the techsupport command. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. +A script under the name `techsupport_cleanup` will be added to `/usr/local/bin/` directory which will be invoked after a techsupport dump is created. The script first checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. -### 6.2 Modifications to coredump-compress script +### 6.3 Modifications to coredump-compress script The coredump-compress script is modified to invoke the auto-techsupport script with `core` argument once it is done writing the core file to /var/core. -### 6.3 Modifications to generate_dump script +### 6.4 Modifications to generate_dump script The generate_dump script will invoke the auto-techsupport script with `techsupport` argument to handle the cleanup of techsupport files, if configured -### 6.4 Warmboot/Fastboot consideration +### 6.5 Warmboot/Fastboot consideration No impact for warmboot/fastboot flows. 
-### 6.5 Design choices for core-usage argument +### 6.6 Design choices for core-usage argument Firstly, Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf https://www.freedesktop.org/software/systemd/man/coredump.conf.html From 0e60844f4ccdc423a4489aae362ebb78a9f0b997 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 14:43:30 -0700 Subject: [PATCH 39/71] Minor changes updated --- doc/techsupport/auto_techsupport_gen.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index c4d903630cf..f9d9d66eaa3 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -168,21 +168,21 @@ Enabled 300 sec 3 200000 KB / 2% 2 days ### 6.1 coredump_gen_handler script -A script under the name `coredump_gen_handler` will be added to `/usr/local/bin/` directory which will be invoked after a coredump is generated. The script first checks if this feature is enabled by the user. The script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. +A script under the name `coredump_gen_handler` is added to `/usr/local/bin/` directory which will be invoked after a coredump is generated. The script first checks if this feature is enabled by the user. The script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. The script invokes the show techsupport command, if the cooloff period configured by the user has passed. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. ### 6.2 techsupport_cleanup script -A script under the name `techsupport_cleanup` will be added to `/usr/local/bin/` directory which will be invoked after a techsupport dump is created. 
The script first checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. +A script under the name `techsupport_cleanup` is added to `/usr/local/bin/` directory which will be invoked after a techsupport dump is created. The script first checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. ### 6.3 Modifications to coredump-compress script -The coredump-compress script is modified to invoke the auto-techsupport script with `core` argument once it is done writing the core file to /var/core. +The coredump-compress script is updated to invoke the `coredump_gen_handler` script once it is done writing the core file to /var/core. ### 6.4 Modifications to generate_dump script -The generate_dump script will invoke the auto-techsupport script with `techsupport` argument to handle the cleanup of techsupport files, if configured +The generate_dump script is updated to invoke the `techsupport_cleanup` script to handle the cleanup of techsupport files ### 6.5 Warmboot/Fastboot consideration From 1c0f440dddc04da8e0ff6a601676219735933547 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 14:51:41 -0700 Subject: [PATCH 40/71] Added the test plan --- doc/techsupport/auto_techsupport_gen.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index f9d9d66eaa3..2ad7a0493fe 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -211,5 +211,14 @@ Since Techsupport dumps are also hosted on the same filesystem, a slightly pessi Although if the admin feels otherwise, this value is configurable. +## 7. Test Plan +Enhance the existing techsupport sonic-mgmt test with the following cases. 
+ | S.No | Test case synopsis | +|------|-----------------------------------------------------------------------------------------------------------------------------------------| +| 1 | Check if the `coredump_gen_handler` script is in fact invoking the techsupport cmd, when configured | +| 2 | Check if the techsupport cleanup is working as expected | +| 3 | Check if the cooloff is honoured | +| 4 | Check if the core-dump cleanup mechanism is working as expected | From 0d201be0af89847fe441f405d9062d5a0d2e5f1f Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 14:53:27 -0700 Subject: [PATCH 41/71] Update auto_techsupport_gen.md --- doc/techsupport/auto_techsupport_gen.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index 2ad7a0493fe..c94cf8b4922 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -16,7 +16,8 @@ * [6.3 Modifications to coredump-compress script](#63-Modifications-to-coredump-compress-script) * [6.4 Modifications to generate_dump script](#64-Modifications-to-generate-dump-script) * [6.5 Warmboot/Fastboot consideration](#65-Warmboot/Fastboot-consideration) - * [6.6 Design choices for core-usage argument](#66-Design-choices-for-core-usage-argument ) + * [6.6 Design choices for core-usage argument](#66-Design-choices-for-core-usage-argument) + * [7.
Test Plan](#7-Test-Plan) ### Revision From 2cad5af9fdccce575feb931dc490bc502a150ddf Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 18:36:45 -0700 Subject: [PATCH 42/71] Added Logs and updated requirements --- doc/techsupport/auto_techsupport_gen.md | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/techsupport/auto_techsupport_gen.md index c94cf8b4922..e67a0eb3de0 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/techsupport/auto_techsupport_gen.md @@ -36,8 +36,11 @@ However if the techsupport invocation can be made event-driven based on core dum ## 2. High Level Requirements * Techsupport invocation should also be made event-driven based on core dump generation -* This capability should be made optional and is enabled by default -* Users should have the abiliity to configure this capability. +* This capability should be enabled by default +* Users should have the ability to enable/disable this feature through CLI. +* Core cleanup mechanism should also be introduced. +* Should provide a way to clean up techsupport dumps +* The existing "--since" option in techsupport should be leveraged and this should be a configurable parameter for this feature ## 3. Core Dump Generation in SONiC In SONiC, the core dumps generated from any process crashes are directed to the location `/var/core` and will have the naming format `/var/core/*.core.gz`. @@ -212,7 +215,22 @@ Since Techsupport dumps are also hosted on the same filesystem, a slightly pessi Although if the admin feels otherwise, this value is configurable. -## 7. Test Plan +## 7. Syslog Messages +``` +DATE sonic NOTICE coredump_gen_handler[pid]: Core Dump spotted at /var/core/orchagent.1626916631.117644.core.gz +DATE sonic INFO coredump_gen_handler[pid]: Cooloff period has not yet passed. No Techsupport Invocation is performed +DATE sonic INFO coredump_gen_handler[pid]: AUTO_TECHSUPPORT is not enabled.
No Techsupport Invocation and Coredump cleanup is performed +DATE sonic INFO coredump_gen_handler[pid]: No Cleanup process is initiated since the core-usage param is not configured +DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, /var/dump/sonic_dump_sonic_20210721_235228.tar.gz is created +DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No dump is found in the /var/dump directory +DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are deleted. + +DATE sonic INFO techsupport_cleanup[pid]: AUTO_TECHSUPPORT is not enabled. No TechSupport Cleanup is performed, current number of dumps: 5 +DATE sonic INFO techsupport_cleanup[pid]: max-techsupports is not set. No TechSupport Cleanup Process is needed to be performed, current number of dumps: 5 +DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 +``` + +## 8. Test Plan Enhance the existing techsupport sonic-mgmt test with the following cases. 
From bfb30ee39f360da9f0555ad97eefaa21dcd43d71 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 18:39:22 -0700 Subject: [PATCH 43/71] Update and rename doc/techsupport/auto_techsupport_gen.md to doc/auto_techsupport_and_coredump_mgmt.md --- ...techsupport_gen.md => auto_techsupport_and_coredump_mgmt.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename doc/{techsupport/auto_techsupport_gen.md => auto_techsupport_and_coredump_mgmt.md} (97%) diff --git a/doc/techsupport/auto_techsupport_gen.md b/doc/auto_techsupport_and_coredump_mgmt.md similarity index 97% rename from doc/techsupport/auto_techsupport_gen.md rename to doc/auto_techsupport_and_coredump_mgmt.md index e67a0eb3de0..d2a943a86cb 100644 --- a/doc/techsupport/auto_techsupport_gen.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -1,4 +1,4 @@ -# Auto Techsupport Enhancement # +# Event Driven TechSupport Invocation & CoreDump Mgmt # #### Rev 1.0 ## Table of Contents From c560e3b31fb00674dbac57a201d930ae1e9ff439 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 18:40:42 -0700 Subject: [PATCH 44/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index d2a943a86cb..494cb8e948a 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -17,7 +17,8 @@ * [6.4 Modifications to generate_dump script](#64-Modifications-to-generate-dump-script) * [6.5 Warmboot/Fastboot consideration](#65-Warmboot/Fastboot-consideration) * [6.6 Design choices for core-usage argument](#66-Design-choices-for-core-usage-argument) - * [7. Test Plan](#7-Test-Plan) + * [7. Syslog Messages](#7-Syslog-Messages) + * [8. 
Test Plan](#8-Test-Plan) ### Revision From c737608a6f67cd14a06ab810fe8f4db9aa5e5b16 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 18:47:45 -0700 Subject: [PATCH 45/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 494cb8e948a..e8aa37f1075 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -223,11 +223,11 @@ DATE sonic INFO coredump_gen_handler[pid]: Cooloff period has not yet passed. DATE sonic INFO coredump_gen_handler[pid]: AUTO_TECHSUPPORT is not enabled. No Techsupport Invocation and Coredump cleanup is performed DATE sonic INFO coredump_gen_handler[pid]: No Cleanup process is initiated since the core-usage param is not configured DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, /var/dump/sonic_dump_sonic_20210721_235228.tar.gz is created -DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No dump is found in the /var/dump directory -DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are deleted. +DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No techsupport dump was created in the /var/dump directory +DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are cleared. DATE sonic INFO techsupport_cleanup[pid]: AUTO_TECHSUPPORT is not enabled. No TechSupport Cleanup is performed, current number of dumps: 5 -DATE sonic INFO techsupport_cleanup[pid]: max-techsupports is not set. No TechSupport Cleanup Process is needed to be performed, current number of dumps: 5 +DATE sonic INFO techsupport_cleanup[pid]: max-techsupports is not configured. 
No TechSupport Cleanup is performed, current number of dumps: 5 DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 ``` From e95dfc864b7bdcafec7e3c1805d130bc688e6f24 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 18:54:30 -0700 Subject: [PATCH 46/71] Removed the default values --- doc/auto_techsupport_and_coredump_mgmt.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index e8aa37f1075..3e80e544d44 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -112,7 +112,6 @@ module sonic-auto_techsupport { description "Maximum number of Techsupport dumps, which can be present on the switch. The oldest one will be deleted, when the the limit has already crossed this. "; type uint8; - default "5"; } leaf core-usage { @@ -125,7 +124,6 @@ module sonic-auto_techsupport { error-message "Can only be between 1 to 100"; } } - default "5"; } leaf since { @@ -218,11 +216,10 @@ Although if the admin feels otherwise, this value is configurable. ## 7. Syslog Messages ``` -DATE sonic NOTICE coredump_gen_handler[pid]: Core Dump spotted at /var/core/orchagent.1626916631.117644.core.gz DATE sonic INFO coredump_gen_handler[pid]: Cooloff period has not yet passed. No Techsupport Invocation is performed DATE sonic INFO coredump_gen_handler[pid]: AUTO_TECHSUPPORT is not enabled. 
No Techsupport Invocation and Coredump cleanup is performed DATE sonic INFO coredump_gen_handler[pid]: No Cleanup process is initiated since the core-usage param is not configured -DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, /var/dump/sonic_dump_sonic_20210721_235228.tar.gz is created +DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, /var/dump/sonic_dump_sonic_20210721_235228.tar.gz is created in response to the coredump orchagent.1626916631.117644.core.gz DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No techsupport dump was created in the /var/dump directory DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are cleared. From 99bed384c0cb44e3e03d60a402a9543ccfe9738b Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 21 Jul 2021 19:08:33 -0700 Subject: [PATCH 47/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 44 ++++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 3e80e544d44..ecd35818183 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -17,8 +17,7 @@ * [6.4 Modifications to generate_dump script](#64-Modifications-to-generate-dump-script) * [6.5 Warmboot/Fastboot consideration](#65-Warmboot/Fastboot-consideration) * [6.6 Design choices for core-usage argument](#66-Design-choices-for-core-usage-argument) - * [7. Syslog Messages](#7-Syslog-Messages) - * [8. Test Plan](#8-Test-Plan) + * [7. Test Plan](#7-Test-Plan) ### Revision @@ -110,7 +109,8 @@ module sonic-auto_techsupport { leaf max-techsupports { description "Maximum number of Techsupport dumps, which can be present on the switch. - The oldest one will be deleted, when the the limit has already crossed this.
"; + The oldest one will be deleted, when the limit has already crossed this. + Disabled by default. Configure '0' to explicitly disable"; type uint8; } @@ -118,9 +118,10 @@ module sonic-auto_techsupport { description "A perentage value should be specified. This signifies maximum Size to which /var/core directory can be grown until The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core - When the limit is crossed, the older core files are deleted."; + When the limit is crossed, the older core files are deleted. + Disabled by default. Configure '0' to explicitly disable"; type uint8 { - range "1..100" { + range "0..100" { error-message "Can only be between 1 to 100"; } } @@ -154,7 +155,7 @@ module sonic-auto_techsupport { config auto-techsupport state config auto-techsupport cooloff config auto-techsupport max-techsupport -config auto-techsupport core-usage <1..100> +config auto-techsupport core-usage <0..100> config auto-techsupport since ``` @@ -175,10 +176,25 @@ A script under the name `coredump_gen_handler` is added to `/usr/local/bin/` dir The script invokes the show techsupport command, if the cooloff period configured by the user has passed. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. +Potential Syslog messages which can be logged are: +``` +DATE sonic INFO coredump_gen_handler[pid]: Cooloff period has not yet passed.
No Techsupport Invocation is performed +DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, sonic_dump_sonic_20210721_235228.tar.gz is created in response to the coredump orchagent.1626916631.117644.core.gz +DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No techsupport dump was created in the /var/dump directory +DATE sonic INFO coredump_gen_handler[pid]: No Cleanup process is initiated since the core-usage param is not configured +DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are cleared. +``` ### 6.2 techsupport_cleanup script A script under the name `techsupport_cleanup` is added to `/usr/local/bin/` directory which will be invoked after a techsupport dump is created. The script first checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. +Potential Syslog messages which can be logged are: +``` +DATE sonic INFO techsupport_cleanup[pid]: AUTO_TECHSUPPORT is not enabled. No TechSupport Cleanup is performed, current number of dumps: 5 +DATE sonic INFO techsupport_cleanup[pid]: max-techsupports is not configured. No TechSupport Cleanup is performed, current number of dumps: 5 +DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 +``` + ### 6.3 Modifications to coredump-compress script The coredump-compress script is updated to invoke the `coredump_gen_handler` script once it is done writing the core file to /var/core. @@ -214,21 +230,7 @@ Since Techsupport dumps are also hosted on the same filesystem, a slightly pessi Although if the admin feels otherwise, this value is configurable. -## 7. Syslog Messages -``` -DATE sonic INFO coredump_gen_handler[pid]: Cooloff period has not yet passed. No Techsupport Invocation is performed -DATE sonic INFO coredump_gen_handler[pid]: AUTO_TECHSUPPORT is not enabled. 
No Techsupport Invocation and Coredump cleanup is performed -DATE sonic INFO coredump_gen_handler[pid]: No Cleanup process is initiated since the core-usage param is not configured -DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, /var/dump/sonic_dump_sonic_20210721_235228.tar.gz is created in response to the coredump orchagent.1626916631.117644.core.gz -DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No techsupport dump was created in the /var/dump directory -DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are cleared. - -DATE sonic INFO techsupport_cleanup[pid]: AUTO_TECHSUPPORT is not enabled. No TechSupport Cleanup is performed, current number of dumps: 5 -DATE sonic INFO techsupport_cleanup[pid]: max-techsupports is not configured. No TechSupport Cleanup is performed, current number of dumps: 5 -DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 -``` - -## 8. Test Plan +## 7. Test Plan Enhance the existing techsupport sonic-mgmt test with the following cases. From 7630e4d70b5298e084534a590f6be7320a4f8c83 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 22 Jul 2021 13:36:15 -0700 Subject: [PATCH 48/71] Removed trivial log message --- doc/auto_techsupport_and_coredump_mgmt.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index ecd35818183..8e882e476d7 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -190,8 +190,6 @@ A script under the name `techsupport_cleanup` is added to `/usr/local/bin/` dire Potential Syslog messages which can be logged are: ``` -DATE sonic INFO techsupport_cleanup[pid]: AUTO_TECHSUPPORT is not enabled. No TechSupport Cleanup is performed, current number of dumps: 5 -DATE sonic INFO techsupport_cleanup[pid]: max-techsupports is not configured. 
No TechSupport Cleanup is performed, current number of dumps: 5 DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 ``` From 8b81aa880983613a6ddb6cb8400259ff0e302daf Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 3 Aug 2021 16:42:25 -0700 Subject: [PATCH 49/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 32 +++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 8e882e476d7..df016eebe9d 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -24,7 +24,7 @@ | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| | 1.0 | 06/22/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | - +| 1.1 | TBD | Vivek Reddy Karri | Extending Support for Kernel Dumps | ## About this Manual This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC. The auto invocation is triggered when any process across the dockers or the host crashes and a core dump is generated. @@ -35,12 +35,34 @@ Currently, techsupport is run by invoking `show techsupport` either by orchestra However if the techsupport invocation can be made event-driven based on core dump generation, that would definitely improve the debuggability. That is the overall idea behind this HLD. All the high-level requirements are summarized in the next section ## 2. High Level Requirements -* Techsupport invocation should also be made event-driven based on core dump generation -* This capability should be enabled by default -* Users should have the abiliity to enable/disable this feature through CLI. +### Global Scope +* Techsupport invocation should also be made event-driven based on core dump generation. 
+* This is only applicable for the critical processes running inside the dockers. Does not apply for other processes. +* init_cfg.json will be enhanced to include the "global CONFIG" required for this feature (described in section 4) and is enabled by default. +* To provide flexibility, a compile time flag "DISABLE_AUTO_TS_CFG" should be provided to disable the "global CONFIG" for this feature. +* Users should have the ability to globally enable/disable this capability through CLI. + +### Configurable Params +* A configurable "cooloff" should be introduced to limit the number of techsupport invocations. +* The existing "--since" option in techsupport should be leveraged and this should be a configurable parameter for this feature + +### Per-docker Scope +* Should provide a per-docker granularity for this feature. +* Per-docker enable/disable capability should be achieved through FEATURE table. +* Per-docker cooloff capability should be achieved through FEATURE table. +* Changes to per-docker configs will apply to all the critical processes inside the corresponding docker. +* Existing FEATURE CLI & Table should be used to apply the Configuration + +### Invocation Rules +* Auto techsupport invocation should only happen when both the global cooloff and per-docker cooloff periods have passed. +* Feature should be enabled globally and also per-docker, for this to apply on any of the critical processes running inside that docker. +* If not explicitly enabled, the feature is considered disabled. +* If the cooloff (global & per-docker) isn't explicitly configured, a default value should be set and is used + +### Core & Techsupport Cleanup * Core cleanup mechanism should also be introduced. * Should provide a way to cleanup techsupport dumps -* The existing "--since" option in techsupport should be leveraged and this should be a configurable parameter for this feature + ## 3.
Core Dump Generation in SONiC In SONiC, the core dumps generated from any process crashes are directed to the location `/var/core` and will have the naming format `/var/core/*.core.gz`. From a87d1ffeb3a25c4fa7c1052efaeecc7dbf7f3d9a Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 3 Aug 2021 18:55:03 -0700 Subject: [PATCH 50/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 75 +++++++++++++++-------- 1 file changed, 50 insertions(+), 25 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index df016eebe9d..bb9e589af37 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -39,7 +39,7 @@ However if the techsupport invocation can be made event-driven based on core dum * Techsupport invocation should also be made event-driven based on core dump generation. * This is only applicable for the critical processes running inside the dockers. Does not apply for other processes. * init_cfg.json will be enhanced to include the "global CONFIG" required for this feature (described in section 4) and is enabled by default. -* To provide flexibility, a compile time flag "DISABLE_AUTO_TS_CFG" should be provided to disable the "global CONFIG" for this feature. +* To provide flexibility, a compile time flag "ENABLE_AUTO_TECH_SUPPORT" should be provided to enable/disable the "CONFIG" for this feature. * Users should have the abiliity to globally enable/disable this capability through CLI. ### Configurable Params @@ -47,7 +47,7 @@ However if the techsupport invocation can be made event-driven based on core dum * The existing "--since" option in techsupport should be leveraged and this should be a configurable parameter for this feature ### Per-docker Scope -* Should provide a per-docker granularity for this feature. +* Should provide a per-docker configurable granularity for this feature. 
* Per-docker enable/disable capability should be achieved through FEATURE table. * Per-docker cooloff capability should is achieved through FEATURE table. * Changes to per-docker config's will apply to all the critical processes inside the corresponding docker. @@ -56,13 +56,11 @@ However if the techsupport invocation can be made event-driven based on core dum ### Invocation Rules * Auto techsupport invocation should only happen when both the global cooloff and per-docker cooloff period is passed. * Feature should be enabled globally and also per-docker, for this to apply on any of the critical processes running inside that docker. -* If not explicitly enabled, the feature is considered disabled. -* If the cooloff (global & per-docker) isn't explicitly configured, a default value should be set and is used ### Core & Techsupport Cleanup -* Core cleanup mechanism should also be introduced. -* Should provide a way to cleanup techsupport dumps - +* Core dump & techsupport dump cleanup mechanism should also be introduced. +* Size-based cleanup should be performed for both of these. +* Individual configurable options should be provided for each of these. ## 3. Core Dump Generation in SONiC In SONiC, the core dumps generated from any process crashes are directed to the location `/var/core` and will have the naming format `/var/core/*.core.gz`. @@ -70,15 +68,16 @@ The naming format and compression is governed by the script `/usr/local/bin/core ## 4. Schema Additions -#### Config DB +#### AUTO_TECHSUPPORT|global ``` key = "AUTO_TECHSUPPORT|global" -state = enabled|disabled; +state = enabled|disabled; # Enable/Disable the feature globally cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations. Manual Invocations will be considered as well in the cooloff calculation -max-techsupports = 5; # Maximum number of Techsupport dumps (Doesn't matter if it's manually or auto invoked), - which are allowed to be present on the device. 
- The oldest one will be deleted, when the the limit has already crossed this. +max-techsupport-size = 10; # A perentage value should be specified. + This signifies maximum Size to which /var/dump/ directory can be grown until. + The actual value in bytes is calculate based on the available space in the filesystem hosting /var/dump + When the limit is crossed, the older techsupport dumps are incrementally deleted core-usage = 5; # A perentage value should be specified. This signifies maximum Size to which /var/core directory can be grown until. The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core @@ -89,6 +88,14 @@ since = "2 days ago"; # This limits the auto-invoked techsupport to on If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used. ``` +#### FEATURE Table +``` +............. +............. +cooloff = 600; # Minimum Time in seconds, between two successive techsupport invocations because of the same process + The idea here is not to let a periodically crashing process to invoke the techsupport until a cooloff is met +auto_techsupport = enabled|disabled; # Enable/Disable this feature per-docker +``` ### 4.1 YANG Model @@ -129,11 +136,17 @@ module sonic-auto_techsupport { default "300"; } - leaf max-techsupports { - description "Maximum number of Techsupport dumps, which can be present on the switch. - The oldest one will be deleted, when the the limit has already crossed this. - Disabled by default. Configure '0' to explicitly disable"; - type uint8; + leaf max-techsupport-size { + description "A perentage value should be specified. + This signifies maximum Size to which /var/core directory can be grown until. 
+ The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core + When the limit is crossed, the older core files are incrementally deleted"; + type uint8{ + range "0..100" { + error-message "Can only be between 1 to 100"; + } + } + default "10"; } leaf core-usage { @@ -147,6 +160,7 @@ module sonic-auto_techsupport { error-message "Can only be between 1 to 100"; } } + default "5"; } leaf since { @@ -168,6 +182,7 @@ module sonic-auto_techsupport { } ``` +Note: The "cooloff" & "auto_techsupport" will be added to the YANG Model for FEATURE Table ## 5. CLI Enhancements. @@ -179,15 +194,25 @@ config auto-techsupport cooloff config auto-techsupport max-techsupport config auto-techsupport core-usage <0..100> config auto-techsupport since + +config feature auto-techsupport enabled|disabled> +config feature cooloff ``` ### show cli ``` -admin@sonic:~$ show auto-techsupport +admin@sonic:~$ show auto-techsupport global STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE SINCE LAST_TECHSUPPORT_RUN ------- ------- --------------------- ------------------------ ---------- ------------------------------- Enabled 300 sec 3 200000 KB / 2% 2 days ago Tue 15 Jun 2021 08:09:59 PM UTC + +admin@sonic:~$ show feature status +Feature State AutoRestart SetOwner cooloff Auto-techsupport +-------------- -------- ---------- -------- ------- ---------------- +swss enabled enabled 600 enabled +..... + ``` ## 6. Design @@ -196,7 +221,7 @@ Enabled 300 sec 3 200000 KB / 2% 2 days A script under the name `coredump_gen_handler` is added to `/usr/local/bin/` directory which will be invoked after a coredump is generated. The script first checks if this feature is enabled by the user. The script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. -The script invokes the show techsupport command, if the cooloff period configured by the user has passed. 
The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. +The script invokes the show techsupport command, if the global cooloff & the per-docker cooloff period has passed. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. Potential Syslog messages which can be logged are: ``` @@ -227,7 +252,7 @@ The generate_dump script is updated to invoke the `techsupport_cleanup` script t No impact for warmboot/fastboot flows. -### 6.6 Design choices for core-usage argument +### 6.6 Design choices for core-usage & max-techsupport-size argument Firstly, Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf https://www.freedesktop.org/software/systemd/man/coredump.conf.html @@ -245,10 +270,10 @@ Filesystem     1K-blocks    Used Available Use% Mounted on root-overlay    32896880 5460768  25742008  18% / ``` -/var/core directory is hosted on root-overlay filesystem and this usually ranges from 10G to 25G+. -Since Techsupport dumps are also hosted on the same filesystem, a slightly pessimistic default value of 5% is chosen. This would amount to a minimum of 500 MB which is a already a decent space for coredumps. In normal conditions, a core dump will usually be in the order of hundreds of KB's to tens of MB's. +/var/core & /var/dump directories are hosted on root-overlay filesystem and this usually ranges from 10G to 25G+. +A default value of 5% would amount to a minimum of 500 MB, which is already a decent amount of space for coredumps. For techsupport, a default value of 10% would amount to a minimum of 1G, which might accommodate 5-10 techsupports. -Although if the admin feels otherwise, this value is configurable. +Although if the admin feels otherwise, these values are configurable. ## 7.
Test Plan @@ -258,6 +283,6 @@ Enhance the existing techsupport sonic-mgmt test with the following cases. |------|-----------------------------------------------------------------------------------------------------------------------------------------| | 1 | Check if the `coredump_gen_handler` script is infact invoking the techsupport cmd, when configured | | 2 | Check if the techsupport cleanup is working as expected | -| 3 | Check if the cooloff is honoured | -| 4 | Check if the core-dump cleanup mechanism is working as expected | +| 3 | Check if the global cooloff & per-process cooloff is honoured | +| 4 | Check if the core-dump cleanup & techsupport-cleanup mechanisms are working as expected | From e9207a0bf9af36c6b1dc286f83b9efc2c724ab2c Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 3 Aug 2021 18:55:43 -0700 Subject: [PATCH 51/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index bb9e589af37..1c8f19588db 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -16,7 +16,7 @@ * [6.3 Modifications to coredump-compress script](#63-Modifications-to-coredump-compress-script) * [6.4 Modifications to generate_dump script](#64-Modifications-to-generate-dump-script) * [6.5 Warmboot/Fastboot consideration](#65-Warmboot/Fastboot-consideration) - * [6.6 Design choices for core-usage argument](#66-Design-choices-for-core-usage-argument) + * [6.6 Design choices for core-usage & max-techsupport-size argument](#66-Design-choices-for-core-usage-&-max-techsupport-sizeargument) * [7. 
Test Plan](#7-Test-Plan) From e95d77ed5ca3b96be6a5343c99c9fbc3ed2e14af Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 3 Aug 2021 18:58:09 -0700 Subject: [PATCH 52/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 1c8f19588db..7d68ecfa6f3 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -24,10 +24,10 @@ | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| | 1.0 | 06/22/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | -| 1.1 | TBD | Vivek Reddy Karri | Extending Support for Kernel Dumps | +| 2.0 | TBD | Vivek Reddy Karri | Extending Support for Kernel Dumps | ## About this Manual -This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC. The auto invocation is triggered when any process across the dockers or the host crashes and a core dump is generated. +This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC. The auto invocation is triggered when any critical process inside the docker crashes and a core dump is generated. ## 1. Overview Currently, techsupport is run by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. 
From 24f52db3bf9a37b0d7d205818ec6a26e439218e0 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Tue, 3 Aug 2021 22:17:54 -0700 Subject: [PATCH 53/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 32 +++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 7d68ecfa6f3..5d9684e3783 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -8,14 +8,13 @@ * [2. High Level Requirements](#2-high-level-requirements) * [3. Core Dump Generation in SONiC](#3-core-dump-generation-in-sonic) * [4. Schema Additions](#4-schema-additions) - * [4.1 YANG Model](#41-YANG-Model) * [5. CLI Enhancements](#5-cli-enhancements) * [6. Design](#6-design) * [6.1 coredump_gen_handler script](#61-coredump_gen_handler-script) * [6.2 techsupport_cleanup script](#62-techsupport_cleanup-script) * [6.3 Modifications to coredump-compress script](#63-Modifications-to-coredump-compress-script) * [6.4 Modifications to generate_dump script](#64-Modifications-to-generate-dump-script) - * [6.5 Warmboot/Fastboot consideration](#65-Warmboot/Fastboot-consideration) + * [6.5 Warmboot consideration](#65-Warmboot-consideration) * [6.6 Design choices for core-usage & max-techsupport-size argument](#66-Design-choices-for-core-usage-&-max-techsupport-sizeargument) * [7. Test Plan](#7-Test-Plan) @@ -68,6 +67,8 @@ The naming format and compression is governed by the script `/usr/local/bin/core ## 4. 
Schema Additions +### Config DB + #### AUTO_TECHSUPPORT|global ``` key = "AUTO_TECHSUPPORT|global" @@ -97,7 +98,7 @@ cooloff = 600; # Minimum Time in seconds, between two su auto_techsupport = enabled|disabled; # Enable/Disable this feature per-docker ``` -### 4.1 YANG Model +#### YANG Model ``` module sonic-auto_techsupport { @@ -184,6 +185,19 @@ module sonic-auto_techsupport { Note: The "cooloff" & "auto_techsupport" will be added to the YANG Model for FEATURE Table +### State DB + +#### AUTO_TECHSUPPORT|TS_CORE_MAP +``` +key = "AUTO_TECHSUPPORT|TS_CORE_MAP" + = +``` +Eg: +``` +hgetall "AUTO_TECHSUPPORT|TS_CORE_MAP" +sonic_dump_sonic_20210412_223645 = orchagent.1599047232.39.core;1599047233 +sonic_dump_sonic_20210405_202756 = syncd.1617684247.17.core;1617684249 +``` ## 5. CLI Enhancements. @@ -207,6 +221,14 @@ STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE SINCE ------- ------- --------------------- ------------------------ ---------- ------------------------------- Enabled 300 sec 3 200000 KB / 2% 2 days ago Tue 15 Jun 2021 08:09:59 PM UTC +admin@sonic:~$ show auto-techsupport history +TECHSUPPORT DUMP INVOCATION REASON +-------------------------------- --------------------------- +sonic_dump_sonic_20210412_223645 orchagent.1599047232.39.core +sonic_dump_sonic_20210405_202756 syncd.1617684247.17.core +sonic_dump_sonic_20210329_183626 Unknown +sonic_dump_sonic_20210412_223645 snmpd.1617916877.41.core + admin@sonic:~$ show feature status Feature State AutoRestart SetOwner cooloff Auto-techsupport -------------- -------- ---------- -------- ------- ---------------- @@ -248,9 +270,9 @@ The coredump-compress script is updated to invoke the `coredump_gen_handler` scr The generate_dump script is updated to invoke the `techsupport_cleanup` script to handle the cleanup of techsupport files -### 6.5 Warmboot/Fastboot consideration +### 6.5 Warmboot consideration -No impact for warmboot/fastboot flows. 
+AUTO_TECHSUPPORT|TS_CORE_MAP table in the State DB will be preserved across Warmboot ### 6.6 Design choices for core-usage & max-techsupport-size argument From cb0df09df181422727f6f09ac08b90d50441e770 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Wed, 4 Aug 2021 15:00:49 -0700 Subject: [PATCH 54/71] Update auto_techsupport_and_coredump_mgmt.md --- doc/auto_techsupport_and_coredump_mgmt.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 5d9684e3783..b32b2e93e20 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -217,9 +217,9 @@ config feature cooloff ``` admin@sonic:~$ show auto-techsupport global -STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE SINCE LAST_TECHSUPPORT_RUN -------- ------- --------------------- ------------------------ ---------- ------------------------------- -Enabled 300 sec 3 200000 KB / 2% 2 days ago Tue 15 Jun 2021 08:09:59 PM UTC +STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE SINCE +------- ------- --------------------- ------------------------ ---------- +Enabled 300 sec 3 200000 KB / 2% 2 days ago admin@sonic:~$ show auto-techsupport history TECHSUPPORT DUMP INVOCATION REASON From 61a07b416d0ecab85833337944928dca5d64150e Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Fri, 6 Aug 2021 18:28:44 -0700 Subject: [PATCH 55/71] Updated field names --- doc/auto_techsupport_and_coredump_mgmt.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index b32b2e93e20..f82f37f9d5e 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -75,11 +75,11 @@ key = "AUTO_TECHSUPPORT|global" state = enabled|disabled; # Enable/Disable the feature globally cooloff = 300; # Minimum Time in seconds, between 
two successive techsupport invocations. Manual Invocations will be considered as well in the cooloff calculation -max-techsupport-size = 10; # A perentage value should be specified. +max_techsupport_size = 10; # A perentage value should be specified. This signifies maximum Size to which /var/dump/ directory can be grown until. The actual value in bytes is calculate based on the available space in the filesystem hosting /var/dump When the limit is crossed, the older techsupport dumps are incrementally deleted -core-usage = 5; # A perentage value should be specified. +core_usage = 5; # A perentage value should be specified. This signifies maximum Size to which /var/core directory can be grown until. The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core When the limit is crossed, the older core files are incrementally deleted @@ -137,7 +137,7 @@ module sonic-auto_techsupport { default "300"; } - leaf max-techsupport-size { + leaf max_techsupport_size { description "A perentage value should be specified. This signifies maximum Size to which /var/core directory can be grown until. The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core @@ -150,7 +150,7 @@ module sonic-auto_techsupport { default "10"; } - leaf core-usage { + leaf core_usage { description "A perentage value should be specified. 
This signifies maximum Size to which /var/core directory can be grown until The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core From f57a473ea6f141ef7afa1e2022b5c4da1507a28f Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 19 Aug 2021 17:05:59 -0700 Subject: [PATCH 56/71] Updated based on enhanced design --- doc/auto_techsupport_and_coredump_mgmt.md | 237 ++++++++++++++-------- 1 file changed, 152 insertions(+), 85 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index f82f37f9d5e..19995afab95 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -10,10 +10,11 @@ * [4. Schema Additions](#4-schema-additions) * [5. CLI Enhancements](#5-cli-enhancements) * [6. Design](#6-design) - * [6.1 coredump_gen_handler script](#61-coredump_gen_handler-script) - * [6.2 techsupport_cleanup script](#62-techsupport_cleanup-script) - * [6.3 Modifications to coredump-compress script](#63-Modifications-to-coredump-compress-script) + * [6.1 Modifications to coredump-compress script](#61-Modifications-to-coredump-compress-script) + * [6.2 coredump_gen_handler script](#62-coredump_gen_handler-script) + * [6.3 Requirements for FEATURE_PROC_INFO table](#63-requirements-for-FEATURE_PROC_INFO-table) * [6.4 Modifications to generate_dump script](#64-Modifications-to-generate-dump-script) + * [6.5 techsupport_cleanup script](#65-techsupport_cleanup-script) * [6.5 Warmboot consideration](#65-Warmboot-consideration) * [6.6 Design choices for core-usage & max-techsupport-size argument](#66-Design-choices-for-core-usage-&-max-techsupport-sizeargument) * [7. Test Plan](#7-Test-Plan) @@ -37,7 +38,7 @@ However if the techsupport invocation can be made event-driven based on core dum ### Global Scope * Techsupport invocation should also be made event-driven based on core dump generation. 
* This is only applicable for the critical processes running inside the dockers. Does not apply for other processes. -* init_cfg.json will be enhanced to include the "global CONFIG" required for this feature (described in section 4) and is enabled by default. +* init_cfg.json will be enhanced to include the "CONFIG" required for this feature (described in section 4) and is enabled by default. * To provide flexibility, a compile time flag "ENABLE_AUTO_TECH_SUPPORT" should be provided to enable/disable the "CONFIG" for this feature. * Users should have the abiliity to globally enable/disable this capability through CLI. @@ -72,7 +73,9 @@ The naming format and compression is governed by the script `/usr/local/bin/core #### AUTO_TECHSUPPORT|global ``` key = "AUTO_TECHSUPPORT|global" -state = enabled|disabled; # Enable/Disable the feature globally +auto_invoke_ts = enabled|disabled; # Enable this to make the Techsupport Invocation event driven based on core-dump generation +coredump_cleanup = enabled|disabled; # Enable Core dump cleanup based on core_usage argument +techsupport_cleanup = enabled|disabled; # Enable Techsupport Dump cleanup based on max_techsupport_size argument cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations. Manual Invocations will be considered as well in the cooloff calculation max_techsupport_size = 10; # A perentage value should be specified. 
@@ -108,9 +111,9 @@ module sonic-auto_techsupport { namespace "http://github.com/Azure/sonic-auto_techsupport"; prefix auto_techsupport; - description "Auto Techsupport Capability in SONiC OS"; + description "Event Driven Techsupport & CoreDump Mgmt Capability in SONiC OS"; - revision 2021-06-17 { + revision 2021-08-09 { description "First Revision"; } @@ -119,59 +122,79 @@ module sonic-auto_techsupport { container AUTO_TECHSUPPORT { description "AUTO_TECHSUPPORT part of config_db.json"; - + container global { - - leaf status { - description "AUTO_TECHSUPPORT status"; + + leaf auto_invoke_ts { + /* Enable this to make the Techsupport Invocation event driven based on core-dump generation*/ + type enumeration { + enum disabled; + enum enabled; + } + } + + leaf coredump_cleanup { + /* Enable Core dump cleanup based on core_usage argument */ + type enumeration { + enum disabled; + enum enabled; + } + } + + leaf techsupport_cleanup { + /* Enable Techsupport Dump cleanup based on max_techsupport_size argument */ type enumeration { - enum disable; - enum enable; + enum disabled; + enum enabled; } - default disable; } leaf cooloff { - description "Minimum Time in seconds, between two successive techsupport invocations by the script."; + /* Minimum Time in seconds, between two successive techsupport invocations by the script. + Configure '0' to explicitly disable */ type uint16; - default "300"; + default "180"; } leaf max_techsupport_size { - description "A perentage value should be specified. - This signifies maximum Size to which /var/core directory can be grown until. - The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core - When the limit is crossed, the older core files are incrementally deleted"; - type uint8{ - range "0..100" { - error-message "Can only be between 1 to 100"; - } + /* + A value between [0,100) should be specified. 
+ Upto two decimal places will be used in the calculation + This signifies maximum Size to which the techsupport dumps in /var/dump directory can be grown until + The actual value in bytes is calculate based on the available space in the filesystem hosting /var/dump + When the limit is crossed, the older core files are incrementally deleted + Configure '0' to explicitly disable + */ + type decimal64 { + fraction-digits 2; + range 0.00..99.99; } default "10"; } leaf core_usage { - description "A perentage value should be specified. - This signifies maximum Size to which /var/core directory can be grown until - The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core - When the limit is crossed, the older core files are deleted." - Disabled by default. Configure '0' to explicitly disable";; - type uint8 { - range "0..100" { - error-message "Can only be between 1 to 100"; - } + /* + A value between [0,100) should be specified. + Upto two decimal places will be used in the calculation + This signifies maximum Size to which the core dumps in /var/core directory can be grown until + The actual value in bytes is calculated based on the available space in the filesystem hosting /var/core + When the limit is crossed, the older core files are deleted + Configure '0' to explicitly disable + */ + type decimal64 { + fraction-digits 2; + range 0..99.99; } default "5"; } - + leaf since { - description "This limits the auto-invoked techsupport to only collect the logs & core-dumps generated since the time provided. - Any valid date string of the formats specified here (https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html) - can be used. 
- If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used"; - type string { - length 1..255; - } + /* + This limits the auto-invoked techsupport to only collect the logs & core-dumps generated since the time provided + Any valid date string of the formats specified here (https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html) + can be used. If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used + */ + type string; default "2 days ago"; } } @@ -190,26 +213,43 @@ Note: The "cooloff" & "auto_techsupport" will be added to the YANG Model for FEA #### AUTO_TECHSUPPORT|TS_CORE_MAP ``` key = "AUTO_TECHSUPPORT|TS_CORE_MAP" - = + = ``` Eg: ``` hgetall "AUTO_TECHSUPPORT|TS_CORE_MAP" -sonic_dump_sonic_20210412_223645 = orchagent.1599047232.39.core;1599047233 -sonic_dump_sonic_20210405_202756 = syncd.1617684247.17.core;1617684249 +sonic_dump_sonic_20210412_223645 = orchagent.1599047232.39.core;1599047233;orchagent +sonic_dump_sonic_20210405_202756 = python3.1617684247.17.core;1617684249;snmp-subagent +``` + +#### AUTO_TECHSUPPORT|FEATURE_PROC_INFO + +CRITICAL_PROC = "AUTO_TECHSUPPORT|FEATURE_PROC_INFO" +``` +key = "AUTO_TECHSUPPORT|FEATURE_PROC_INFO" + = +``` + +Eg: +``` + = "swss;orchagent" + = "snmp;snmp-subagent" + = "lldp;lldp_syncd" ``` ## 5. CLI Enhancements. 
### config cli ``` -config auto-techsupport state -config auto-techsupport cooloff -config auto-techsupport max-techsupport -config auto-techsupport core-usage <0..100> -config auto-techsupport since - -config feature auto-techsupport enabled|disabled> +config auto-techsupport global auto-invoke-ts +config auto-techsupport global coredump-cleanups +config auto-techsupport global techsupport-cleanup +config auto-techsupport global cooloff +config auto-techsupport global max-techsupport +config auto-techsupport global core-usage +config auto-techsupport global since + +config feature autotechsupport enabled|disabled> config feature cooloff ``` @@ -217,63 +257,90 @@ config feature cooloff ``` admin@sonic:~$ show auto-techsupport global -STATUS COOLOFF MAX_TECHSUPPORT_DUMPS MAX_CORE_DUMP_USAGE_SIZE SINCE -------- ------- --------------------- ------------------------ ---------- -Enabled 300 sec 3 200000 KB / 2% 2 days ago +AUTO INVOKE TS COREDUMP CLEANUP TECHSUPPORT CLEANUP COOLOFF MAX TECHSUPPORT SIZE CORE USAGE SINCE +---------------- ------------------ --------------------- --------- ---------------------- ------------ ---------- +enabled enabled enabled 180 12.23 5 2 days ago + admin@sonic:~$ show auto-techsupport history -TECHSUPPORT DUMP INVOCATION REASON --------------------------------- --------------------------- -sonic_dump_sonic_20210412_223645 orchagent.1599047232.39.core -sonic_dump_sonic_20210405_202756 syncd.1617684247.17.core -sonic_dump_sonic_20210329_183626 Unknown -sonic_dump_sonic_20210412_223645 snmpd.1617916877.41.core - -admin@sonic:~$ show feature status -Feature State AutoRestart SetOwner cooloff Auto-techsupport --------------- -------- ---------- -------- ------- ---------------- -swss enabled enabled 600 enabled -..... 
- +Techsupport Dump Triggered By Critical Process +--------------------------------------- ----------------------------- ------------------ +sonic_dump_sonic_20210819_192558.tar.gz python3.1629401152.23.core.gz snmp-subagent + + +admin@sonic:~$ show feature autotechsupport +Feature Auto Techsupport Cooloff (Sec) +-------------- ------------------ --------------- +bgp enabled 300 +database enabled 300 +dhcp_relay enabled 300 +lldp enabled 300 +macsec enabled 300 +mgmt-framework enabled 300 +nat enabled 300 +pmon enabled 300 +radv enabled 300 +sflow enabled 300 +snmp enabled 300 +swss enabled 300 +syncd enabled 300 +teamd enabled 300 +telemetry enabled 300 ``` ## 6. Design -### 6.1 coredump_gen_handler script +### 6.1 Modifications to coredump-compress script -A script under the name `coredump_gen_handler` is added to `/usr/local/bin/` directory which will be invoked after a coredump is generated. The script first checks if this feature is enabled by the user. The script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. +The coredump-compress script is updated to invoke the `coredump_gen_handler` script once it is done writing the core file to /var/core. -The script invokes the show techsupport command, if the global cooloff & the per-docker cooloff period has passed. The script will also independently check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. +### 6.2 coredump_gen_handler script -Potential Syslog messages which can be logged are: -``` -DATE sonic INFO coredump_gen_handler[pid]: Cooloff period has not yet passed. 
No Techsupport Invocation is performed -DATE sonic NOTICE coredump_gen_handler[pid]: Techsupport Invocation is successful, sonic_dump_sonic_20210721_235228.tar.gz is created in response to the coredump orchagent.1626916631.117644.core.gz -DATE sonic ERR coredump_gen_handler[pid]: Techsupport Invocation failed, No techsupport dump was created in the /var/dump directory -DATE sonic INFO coredump_gen_handler[pid]: No Cleanup process is initiated since the core-usage param is not configured -DATE sonic NOTICE coredump_gen_handler[pid]: /var/core cleanup performed. 12456 bytes are cleared. -``` -### 6.2 techsupport_cleanup script +A script under the name `coredump_gen_handler.py` is added to `/usr/local/bin/` directory which will be invoked after a coredump is generated. The script first checks if this feature is enabled by the user. The script then verifies if a core dump file is created within the last 20 sec and if yes, it moves forward. -A script under the name `techsupport_cleanup` is added to `/usr/local/bin/` directory which will be invoked after a techsupport dump is created. The script first checks if the feature is enabled by the user. It then checks if the limit configured by the user has crossed and deletes the old techsupport files, if any. +The script invokes the show techsupport command, if the global cooloff & the per-docker cooloff period has passed. The script will also check if the Max Size configured by the user has already exceeded and if yes deletes the core files incrementally. Potential Syslog messages which can be logged are: ``` -DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 +DATE sonic INFO coredump_gen_handler[pid]: Global Cooloff period has not passed. Techsupport Invocation is skipped. Core: python3.1629401152.23.core.gz +DATE sonic INFO coredump_gen_handler[pid]: Process Cooloff period for snmp has not passed.Techsupport Invocation is skipped. 
Core: python3.1629401152.23.core.gz +DATE sonic INFO coredump_gen_handler[pid]: "show techsupport --since '2 days ago'" is successful, sonic_dump_sonic_20210721_235228.tar.gz is created +DATE sonic INFO coredump_gen_handler[pid]: No Cleanup is performed, current size occupied: 456 MB +DATE sonic INFO coredump_gen_handler[pid]: 12 MB deleted from /var/core. +DATE sonic NOTICE coredump_gen_handler[pid]: No Corresponding Exit event info was found for python3.1629401152.23.core.gz. Techsupport Invocation is skipped +DATE sonic NOTICE coredump_gen_handler[pid]: auto_invoke_ts is not enabled. No Techsupport Invocation will be performed. core: python3.1629401152.23.core.gz +DATE sonic NOTICE coredump_gen_handler[pid]: auto-techsupport feature for swss is not enabled. Techsupport Invocation is skipped. core: python3.1629401152.23.core.gz +DATE sonic ERR coredump_gen_handler[pid]: "show techsupport --since '2 days ago'" was run, but no techsupport dump is found ``` -### 6.3 Modifications to coredump-compress script +### 6.3 Requirements for FEATURE_PROC_INFO table -The coredump-compress script is updated to invoke the `coredump_gen_handler` script once it is done writing the core file to /var/core. +A coredump generate will be of format ...core.gz. comm name is typically the executable file name. The dump name is the only information directly available to coredump_gen_handler script. And Just by looking at this, it not possible to infer if the coredump generated is of a particular critical process. That missed information is read from AUTO_TECHSUPPORT|FEATURE_PROC_INFO table. + +Producer for this table is the supervisor-proc-exit-listener script running inside every docker. This script is an event listener for PROC_EXIT & PROC_RUNNING events for the processes running inside the docker and is naturally the right fit to populate the AUTO_TECHSUPPORT|FEATURE_PROC_INFO table. 
+ +1) During a PROC_RUNNING Event: The comm information is read from the /proc/<pid>/comm file and saved in a local cache. +2) A coredump will certainly trigger a PROC_EXIT event, and the exit-listener writes an entry of the format specified in section 4 to the STATE DB. + +coredump_gen_handler.py consumes this data and uses it for decisions based on the info written to this table. ### 6.4 Modifications to generate_dump script The generate_dump script is updated to invoke the `techsupport_cleanup` script to handle the cleanup of techsupport files -### 6.5 Warmboot consideration +### 6.5 techsupport_cleanup script + +A script under the name `techsupport_cleanup.py` is added to `/usr/local/bin/` directory which will be invoked after a techsupport dump is created. The script first checks if the feature is enabled by the user. It then checks if the limit configured by the user has been crossed and deletes the old techsupport files, if any. -AUTO_TECHSUPPORT|TS_CORE_MAP table in the State DB will be preserved across Warmboot +Potential Syslog messages which can be logged are: +``` +DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed.
current number of dumps: 4 +``` + +### 6.5 Warmboot consideration +No changes to this flow + ### 6.6 Design choices for core-usage & max-techsupport-size argument Firstly, Size-based cleanup design was inspired from MaxUse= Argument in the systemd-coredump.conf https://www.freedesktop.org/software/systemd/man/coredump.conf.html From 682d3e408a3efb4b9c327afbc2ad8d503fca20da Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 19 Aug 2021 17:16:30 -0700 Subject: [PATCH 57/71] Minor Updates --- doc/auto_techsupport_and_coredump_mgmt.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 19995afab95..0f0a7790961 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -224,7 +224,6 @@ sonic_dump_sonic_20210405_202756 = python3.1617684247.17.core;1617684249;snmp-su #### AUTO_TECHSUPPORT|FEATURE_PROC_INFO -CRITICAL_PROC = "AUTO_TECHSUPPORT|FEATURE_PROC_INFO" ``` key = "AUTO_TECHSUPPORT|FEATURE_PROC_INFO" = @@ -305,17 +304,18 @@ Potential Syslog messages which can be logged are: DATE sonic INFO coredump_gen_handler[pid]: Global Cooloff period has not passed. Techsupport Invocation is skipped. Core: python3.1629401152.23.core.gz DATE sonic INFO coredump_gen_handler[pid]: Process Cooloff period for snmp has not passed.Techsupport Invocation is skipped. Core: python3.1629401152.23.core.gz DATE sonic INFO coredump_gen_handler[pid]: "show techsupport --since '2 days ago'" is successful, sonic_dump_sonic_20210721_235228.tar.gz is created -DATE sonic INFO coredump_gen_handler[pid]: No Cleanup is performed, current size occupied: 456 MB +DATE sonic INFO coredump_gen_handler[pid]: core-usage argument is not set. No Cleanup is performed, current size occupied: 456 MB DATE sonic INFO coredump_gen_handler[pid]: 12 MB deleted from /var/core. 
-DATE sonic NOTICE coredump_gen_handler[pid]: No Corresponding Exit event info was found for python3.1629401152.23.core.gz. Techsupport Invocation is skipped +DATE sonic INFO coredump_gen_handler[pid]: No Corresponding Exit event info was found for python3.1629401152.23.core.gz. Techsupport Invocation is skipped DATE sonic NOTICE coredump_gen_handler[pid]: auto_invoke_ts is not enabled. No Techsupport Invocation will be performed. core: python3.1629401152.23.core.gz DATE sonic NOTICE coredump_gen_handler[pid]: auto-techsupport feature for swss is not enabled. Techsupport Invocation is skipped. core: python3.1629401152.23.core.gz +DATE sonic NOTICE coredump_gen_handler[pid]: coredump_cleanup is disabled. No cleanup is performed DATE sonic ERR coredump_gen_handler[pid]: "show techsupport --since '2 days ago'" was run, but no techsupport dump is found ``` ### 6.3 Requirements for FEATURE_PROC_INFO table -A coredump generate will be of format ...core.gz. comm name is typically the executable file name. The dump name is the only information directly available to coredump_gen_handler script. And Just by looking at this, it not possible to infer if the coredump generated is of a particular critical process. That missed information is read from AUTO_TECHSUPPORT|FEATURE_PROC_INFO table. +A generated coredump will be of the format `<comm>.<timestamp>.<pid>.core.gz`. The comm name is typically the executable file name. The dump name is the only information directly available to the coredump_gen_handler script, and just by looking at it, it is not possible to infer whether the generated coredump belongs to a particular critical process. That information is read from the AUTO_TECHSUPPORT|FEATURE_PROC_INFO table. Producer for this table is the supervisor-proc-exit-listener script running inside every docker. This script is an event listener for PROC_EXIT & PROC_RUNNING events for the processes running inside the docker and is naturally the right fit to populate the AUTO_TECHSUPPORT|FEATURE_PROC_INFO table.
@@ -334,7 +334,8 @@ A script under the name `techsupport_cleanup.py` is added to `/usr/local/bin/` d Potential Syslog messages which can be logged are: ``` -DATE sonic NOTICE techsupport_cleanup[pid]: /var/dump/ cleanup is performed. current number of dumps: 4 +DATE sonic NOTICE techsupport_cleanup[pid]: techsupport_cleanup is disabled. No cleanup is performed +DATE sonic INFO coredump_gen_handler[pid]: max-techsupport-size argument is not set. No Cleanup is performed, current size occupied: 456 MB ``` ### 6.5 Warmboot consideration From c1d5c003aebcd3e3863230d5ac8a401c4e6812f8 Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Thu, 19 Aug 2021 17:35:08 -0700 Subject: [PATCH 58/71] Updated Schema --- doc/auto_techsupport_and_coredump_mgmt.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index 0f0a7790961..b58777ce4ee 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -226,14 +226,14 @@ sonic_dump_sonic_20210405_202756 = python3.1617684247.17.core;1617684249;snmp-su ``` key = "AUTO_TECHSUPPORT|FEATURE_PROC_INFO" - = + = ``` Eg: ``` - = "swss;orchagent" - = "snmp;snmp-subagent" - = "lldp;lldp_syncd" + = + = + = ``` ## 5. CLI Enhancements. 
From f3b3e11802d48f2ca74810bff35caad1ad1e263d Mon Sep 17 00:00:00 2001 From: Vivek Reddy Date: Sun, 29 Aug 2021 19:33:30 -0700 Subject: [PATCH 59/71] Addressed Comments --- doc/auto_techsupport_and_coredump_mgmt.md | 174 ++++++++++++---------- 1 file changed, 97 insertions(+), 77 deletions(-) diff --git a/doc/auto_techsupport_and_coredump_mgmt.md b/doc/auto_techsupport_and_coredump_mgmt.md index b58777ce4ee..14a52dd0ae1 100644 --- a/doc/auto_techsupport_and_coredump_mgmt.md +++ b/doc/auto_techsupport_and_coredump_mgmt.md @@ -24,13 +24,14 @@ | Rev | Date | Author | Change Description | |:---:|:-----------:|:-------------------------|:----------------------| | 1.0 | 06/22/2021 | Vivek Reddy Karri | Auto Invocation of Techsupport, triggered by a core dump | +| 1.1 | TBD | Vivek Reddy Karri | Add the YANG Model, the autogen CLI and the AUTO_TECHSUPPORT\|RATE_LIMIT_INTERVAL table | | 2.0 | TBD | Vivek Reddy Karri | Extending Support for Kernel Dumps | ## About this Manual This document describes the details of the system which facilitates the auto techsupport invocation support in SONiC. The auto invocation is triggered when any critical process inside the docker crashes and a core dump is generated. ## 1. Overview -Currently, techsupport is run by invoking `show techsupport` either by orchestration tools like Jenkins or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. +Currently, techsupport is run by invoking `show techsupport` either by orchestration tools or manually. The techsupport dump also collects any core dump files available in the `/var/core/` directory. However if the techsupport invocation can be made event-driven based on core dump generation, that would definitely improve the debuggability. That is the overall idea behind this HLD.
All the high-level requirements are summarized in the next section @@ -43,18 +44,17 @@ However if the techsupport invocation can be made event-driven based on core dum * Users should have the abiliity to globally enable/disable this capability through CLI. ### Configurable Params -* A configurable "cooloff" should be introduced to limit the number of techsupport invocations. +* A configurable "rate_limit_interval" should be introduced to limit the number of consecutive techsupport invocations. * The existing "--since" option in techsupport should be leveraged and this should be a configurable parameter for this feature ### Per-docker Scope * Should provide a per-docker configurable granularity for this feature. * Per-docker enable/disable capability should be achieved through FEATURE table. -* Per-docker cooloff capability should is achieved through FEATURE table. +* Per-docker rate_limit_interval capability should also be provided. * Changes to per-docker config's will apply to all the critical processes inside the corresponding docker. -* Existing FEATURE CLI & Table should be used to apply the Configuration ### Invocation Rules -* Auto techsupport invocation should only happen when both the global cooloff and per-docker cooloff period is passed. +* Auto techsupport invocation should only happen when both the global rate_limit_interval and the per-docker rate_limit_interval periods have passed. * Feature should be enabled globally and also per-docker, for this to apply on any of the critical processes running inside that docker.
### Core & Techsupport Cleanup @@ -70,38 +70,42 @@ The naming format and compression is governed by the script `/usr/local/bin/core ### Config DB -#### AUTO_TECHSUPPORT|global +#### AUTO_TECHSUPPORT|GLOBAL ``` key = "AUTO_TECHSUPPORT|global" auto_invoke_ts = enabled|disabled; # Enable this to make the Techsupport Invocation event driven based on core-dump generation coredump_cleanup = enabled|disabled; # Enable Core dump cleanup based on core_usage argument techsupport_cleanup = enabled|disabled; # Enable Techsupport Dump cleanup based on max_techsupport_size argument -cooloff = 300; # Minimum Time in seconds, between two successive techsupport invocations. - Manual Invocations will be considered as well in the cooloff calculation -max_techsupport_size = 10; # A perentage value should be specified. - This signifies maximum Size to which /var/dump/ directory can be grown until. - The actual value in bytes is calculate based on the available space in the filesystem hosting /var/dump - When the limit is crossed, the older techsupport dumps are incrementally deleted -core_usage = 5; # A perentage value should be specified. - This signifies maximum Size to which /var/core directory can be grown until. - The actual value in bytes is calculate based on the available space in the filesystem hosting /var/core - When the limit is crossed, the older core files are incrementally deleted -since = "2 days ago"; # This limits the auto-invoked techsupport to only collect the logs & core-dumps generated since the time provided. - Any valid date string of the formats specified here (https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html) - can be used. - If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used. +rate_limit_interval = 300; # Minimum Time in seconds, between two successive techsupport invocations. 
+ Manual Invocations will be considered as well in the cooloff calculation +max_techsupport_size = 10; # A percentage value should be specified. + This signifies the maximum size to which the /var/dump/ directory can grow. + The actual value in bytes is calculated based on the available space in the filesystem hosting /var/dump + When the limit is crossed, the older techsupport dumps are incrementally deleted +core_usage = 5; # A percentage value should be specified. + This signifies the maximum size to which the /var/core directory can grow. + The actual value in bytes is calculated based on the available space in the filesystem hosting /var/core + When the limit is crossed, the older core files are incrementally deleted +since = "2 days ago"; # This limits the auto-invoked techsupport to only collect the logs & core-dumps generated since the time provided. + Any valid date string of the formats specified here (https://www.gnu.org/software/coreutils/manual/html_node/Date-input-formats.html) + can be used. If this value is not explicitly configured or a non-valid string is provided, a default value of "2 days ago" is used. ``` -#### FEATURE Table +#### AUTO_TECHSUPPORT|RATE_LIMIT_INTERVAL +``` +# Minimum Time in seconds, between two successive techsupport invocations because of the same process +# The idea here is not to let a periodically crashing process invoke the techsupport until a cooloff is met + =