Skip to content

Commit 3a8ba71

Browse files
authored
Fix rasdaemon crash during bootup on AMD CPU (#19023)
- Why I did it Booting SONiC on an AMD EPYC 16-Core CPU causes rasdaemon to crash. This is not a major blocker because rasdaemon eventually restarts and is stable after a point. Coredump stack trace: [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". Core was generated by `/usr/sbin/rasdaemon -f -r'. Program terminated with signal SIGBUS, Bus error. #0 0x00007f74f62af7f4 in sqlite3_finalize () from /lib/x86_64-linux-gnu/libsqlite3.so.0 [Current thread is 1 (Thread 0x7f73c8ff96c0 (LWP 17416))] Known issue for rasdaemon: mchehab/rasdaemon#77 Fixed here: mchehab/rasdaemon@f1ea763 Unfortunately, this fix is not present in the default bookworm version, so the fix was backported and rasdaemon is now compiled from source. Here is the patch: https://sources.debian.org/patches/rasdaemon/0.8.0-2/0001-Check-CPUs-online-not-configured.patch/ - How I did it - How to verify it Booted the image built with these changes and no issue was observed. Signed-off-by: Vivek Reddy <[email protected]>
1 parent c2b0c7b commit 3a8ba71

File tree

8 files changed

+88
-6
lines changed

8 files changed

+88
-6
lines changed

build_debian.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -420,12 +420,6 @@ EOF
420420
# override tcpdump profile to allow tcpdump access TACACS config file.
421421
sudo cp files/apparmor/usr.bin.tcpdump $FILESYSTEM_ROOT/etc/apparmor.d/local/usr.bin.tcpdump
422422

423-
if [[ $CONFIGURED_ARCH == amd64 ]]; then
424-
## Pre-install the fundamental packages for amd64 (x86)
425-
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install \
426-
rasdaemon
427-
fi
428-
429423
## Set /etc/shadow permissions to -rw-------.
430424
sudo LANG=c chroot $FILESYSTEM_ROOT chmod 600 /etc/shadow
431425

files/build_templates/sonic_debian_extension.j2

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,11 @@ sudo cp $IMAGE_CONFIGS/corefile_uploader/core_analyzer.rc.json $FILESYSTEM_ROOT_
510510
sudo chmod og-rw $FILESYSTEM_ROOT_ETC_SONIC/core_analyzer.rc.json
511511

512512
if [[ $CONFIGURED_ARCH == amd64 ]]; then
513+
# Install rasdaemon package
514+
# NOTE: Can be installed from debian directly when we move to trixie
515+
sudo dpkg --root=$FILESYSTEM_ROOT -i $debs_path/rasdaemon_*.deb || \
516+
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install -f
517+
513518
# Rasdaemon service configuration. Use timer to start rasdaemon with a delay for better fast/warm boot performance
514519
sudo cp $IMAGE_CONFIGS/rasdaemon/rasdaemon.timer $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
515520
sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT systemctl disable rasdaemon.service

rules/rasdaemon.dep

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
SPATH := $($(RASDAEMON)_SRC_PATH)
3+
DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/rasdaemon.mk rules/rasdaemon.dep
4+
DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST)
5+
DEP_FILES += $(shell git ls-files $(SPATH))
6+
7+
$(RASDAEMON)_CACHE_MODE := GIT_CONTENT_SHA
8+
$(RASDAEMON)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST)
9+
$(RASDAEMON)_DEP_FILES := $(DEP_FILES)
10+

rules/rasdaemon.mk

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# rasdaemon package
2+
3+
RASDAEMON_VERSION = 0.6.8-1
4+
export RASDAEMON_VERSION
5+
6+
RASDAEMON = rasdaemon_$(RASDAEMON_VERSION)_$(CONFIGURED_ARCH).deb
7+
$(RASDAEMON)_SRC_PATH = $(SRC_PATH)/rasdaemon
8+
SONIC_MAKE_DEBS += $(RASDAEMON)

slave.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1393,6 +1393,7 @@ $(addprefix $(TARGET_PATH)/, $(SONIC_INSTALLERS)) : $(TARGET_PATH)/% : \
13931393
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY3)) \
13941394
$(if $(findstring y,$(PDDF_SUPPORT)),$(addprefix $(PYTHON_WHEELS_PATH)/,$(PDDF_PLATFORM_API_BASE_PY2))) \
13951395
$(if $(findstring y,$(PDDF_SUPPORT)),$(addprefix $(PYTHON_WHEELS_PATH)/,$(PDDF_PLATFORM_API_BASE_PY3))) \
1396+
$(if $(findstring amd64,$(CONFIGURED_ARCH)),$(addprefix $(IMAGE_DISTRO_DEBS_PATH)/,$(RASDAEMON))) \
13961397
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MODELS_PY3)) \
13971398
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_CTRMGRD)) \
13981399
$(addprefix $(FILES_PATH)/,$($(SONIC_CTRMGRD)_FILES)) \

sonic-slave-bookworm/Dockerfile.j2

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,9 @@ RUN apt-get update && apt-get install -y \
337337
qemu-kvm \
338338
libvirt-clients \
339339
python3-pexpect \
340+
# For rasdaemon build
341+
libsqlite3-dev \
342+
libgettextpo-dev \
340343
{%- endif %}
341344
# For ntp
342345
autogen \
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
From f1ea76375281001cdf4a048c1a4a24d86c6fbe48 Mon Sep 17 00:00:00 2001
2+
From: Zeph / Liz Loss-Cutler-Hull <[email protected]>
3+
Date: Sun, 9 Jul 2023 04:57:19 -0700
4+
Subject: [PATCH] Check CPUs online, not configured.
5+
6+
When the number of CPUs detected is greater than the number of CPUs in
7+
the system, rasdaemon will crash when it receives some events.
8+
9+
Looking deeper, we also fail to use the poll method for similar reasons
10+
in this case.
11+
12+
All of this can be prevented by checking to see how many CPUs are
13+
currently online (sysconf(_SC_NPROCESSORS_ONLN)) instead of how many
14+
CPUs the current kernel was configured to support
15+
(sysconf(_SC_NPROCESSORS_CONF)).
16+
17+
For the kernel side of the discussion, see https://lore.kernel.org/lkml/CAM6Wdxft33zLeeXHhmNX5jyJtfGTLiwkQSApc=10fqf+rQh9DA@mail.gmail.com/T/
18+
Signed-off-by: Mauro Carvalho Chehab <[email protected]>
19+
---
20+
ras-events.c | 2 +-
21+
1 file changed, 1 insertion(+), 1 deletion(-)
22+
23+
diff --git a/ras-events.c b/ras-events.c
24+
index a82dab2..5935163 100644
25+
--- a/ras-events.c
26+
+++ b/ras-events.c
27+
@@ -350,7 +350,7 @@ static void parse_ras_data(struct pthread_data *pdata, struct kbuffer *kbuf,
28+
29+
static int get_num_cpus(struct ras_events *ras)
30+
{
31+
- return sysconf(_SC_NPROCESSORS_CONF);
32+
+ return sysconf(_SC_NPROCESSORS_ONLN);
33+
#if 0
34+
char fname[MAX_PATH + 1];
35+
int num_cpus = 0;
36+
--
37+
2.36.1
38+

src/rasdaemon/Makefile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
.ONESHELL:
2+
SHELL = /bin/bash
3+
.SHELLFLAGS += -e
4+
5+
MAIN_TARGET = rasdaemon_$(RASDAEMON_VERSION)_$(CONFIGURED_ARCH).deb
6+
7+
$(addprefix $(DEST)/, $(MAIN_TARGET)): $(DEST)/% :
8+
rm -rf rasdaemon/
9+
10+
# Checkout Repository
11+
git clone https://salsa.debian.org/tai271828/rasdaemon.git -b debian/$(RASDAEMON_VERSION)
12+
13+
pushd ./rasdaemon
14+
# Patch
15+
git apply ../0001-Check-CPUs-online-not-configured.patch
16+
ifeq ($(CROSS_BUILD_ENVIRON), y)
17+
dpkg-buildpackage -rfakeroot -b -us -uc -a$(CONFIGURED_ARCH) -Pcross,nocheck -j$(SONIC_CONFIG_MAKE_JOBS) --admindir $(SONIC_DPKG_ADMINDIR)
18+
else
19+
dpkg-buildpackage -rfakeroot -b -us -uc -j$(SONIC_CONFIG_MAKE_JOBS) --admindir $(SONIC_DPKG_ADMINDIR)
20+
endif
21+
popd
22+
23+
mv $* $(DEST)/

0 commit comments

Comments
 (0)