Skip to content

Commit 2ddefb3

Browse files
lhotari authored and manas-ctds committed
[improve][ci] Add Netty leak detection reporting to Pulsar CI (apache#24272)
(cherry picked from commit f51123c) (cherry picked from commit fcdf7b2)
1 parent e5dbd04 commit 2ddefb3

File tree

28 files changed

+616
-34
lines changed

28 files changed

+616
-34
lines changed

.github/workflows/pulsar-ci-flaky.yaml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ on:
2626
- pulsar-*
2727
- 4.0_ds
2828
schedule:
29-
# scheduled job with JDK 17
30-
- cron: '0 12 * * *'
3129
# scheduled job with JDK 21
30+
- cron: '0 12 * * *'
31+
# scheduled job with JDK 17
3232
# if cron expression is changed, make sure to update the expression in jdk_major_version step in preconditions job
3333
- cron: '0 6 * * *'
3434
workflow_dispatch:
@@ -62,6 +62,15 @@ on:
6262
required: true
6363
type: number
6464
default: 10000
65+
netty_leak_detection:
66+
description: 'Controls Netty leak detection. When set to "report", Netty leak detection is enabled. When set to "fail_on_leak", Netty leak detection is enabled and a build job will fail if leaks are detected. When set to "off", Netty leak detection is disabled.'
67+
required: true
68+
type: choice
69+
options:
70+
- 'report'
71+
- 'fail_on_leak'
72+
- 'off'
73+
default: 'report'
6574

6675
concurrency:
6776
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}${{ github.event_name == 'workflow_dispatch' && github.event.inputs.jdk_major_version || '' }}
@@ -85,7 +94,8 @@ jobs:
8594
need_owasp: ${{ steps.changes.outputs.need_owasp }}
8695
collect_coverage: ${{ steps.check_coverage.outputs.collect_coverage }}
8796
jdk_major_version: ${{ steps.jdk_major_version.outputs.jdk_major_version }}
88-
97+
java_non_tests: ${{ steps.changes.outputs.java_non_tests }}
98+
netty_leak_detection: ${{ steps.netty_leak_detection.outputs.netty_leak_detection }}
8999
steps:
90100
- name: Cancel scheduled jobs in forks by default
91101
if: ${{ github.repository != 'apache/pulsar' && github.event_name == 'schedule' }}
@@ -137,6 +147,13 @@ jobs:
137147
|| (github.event_name == 'workflow_dispatch' && github.event.inputs.collect_coverage == 'true')
138148
}}" >> $GITHUB_OUTPUT
139149
150+
- name: Set Netty leak detection mode
151+
id: netty_leak_detection
152+
run: |
153+
echo "netty_leak_detection=${{
154+
github.event_name == 'workflow_dispatch' && github.event.inputs.netty_leak_detection || 'report'
155+
}}" >> $GITHUB_OUTPUT
156+
140157
- name: Check if the PR has been approved for testing
141158
if: ${{ steps.check_changes.outputs.docs_only != 'true' && github.repository == 'apache/pulsar' && github.event_name == 'pull_request' }}
142159
env:
@@ -157,6 +174,8 @@ jobs:
157174
TRACE_TEST_RESOURCE_CLEANUP_DIR: ${{ github.workspace }}/target/trace-test-resource-cleanup
158175
THREAD_LEAK_DETECTOR_WAIT_MILLIS: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.thread_leak_detector_wait_millis || 10000 }}
159176
THREAD_LEAK_DETECTOR_DIR: ${{ github.workspace }}/target/thread-leak-dumps
177+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
178+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
160179
runs-on: ubuntu-22.04
161180
timeout-minutes: 100
162181
if: ${{ needs.preconditions.outputs.docs_only != 'true' }}
@@ -227,6 +246,10 @@ jobs:
227246
cat threadleak*.txt | awk '/^Summary:/ {print "::warning::" $0 "\n"; next} {print}'
228247
fi
229248
249+
- name: Report detected Netty leaks
250+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
251+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
252+
230253
- name: Create Jacoco reports
231254
if: ${{ needs.preconditions.outputs.collect_coverage == 'true' }}
232255
continue-on-error: true
@@ -269,6 +292,7 @@ jobs:
269292
/tmp/*.hprof
270293
**/hs_err_*.log
271294
**/core.*
295+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
272296
${{ env.TRACE_TEST_RESOURCE_CLEANUP_DIR }}/*
273297
${{ env.THREAD_LEAK_DETECTOR_DIR }}/*
274298
retention-days: 7

.github/workflows/pulsar-ci.yaml

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@ on:
6262
required: true
6363
type: number
6464
default: 10000
65+
netty_leak_detection:
66+
description: 'Controls Netty leak detection. When set to "report", Netty leak detection is enabled. When set to "fail_on_leak", Netty leak detection is enabled and a build job will fail if leaks are detected. When set to "off", Netty leak detection is disabled.'
67+
required: true
68+
type: choice
69+
options:
70+
- 'report'
71+
- 'fail_on_leak'
72+
- 'off'
73+
default: 'report'
6574

6675
concurrency:
6776
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}${{ github.event_name == 'workflow_dispatch' && github.event.inputs.jdk_major_version || '' }}
@@ -86,6 +95,7 @@ jobs:
8695
collect_coverage: ${{ steps.check_coverage.outputs.collect_coverage }}
8796
jdk_major_version: ${{ steps.jdk_major_version.outputs.jdk_major_version }}
8897
java_non_tests: ${{ steps.changes.outputs.java_non_tests }}
98+
netty_leak_detection: ${{ steps.netty_leak_detection.outputs.netty_leak_detection }}
8999
steps:
90100
- name: Cancel scheduled jobs in forks by default
91101
if: ${{ github.repository != 'apache/pulsar' && github.event_name == 'schedule' }}
@@ -137,6 +147,13 @@ jobs:
137147
|| (github.event_name == 'workflow_dispatch' && github.event.inputs.collect_coverage == 'true')
138148
}}" >> $GITHUB_OUTPUT
139149
150+
- name: Set Netty leak detection mode
151+
id: netty_leak_detection
152+
run: |
153+
echo "netty_leak_detection=${{
154+
github.event_name == 'workflow_dispatch' && github.event.inputs.netty_leak_detection || 'report'
155+
}}" >> $GITHUB_OUTPUT
156+
140157
- name: Check if the PR has been approved for testing
141158
if: ${{ steps.check_changes.outputs.docs_only != 'true' && github.repository == 'apache/pulsar' && github.event_name == 'pull_request' }}
142159
env:
@@ -233,6 +250,8 @@ jobs:
233250
TRACE_TEST_RESOURCE_CLEANUP_DIR: ${{ github.workspace }}/target/trace-test-resource-cleanup
234251
THREAD_LEAK_DETECTOR_WAIT_MILLIS: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.thread_leak_detector_wait_millis || 10000 }}
235252
THREAD_LEAK_DETECTOR_DIR: ${{ github.workspace }}/target/thread-leak-dumps
253+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
254+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
236255
runs-on: ubuntu-22.04
237256
timeout-minutes: ${{ matrix.timeout || 60 }}
238257
needs: ['preconditions', 'build-and-license-check']
@@ -350,6 +369,10 @@ jobs:
350369
cat threadleak*.txt | awk '/^Summary:/ {print "::warning::" $0 "\n"; next} {print}'
351370
fi
352371
372+
- name: Report detected Netty leaks
373+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
374+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
375+
353376
- name: Upload Surefire reports
354377
uses: actions/upload-artifact@v4
355378
if: ${{ !success() || env.TRACE_TEST_RESOURCE_CLEANUP != 'off' }}
@@ -367,6 +390,7 @@ jobs:
367390
/tmp/*.hprof
368391
**/hs_err_*.log
369392
**/core.*
393+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
370394
${{ env.TRACE_TEST_RESOURCE_CLEANUP_DIR }}/*
371395
${{ env.THREAD_LEAK_DETECTOR_DIR }}/*
372396
retention-days: 7
@@ -555,6 +579,8 @@ jobs:
555579
PULSAR_TEST_IMAGE_NAME: apachepulsar/java-test-image:latest
556580
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
557581
CI_JDK_MAJOR_VERSION: ${{ needs.preconditions.outputs.jdk_major_version }}
582+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
583+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
558584
strategy:
559585
fail-fast: false
560586
matrix:
@@ -703,6 +729,10 @@ jobs:
703729
report_paths: 'test-reports/TEST-*.xml'
704730
annotate_only: 'true'
705731

732+
- name: Report detected Netty leaks
733+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
734+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
735+
706736
- name: Upload Surefire reports
707737
uses: actions/upload-artifact@v4
708738
if: ${{ !success() }}
@@ -711,6 +741,19 @@ jobs:
711741
path: surefire-reports
712742
retention-days: 7
713743

744+
- name: Upload possible heap dump, core dump or crash files
745+
uses: actions/upload-artifact@v4
746+
if: ${{ always() }}
747+
with:
748+
name: Integration-${{ matrix.upload_name || matrix.group }}-dumps
749+
path: |
750+
/tmp/*.hprof
751+
**/hs_err_*.log
752+
**/core.*
753+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
754+
retention-days: 7
755+
if-no-files-found: ignore
756+
714757
- name: Upload container logs
715758
uses: actions/upload-artifact@v4
716759
if: ${{ !success() }}
@@ -973,6 +1016,8 @@ jobs:
9731016
PULSAR_TEST_IMAGE_NAME: apachepulsar/pulsar-test-latest-version:latest
9741017
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
9751018
CI_JDK_MAJOR_VERSION: ${{ needs.preconditions.outputs.jdk_major_version }}
1019+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
1020+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
9761021
strategy:
9771022
fail-fast: false
9781023
matrix:
@@ -1080,6 +1125,10 @@ jobs:
10801125
report_paths: 'test-reports/TEST-*.xml'
10811126
annotate_only: 'true'
10821127

1128+
- name: Report detected Netty leaks
1129+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
1130+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
1131+
10831132
- name: Upload container logs
10841133
uses: actions/upload-artifact@v4
10851134
if: ${{ !success() }}
@@ -1097,6 +1146,19 @@ jobs:
10971146
path: surefire-reports
10981147
retention-days: 7
10991148

1149+
- name: Upload possible heap dump, core dump or crash files
1150+
uses: actions/upload-artifact@v4
1151+
if: ${{ always() }}
1152+
with:
1153+
name: System-${{ matrix.group }}-dumps
1154+
path: |
1155+
/tmp/*.hprof
1156+
**/hs_err_*.log
1157+
**/core.*
1158+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
1159+
retention-days: 7
1160+
if-no-files-found: ignore
1161+
11001162
- name: Wait for ssh connection when build fails
11011163
# ssh access is enabled for builds in own forks
11021164
uses: ./.github/actions/ssh-access
@@ -1203,6 +1265,8 @@ jobs:
12031265
PULSAR_TEST_IMAGE_NAME: apachepulsar/pulsar-test-latest-version:latest
12041266
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
12051267
CI_JDK_MAJOR_VERSION: ${{ needs.preconditions.outputs.jdk_major_version }}
1268+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
1269+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
12061270
strategy:
12071271
fail-fast: false
12081272
matrix:
@@ -1287,23 +1351,40 @@ jobs:
12871351
report_paths: 'test-reports/TEST-*.xml'
12881352
annotate_only: 'true'
12891353

1354+
- name: Report detected Netty leaks
1355+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
1356+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
1357+
12901358
- name: Upload container logs
12911359
uses: actions/upload-artifact@v4
12921360
if: ${{ !success() }}
12931361
continue-on-error: true
12941362
with:
1295-
name: System-${{ matrix.group }}-container-logs
1363+
name: Flaky-System-${{ matrix.group }}-container-logs
12961364
path: tests/integration/target/container-logs
12971365
retention-days: 7
12981366

12991367
- name: Upload Surefire reports
13001368
uses: actions/upload-artifact@v4
13011369
if: ${{ !success() }}
13021370
with:
1303-
name: System-${{ matrix.name }}-surefire-reports
1371+
name: Flaky-System-${{ matrix.name }}-surefire-reports
13041372
path: surefire-reports
13051373
retention-days: 7
13061374

1375+
- name: Upload possible heap dump, core dump or crash files
1376+
uses: actions/upload-artifact@v4
1377+
if: ${{ always() }}
1378+
with:
1379+
name: Flaky-System-${{ matrix.group }}-dumps
1380+
path: |
1381+
/tmp/*.hprof
1382+
**/hs_err_*.log
1383+
**/core.*
1384+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
1385+
retention-days: 7
1386+
if-no-files-found: ignore
1387+
13071388
- name: Wait for ssh connection when build fails
13081389
# ssh access is enabled for builds in own forks
13091390
uses: ./.github/actions/ssh-access

build/pulsar_ci_tool.sh

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,68 @@ ci_create_inttest_coverage_report() {
579579
echo "::endgroup::"
580580
}
581581

582+
# Collects Netty leak detector output and reports it in the CI job log.
#
# Leak records are gathered from two places:
#   1. netty_leak_*.txt files directly inside $NETTY_LEAK_DUMP_DIR
#   2. netty_leak_*.txt members of the *.tar.gz archives found under
#      tests/integration/target/container-logs (relative to the current directory);
#      these are extracted to $NETTY_LEAK_DUMP_DIR/container-logs/<container name>/
#
# Globals:
#   NETTY_LEAK_DUMP_DIR   (read) dump directory; also receives leak_report.txt.
#                         When unset or empty, a notice is printed and the function returns 0.
#   NETTY_LEAK_DETECTION  (read) when equal to "fail_on_leak", detected leaks terminate the
#                         script with exit code 1; otherwise leaks are only reported as a warning.
# Files:
#   target/netty_leaks_found      marker created when leaks were found
#   target/netty_leaks_not_found  marker created when no leaks were found
# Outputs:
#   The report (a ::warning:: or ::error:: line followed by the details) is written to stdout
#   and to $NETTY_LEAK_DUMP_DIR/leak_report.txt.
# Returns:
#   0 when no leaks were found or leaks are only reported, 1 when the temp file can't be created.
#   Exits the script with 1 when leaks were found in "fail_on_leak" mode.
ci_report_netty_leaks() {
  if [[ -z "${NETTY_LEAK_DUMP_DIR:-}" ]]; then
    echo "NETTY_LEAK_DUMP_DIR isn't set"
    return 0
  fi

  # declared separately from the assignment so that a mktemp failure isn't masked by 'local'
  local temp_file
  temp_file=$(mktemp -t netty_leak.XXXX) || return 1

  # concat all netty_leak_*.txt files in the dump directory to a temp file
  if [[ -d "$NETTY_LEAK_DUMP_DIR" ]]; then
    find "$NETTY_LEAK_DUMP_DIR" -maxdepth 1 -type f -name "netty_leak_*.txt" -exec cat {} + >> "$temp_file"
  fi

  # check if there are any netty_leak_*.txt files in the container logs
  local container_logs_dir="tests/integration/target/container-logs"
  if [[ -d "$container_logs_dir" ]]; then
    local container_netty_leak_dump_dir="$NETTY_LEAK_DUMP_DIR/container-logs"
    local archive container_name target_dir
    mkdir -p "$container_netty_leak_dump_dir"
    # NUL-delimited so that archive paths with whitespace are handled correctly
    while IFS= read -r -d '' archive; do
      # example file name "tests/integration/target/container-logs/ltnizrzm-standalone/var-log-pulsar.tar.gz"
      # take ltnizrzm-standalone part
      container_name=$(basename "$(dirname "$archive")")
      target_dir="$container_netty_leak_dump_dir/$container_name"
      mkdir -p "$target_dir"
      # best effort: tar exits with non-zero status when the archive contains no matching member,
      # which is the common case, so the failure is intentionally ignored
      tar -C "$target_dir" -zxf "$archive" --strip-components=1 --wildcards --wildcards-match-slash '*/netty_leak_*.txt' >/dev/null 2>&1 || true
    done < <(find "$container_logs_dir" -type f -name "*.tar.gz" -print0)
    # remove all empty directories (this may remove the container logs dump directory itself)
    find "$container_netty_leak_dump_dir" -type d -empty -delete
    # print all netty_leak_*.txt files in the container logs dump directory to the temp file
    if [[ -d "$container_netty_leak_dump_dir" ]]; then
      find "$container_netty_leak_dump_dir" -type f -name "netty_leak_*.txt" -exec cat {} + >> "$temp_file"
    fi
  fi

  # the marker files are written to target/, make sure that it exists
  mkdir -p target

  local leaks_found=false
  if [[ -s "$temp_file" ]]; then
    leaks_found=true
    local leak_found_log_message
    if [[ "${NETTY_LEAK_DETECTION:-}" == "fail_on_leak" ]]; then
      leak_found_log_message="::error::Netty leaks found. Failing the build since Netty leak detection is set to 'fail_on_leak'."
    else
      leak_found_log_message="::warning::Netty leaks found."
    fi
    # summary of the stack trace lines that mention a test class in an org.apache package
    local test_file_locations
    test_file_locations=$(grep -h -i test "$temp_file" | grep org.apache | sed 's/^[[:space:]]*//;s/[[:space:]]*$//;s/^Hint: //' | sort -u || true)
    {
      echo "${leak_found_log_message}"
      if [[ -n "$test_file_locations" ]]; then
        echo "Test file locations in stack traces:"
        echo
        echo "$test_file_locations"
      fi
      echo "Details:"
      cat "$temp_file"
    } | tee "$NETTY_LEAK_DUMP_DIR/leak_report.txt"
    touch target/netty_leaks_found
  else
    echo "No netty leaks found."
    touch target/netty_leaks_not_found
  fi

  # remove the temp file before a possible exit so that it isn't left behind in fail_on_leak mode
  rm -f -- "$temp_file"

  if [[ "$leaks_found" == true && "${NETTY_LEAK_DETECTION:-}" == "fail_on_leak" ]]; then
    exit 1
  fi
}
643+
582644
if [ -z "$1" ]; then
583645
echo "usage: $0 [ci_tool_function_name]"
584646
echo "Available ci tool functions:"

buildtools/pom.xml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,12 +162,17 @@
162162
</exclusion>
163163
</exclusions>
164164
</dependency>
165-
<!-- for testing FastThreadLocalStateCleaner -->
166165
<dependency>
167166
<groupId>io.netty</groupId>
168167
<artifactId>netty-common</artifactId>
169168
<version>${netty.version}</version>
170-
<scope>test</scope>
169+
<scope>provided</scope>
170+
</dependency>
171+
<dependency>
172+
<groupId>io.netty</groupId>
173+
<artifactId>netty-buffer</artifactId>
174+
<version>${netty.version}</version>
175+
<scope>provided</scope>
171176
</dependency>
172177
<dependency>
173178
<groupId>org.mockito</groupId>

0 commit comments

Comments
 (0)