Skip to content

Commit fcdf7b2

Browse files
committed
[improve][ci] Add Netty leak detection reporting to Pulsar CI (#24272)
(cherry picked from commit f51123c)
1 parent 965940d commit fcdf7b2

File tree

28 files changed

+616
-34
lines changed

28 files changed

+616
-34
lines changed

.github/workflows/pulsar-ci-flaky.yaml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ on:
2525
- branch-*
2626
- pulsar-*
2727
schedule:
28-
# scheduled job with JDK 17
29-
- cron: '0 12 * * *'
3028
# scheduled job with JDK 21
29+
- cron: '0 12 * * *'
30+
# scheduled job with JDK 17
3131
# if cron expression is changed, make sure to update the expression in jdk_major_version step in preconditions job
3232
- cron: '0 6 * * *'
3333
workflow_dispatch:
@@ -61,6 +61,15 @@ on:
6161
required: true
6262
type: number
6363
default: 10000
64+
netty_leak_detection:
65+
description: 'Controls Netty leak detection. When set to "report", Netty leak detection is enabled. When set to "fail_on_leak", Netty leak detection is enabled and a build job will fail if leaks are detected. When set to "off", Netty leak detection is disabled.'
66+
required: true
67+
type: choice
68+
options:
69+
- 'report'
70+
- 'fail_on_leak'
71+
- 'off'
72+
default: 'report'
6473

6574
concurrency:
6675
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}${{ github.event_name == 'workflow_dispatch' && github.event.inputs.jdk_major_version || '' }}
@@ -84,7 +93,8 @@ jobs:
8493
need_owasp: ${{ steps.changes.outputs.need_owasp }}
8594
collect_coverage: ${{ steps.check_coverage.outputs.collect_coverage }}
8695
jdk_major_version: ${{ steps.jdk_major_version.outputs.jdk_major_version }}
87-
96+
java_non_tests: ${{ steps.changes.outputs.java_non_tests }}
97+
netty_leak_detection: ${{ steps.netty_leak_detection.outputs.netty_leak_detection }}
8898
steps:
8999
- name: Cancel scheduled jobs in forks by default
90100
if: ${{ github.repository != 'apache/pulsar' && github.event_name == 'schedule' }}
@@ -136,6 +146,13 @@ jobs:
136146
|| (github.event_name == 'workflow_dispatch' && github.event.inputs.collect_coverage == 'true')
137147
}}" >> $GITHUB_OUTPUT
138148
149+
- name: Set Netty leak detection mode
150+
id: netty_leak_detection
151+
run: |
152+
echo "netty_leak_detection=${{
153+
github.event_name == 'workflow_dispatch' && github.event.inputs.netty_leak_detection || 'report'
154+
}}" >> $GITHUB_OUTPUT
155+
139156
- name: Check if the PR has been approved for testing
140157
if: ${{ steps.check_changes.outputs.docs_only != 'true' && github.repository == 'apache/pulsar' && github.event_name == 'pull_request' }}
141158
env:
@@ -156,6 +173,8 @@ jobs:
156173
TRACE_TEST_RESOURCE_CLEANUP_DIR: ${{ github.workspace }}/target/trace-test-resource-cleanup
157174
THREAD_LEAK_DETECTOR_WAIT_MILLIS: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.thread_leak_detector_wait_millis || 10000 }}
158175
THREAD_LEAK_DETECTOR_DIR: ${{ github.workspace }}/target/thread-leak-dumps
176+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
177+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
159178
runs-on: ubuntu-22.04
160179
timeout-minutes: 100
161180
if: ${{ needs.preconditions.outputs.docs_only != 'true' }}
@@ -226,6 +245,10 @@ jobs:
226245
cat threadleak*.txt | awk '/^Summary:/ {print "::warning::" $0 "\n"; next} {print}'
227246
fi
228247
248+
- name: Report detected Netty leaks
249+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
250+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
251+
229252
- name: Create Jacoco reports
230253
if: ${{ needs.preconditions.outputs.collect_coverage == 'true' }}
231254
continue-on-error: true
@@ -268,6 +291,7 @@ jobs:
268291
/tmp/*.hprof
269292
**/hs_err_*.log
270293
**/core.*
294+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
271295
${{ env.TRACE_TEST_RESOURCE_CLEANUP_DIR }}/*
272296
${{ env.THREAD_LEAK_DETECTOR_DIR }}/*
273297
retention-days: 7

.github/workflows/pulsar-ci.yaml

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,15 @@ on:
6161
required: true
6262
type: number
6363
default: 10000
64+
netty_leak_detection:
65+
description: 'Controls Netty leak detection. When set to "report", Netty leak detection is enabled. When set to "fail_on_leak", Netty leak detection is enabled and a build job will fail if leaks are detected. When set to "off", Netty leak detection is disabled.'
66+
required: true
67+
type: choice
68+
options:
69+
- 'report'
70+
- 'fail_on_leak'
71+
- 'off'
72+
default: 'report'
6473

6574
concurrency:
6675
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}${{ github.event_name == 'workflow_dispatch' && github.event.inputs.jdk_major_version || '' }}
@@ -85,6 +94,7 @@ jobs:
8594
collect_coverage: ${{ steps.check_coverage.outputs.collect_coverage }}
8695
jdk_major_version: ${{ steps.jdk_major_version.outputs.jdk_major_version }}
8796
java_non_tests: ${{ steps.changes.outputs.java_non_tests }}
97+
netty_leak_detection: ${{ steps.netty_leak_detection.outputs.netty_leak_detection }}
8898
steps:
8999
- name: Cancel scheduled jobs in forks by default
90100
if: ${{ github.repository != 'apache/pulsar' && github.event_name == 'schedule' }}
@@ -136,6 +146,13 @@ jobs:
136146
|| (github.event_name == 'workflow_dispatch' && github.event.inputs.collect_coverage == 'true')
137147
}}" >> $GITHUB_OUTPUT
138148
149+
- name: Set Netty leak detection mode
150+
id: netty_leak_detection
151+
run: |
152+
echo "netty_leak_detection=${{
153+
github.event_name == 'workflow_dispatch' && github.event.inputs.netty_leak_detection || 'report'
154+
}}" >> $GITHUB_OUTPUT
155+
139156
- name: Check if the PR has been approved for testing
140157
if: ${{ steps.check_changes.outputs.docs_only != 'true' && github.repository == 'apache/pulsar' && github.event_name == 'pull_request' }}
141158
env:
@@ -232,6 +249,8 @@ jobs:
232249
TRACE_TEST_RESOURCE_CLEANUP_DIR: ${{ github.workspace }}/target/trace-test-resource-cleanup
233250
THREAD_LEAK_DETECTOR_WAIT_MILLIS: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.thread_leak_detector_wait_millis || 10000 }}
234251
THREAD_LEAK_DETECTOR_DIR: ${{ github.workspace }}/target/thread-leak-dumps
252+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
253+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
235254
runs-on: ubuntu-22.04
236255
timeout-minutes: ${{ matrix.timeout || 60 }}
237256
needs: ['preconditions', 'build-and-license-check']
@@ -349,6 +368,10 @@ jobs:
349368
cat threadleak*.txt | awk '/^Summary:/ {print "::warning::" $0 "\n"; next} {print}'
350369
fi
351370
371+
- name: Report detected Netty leaks
372+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
373+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
374+
352375
- name: Upload Surefire reports
353376
uses: actions/upload-artifact@v4
354377
if: ${{ !success() || env.TRACE_TEST_RESOURCE_CLEANUP != 'off' }}
@@ -366,6 +389,7 @@ jobs:
366389
/tmp/*.hprof
367390
**/hs_err_*.log
368391
**/core.*
392+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
369393
${{ env.TRACE_TEST_RESOURCE_CLEANUP_DIR }}/*
370394
${{ env.THREAD_LEAK_DETECTOR_DIR }}/*
371395
retention-days: 7
@@ -554,6 +578,8 @@ jobs:
554578
PULSAR_TEST_IMAGE_NAME: apachepulsar/java-test-image:latest
555579
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
556580
CI_JDK_MAJOR_VERSION: ${{ needs.preconditions.outputs.jdk_major_version }}
581+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
582+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
557583
strategy:
558584
fail-fast: false
559585
matrix:
@@ -702,6 +728,10 @@ jobs:
702728
report_paths: 'test-reports/TEST-*.xml'
703729
annotate_only: 'true'
704730

731+
- name: Report detected Netty leaks
732+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
733+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
734+
705735
- name: Upload Surefire reports
706736
uses: actions/upload-artifact@v4
707737
if: ${{ !success() }}
@@ -710,6 +740,19 @@ jobs:
710740
path: surefire-reports
711741
retention-days: 7
712742

743+
- name: Upload possible heap dump, core dump or crash files
744+
uses: actions/upload-artifact@v4
745+
if: ${{ always() }}
746+
with:
747+
name: Integration-${{ matrix.upload_name || matrix.group }}-dumps
748+
path: |
749+
/tmp/*.hprof
750+
**/hs_err_*.log
751+
**/core.*
752+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
753+
retention-days: 7
754+
if-no-files-found: ignore
755+
713756
- name: Upload container logs
714757
uses: actions/upload-artifact@v4
715758
if: ${{ !success() }}
@@ -972,6 +1015,8 @@ jobs:
9721015
PULSAR_TEST_IMAGE_NAME: apachepulsar/pulsar-test-latest-version:latest
9731016
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
9741017
CI_JDK_MAJOR_VERSION: ${{ needs.preconditions.outputs.jdk_major_version }}
1018+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
1019+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
9751020
strategy:
9761021
fail-fast: false
9771022
matrix:
@@ -1079,6 +1124,10 @@ jobs:
10791124
report_paths: 'test-reports/TEST-*.xml'
10801125
annotate_only: 'true'
10811126

1127+
- name: Report detected Netty leaks
1128+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
1129+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
1130+
10821131
- name: Upload container logs
10831132
uses: actions/upload-artifact@v4
10841133
if: ${{ !success() }}
@@ -1096,6 +1145,19 @@ jobs:
10961145
path: surefire-reports
10971146
retention-days: 7
10981147

1148+
- name: Upload possible heap dump, core dump or crash files
1149+
uses: actions/upload-artifact@v4
1150+
if: ${{ always() }}
1151+
with:
1152+
name: System-${{ matrix.group }}-dumps
1153+
path: |
1154+
/tmp/*.hprof
1155+
**/hs_err_*.log
1156+
**/core.*
1157+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
1158+
retention-days: 7
1159+
if-no-files-found: ignore
1160+
10991161
- name: Wait for ssh connection when build fails
11001162
# ssh access is enabled for builds in own forks
11011163
uses: ./.github/actions/ssh-access
@@ -1202,6 +1264,8 @@ jobs:
12021264
PULSAR_TEST_IMAGE_NAME: apachepulsar/pulsar-test-latest-version:latest
12031265
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
12041266
CI_JDK_MAJOR_VERSION: ${{ needs.preconditions.outputs.jdk_major_version }}
1267+
NETTY_LEAK_DETECTION: "${{ needs.preconditions.outputs.netty_leak_detection }}"
1268+
NETTY_LEAK_DUMP_DIR: ${{ github.workspace }}/target/netty-leak-dumps
12051269
strategy:
12061270
fail-fast: false
12071271
matrix:
@@ -1286,23 +1350,40 @@ jobs:
12861350
report_paths: 'test-reports/TEST-*.xml'
12871351
annotate_only: 'true'
12881352

1353+
- name: Report detected Netty leaks
1354+
if: ${{ always() && env.NETTY_LEAK_DETECTION != 'off' }}
1355+
run: $GITHUB_WORKSPACE/build/pulsar_ci_tool.sh report_netty_leaks
1356+
12891357
- name: Upload container logs
12901358
uses: actions/upload-artifact@v4
12911359
if: ${{ !success() }}
12921360
continue-on-error: true
12931361
with:
1294-
name: System-${{ matrix.group }}-container-logs
1362+
name: Flaky-System-${{ matrix.group }}-container-logs
12951363
path: tests/integration/target/container-logs
12961364
retention-days: 7
12971365

12981366
- name: Upload Surefire reports
12991367
uses: actions/upload-artifact@v4
13001368
if: ${{ !success() }}
13011369
with:
1302-
name: System-${{ matrix.name }}-surefire-reports
1370+
name: Flaky-System-${{ matrix.name }}-surefire-reports
13031371
path: surefire-reports
13041372
retention-days: 7
13051373

1374+
- name: Upload possible heap dump, core dump or crash files
1375+
uses: actions/upload-artifact@v4
1376+
if: ${{ always() }}
1377+
with:
1378+
name: Flaky-System-${{ matrix.group }}-dumps
1379+
path: |
1380+
/tmp/*.hprof
1381+
**/hs_err_*.log
1382+
**/core.*
1383+
${{ env.NETTY_LEAK_DUMP_DIR }}/*
1384+
retention-days: 7
1385+
if-no-files-found: ignore
1386+
13061387
- name: Wait for ssh connection when build fails
13071388
# ssh access is enabled for builds in own forks
13081389
uses: ./.github/actions/ssh-access

build/pulsar_ci_tool.sh

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,68 @@ ci_create_inttest_coverage_report() {
579579
echo "::endgroup::"
580580
}
581581

582+
#######################################
# Collect Netty leak dumps and publish a combined leak report.
#
# Concatenates every netty_leak_*.txt found directly in NETTY_LEAK_DUMP_DIR
# and every netty_leak_*.txt extracted from the *.tar.gz archives under
# tests/integration/target/container-logs (relative to the current
# directory). When any leak text is found, a report is printed to stdout and
# saved as $NETTY_LEAK_DUMP_DIR/leak_report.txt, and the marker file
# target/netty_leaks_found is created; otherwise target/netty_leaks_not_found
# is created.
#
# Globals:
#   NETTY_LEAK_DUMP_DIR  (read) directory holding the leak dump files; when
#                        unset or empty the function prints a note and
#                        returns 0 without doing anything else
#   NETTY_LEAK_DETECTION (read) when equal to "fail_on_leak", found leaks are
#                        reported with an "::error::" prefix and the shell
#                        exits with status 1; otherwise "::warning::" is used
# Arguments:
#   None
# Outputs:
#   Writes the report (or "No netty leaks found.") to stdout.
# Returns:
#   0 normally; 1 if the temp file cannot be created. Calls `exit 1` when
#   leaks are found and NETTY_LEAK_DETECTION is "fail_on_leak".
#######################################
ci_report_netty_leaks() {
  if [[ -z "${NETTY_LEAK_DUMP_DIR:-}" ]]; then
    echo "NETTY_LEAK_DUMP_DIR isn't set"
    return 0
  fi

  # Declaration is split from assignment so a mktemp failure is not masked.
  local temp_file
  temp_file=$(mktemp -t netty_leak.XXXX) || return 1

  # concat all netty_leak_*.txt files in the dump directory to a temp file
  if [[ -d "$NETTY_LEAK_DUMP_DIR" ]]; then
    find "$NETTY_LEAK_DUMP_DIR" -maxdepth 1 -type f -name "netty_leak_*.txt" \
      -exec cat {} + >> "$temp_file"
  fi

  # check if there are any netty_leak_*.txt files in the container logs
  local container_logs_dir="tests/integration/target/container-logs"
  if [[ -d "$container_logs_dir" ]]; then
    local container_netty_leak_dump_dir="$NETTY_LEAK_DUMP_DIR/container-logs"
    local file container_name target_dir
    mkdir -p "$container_netty_leak_dump_dir"
    # NUL-delimited so archive paths with spaces or newlines stay intact
    while IFS= read -r -d '' file; do
      # example file name "tests/integration/target/container-logs/ltnizrzm-standalone/var-log-pulsar.tar.gz"
      # take ltnizrzm-standalone part
      container_name=$(basename "$(dirname "$file")")
      target_dir="$container_netty_leak_dump_dir/$container_name"
      mkdir -p "$target_dir"
      # Best effort: tar exits non-zero when the archive has no matching
      # member, which is the common case, so errors are deliberately ignored.
      tar -C "$target_dir" -zxf "$file" --strip-components=1 --wildcards \
        --wildcards-match-slash '*/netty_leak_*.txt' >/dev/null 2>&1 || true
    done < <(find "$container_logs_dir" -type f -name "*.tar.gz" -print0)
    # remove all empty directories
    find "$container_netty_leak_dump_dir" -type d -empty -delete
    # print all netty_leak_*.txt files in the container logs dump directory to the temp file
    # (the directory itself is gone if nothing was extracted)
    if [[ -d "$container_netty_leak_dump_dir" ]]; then
      find "$container_netty_leak_dump_dir" -type f -name "netty_leak_*.txt" \
        -exec cat {} + >> "$temp_file"
    fi
  fi

  # The marker files live in ./target; make sure it exists before touching.
  mkdir -p target

  local fail_build=0
  if [[ -s "$temp_file" ]]; then
    local leak_found_log_message
    if [[ "${NETTY_LEAK_DETECTION:-}" == "fail_on_leak" ]]; then
      leak_found_log_message="::error::Netty leaks found. Failing the build since Netty leak detection is set to 'fail_on_leak'."
      fail_build=1
    else
      leak_found_log_message="::warning::Netty leaks found."
    fi

    # Lines mentioning both "test" (any case) and the literal "org.apache",
    # trimmed and de-duplicated. No match is not an error.
    local test_file_locations
    test_file_locations=$(grep -h -i test "$temp_file" \
      | grep -F 'org.apache' \
      | sed 's/^[[:space:]]*//;s/[[:space:]]*$//;s/^Hint: //' \
      | sort -u) || true

    mkdir -p "$NETTY_LEAK_DUMP_DIR"
    {
      printf '%s\n' "$leak_found_log_message"
      if [[ -n "$test_file_locations" ]]; then
        echo "Test file locations in stack traces:"
        echo
        printf '%s\n' "$test_file_locations"
      fi
      echo "Details:"
      cat "$temp_file"
    } | tee "$NETTY_LEAK_DUMP_DIR/leak_report.txt"
    touch target/netty_leaks_found
  else
    echo "No netty leaks found."
    touch target/netty_leaks_not_found
  fi

  # Clean up before a possible exit so the temp file never outlives the call.
  rm -f -- "$temp_file"
  if (( fail_build )); then
    exit 1
  fi
}
643+
582644
if [ -z "$1" ]; then
583645
echo "usage: $0 [ci_tool_function_name]"
584646
echo "Available ci tool functions:"

buildtools/pom.xml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,12 +162,17 @@
162162
</exclusion>
163163
</exclusions>
164164
</dependency>
165-
<!-- for testing FastThreadLocalStateCleaner -->
166165
<dependency>
167166
<groupId>io.netty</groupId>
168167
<artifactId>netty-common</artifactId>
169168
<version>${netty.version}</version>
170-
<scope>test</scope>
169+
<scope>provided</scope>
170+
</dependency>
171+
<dependency>
172+
<groupId>io.netty</groupId>
173+
<artifactId>netty-buffer</artifactId>
174+
<version>${netty.version}</version>
175+
<scope>provided</scope>
171176
</dependency>
172177
<dependency>
173178
<groupId>org.mockito</groupId>

0 commit comments

Comments
 (0)