
Commit f0ca223

Fix issue #5076: Integration test github action (#5077)
Co-authored-by: Engel Nyst <[email protected]>
1 parent: 082a551

5 files changed (+225, −53 lines)

.github/workflows/eval-runner.yml (+1, −22)

@@ -1,4 +1,4 @@
-name: Run Evaluation
+name: Run SWE-Bench Evaluation

 on:
   pull_request:
@@ -58,24 +58,6 @@ jobs:
           echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
           echo "temperature = 0.0" >> config.toml

-      - name: Run integration test evaluation
-        env:
-          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
-          RUNTIME: remote
-          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
-          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
-
-        run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
-
-          # get evaluation report
-          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
-          echo "REPORT_FILE: $REPORT_FILE"
-          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
-          cat $REPORT_FILE >> $GITHUB_ENV
-          echo >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
       - name: Run SWE-Bench evaluation
         env:
           ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
@@ -143,9 +125,6 @@ jobs:
            **SWE-Bench Evaluation Report**
            ${{ env.SWEBENCH_REPORT }}
            ---
-           **Integration Tests Evaluation Report**
-           ${{ env.INTEGRATION_TEST_REPORT }}
-           ---
            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

       - name: Post to a Slack channel
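
Both this workflow and the new integration-test workflow below hand multiline Markdown reports to later steps by appending a heredoc-style block (NAME<<EOF ... EOF) to the file named by $GITHUB_ENV, which the Actions runner reads back as environment variables. A minimal Python sketch of that convention; the helper name and the report path in the usage comment are illustrative, not part of the commit:

    import os

    def export_multiline_env(name: str, value: str, delimiter: str = 'EOF') -> None:
        # GITHUB_ENV points at a file the Actions runner re-reads after each step;
        # a NAME<<DELIMITER block lets the exported value span multiple lines.
        # The delimiter must not appear on its own line inside the value.
        with open(os.environ['GITHUB_ENV'], 'a', encoding='utf-8') as f:
            f.write(f'{name}<<{delimiter}\n{value}\n{delimiter}\n')

    # Illustrative usage mirroring the step above:
    # export_multiline_env('SWEBENCH_REPORT', open('report.md').read())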
New workflow file: Run Integration Tests (+158)

@@ -0,0 +1,158 @@
+name: Run Integration Tests
+
+on:
+  pull_request:
+    types: [labeled]
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: 'Reason for manual trigger'
+        required: true
+        default: ''
+  schedule:
+    - cron: '30 22 * * *' # Runs at 10:30pm UTC every day
+
+env:
+  N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation
+
+jobs:
+  run-integration-tests:
+    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: "read"
+      id-token: "write"
+      pull-requests: "write"
+      issues: "write"
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+
+      - name: Comment on PR if 'integration-test' label is present
+        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+
+      - name: Configure config.toml for testing with Haiku
+        env:
+          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Build environment
+        run: make build
+
+      - name: Run integration test evaluation for Haiku
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
+
+          # get integration tests report
+          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Wait a little bit
+        run: sleep 10
+
+      - name: Configure config.toml for testing with DeepSeek
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DeepSeek
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
+
+          # get integration tests report
+          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Create archive of evaluation outputs
+        run: |
+          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
+          cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories
+
+      - name: Upload evaluation results as artifact
+        uses: actions/upload-artifact@v4
+        id: upload_results_artifact
+        with:
+          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
+          path: integration_tests_*.tar.gz
+
+      - name: Get artifact URLs
+        run: |
+          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
+
+      - name: Set timestamp and trigger reason
+        run: |
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
+          else
+            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
+          fi
+
+      - name: Comment with results and artifact link
+        id: create_comment
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
+          unique: false
+          comment: |
+            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
+            Commit: ${{ github.sha }}
+            **Integration Tests Report (Haiku)**
+            Haiku LLM Test Results:
+            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
+            ---
+            **Integration Tests Report (DeepSeek)**
+            DeepSeek LLM Test Results:
+            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+            ---
+            Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
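
The two run_infer.sh steps locate each run's report.md with `find ... | head -n 1`, keyed on the 'haiku_run'/'deepseek_run' suffixes passed to the script. A rough Python equivalent of that lookup, as a sketch only; the directory depth of report.md and the sort order are assumptions:

    from pathlib import Path

    def find_report(run_pattern: str) -> Path | None:
        # Mirrors: find .../CodeActAgent/<pattern> -name "report.md" -type f | head -n 1
        root = Path('evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent')
        for run_dir in sorted(root.glob(run_pattern)):
            reports = sorted(run_dir.rglob('report.md'))
            if reports:
                return reports[0]  # first match, like `head -n 1`
        return None

    # e.g. find_report('*haiku*_maxiter_10_N*') or find_report('deepseek*_maxiter_10_N*')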

evaluation/integration_tests/run_infer.py (+14, −3)

@@ -48,13 +48,19 @@ def get_config(
             # use default base_container_image
             enable_auto_lint=True,
             use_host_network=False,
-            timeout=100,
+            timeout=300,
+            # Add platform to the sandbox config to solve issue 4401
+            platform='linux/amd64',
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=3600,
         ),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
+        # debug
+        debug=True,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(
@@ -129,7 +135,12 @@ def process_instance(
     # # result evaluation
     # # =============================================

-    histories = [event_to_dict(event) for event in state.history]
+    histories = state.history
+
+    # some basic check
+    logger.info(f'Total events in history: {len(histories)}')
+    assert len(histories) > 0, 'History should not be empty'
+
     test_result: TestResult = test_class.verify_result(runtime, histories)
     metrics = state.metrics.get() if state.metrics else None

@@ -139,7 +150,7 @@
         instance=instance.to_dict(),
         instruction=instruction,
         metadata=metadata,
-        history=histories,
+        history=[event_to_dict(event) for event in histories],
         metrics=metrics,
         error=state.last_error if state and state.last_error else None,
         test_result=test_result.model_dump(),
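
This reordering matters because the tests' verify_result filters events with isinstance checks: once events are serialized with event_to_dict, nothing matches, so verify_result now receives the raw Event objects and serialization happens only when the output record is written. A minimal illustration of the failure mode, using stand-in classes rather than the real OpenHands types:

    class MessageAction:  # stand-in for OpenHands' MessageAction
        def __init__(self, content: str) -> None:
            self.content = content

    def event_to_dict(event) -> dict:  # stand-in serializer
        return {'type': type(event).__name__, 'content': event.content}

    events = [MessageAction('The answer is OpenHands is all you need!')]

    # Old flow: serialize first, then filter -- the isinstance filter matches nothing.
    print(any(isinstance(e, MessageAction) for e in map(event_to_dict, events)))  # False
    # New flow: filter raw events, serialize only when building the output record.
    print(any(isinstance(e, MessageAction) for e in events))  # True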

evaluation/integration_tests/tests/t05_simple_browsing.py (+23, −11)

@@ -108,6 +108,8 @@ def initialize_runtime(cls, runtime: Runtime) -> None:

     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        from openhands.core.logger import openhands_logger as logger
+
         # check if the "The answer is OpenHands is all you need!" is in any message
         message_actions = [
             event
@@ -116,19 +118,29 @@ def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
                 event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
             )
         ]
+        logger.debug(f'Total message-like events: {len(message_actions)}')
+
         for event in message_actions:
-            if isinstance(event, AgentDelegateObservation):
-                content = event.content
-            elif isinstance(event, AgentFinishAction):
-                content = event.outputs.get('content', '')
-            elif isinstance(event, MessageAction):
-                content = event.content
-            else:
-                raise ValueError(f'Unknown event type: {type(event)}')
+            try:
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue

-            if 'OpenHands is all you need!' in content:
-                return TestResult(success=True)
+                if 'OpenHands is all you need!' in content:
+                    return TestResult(success=True)
+            except Exception as e:
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
         return TestResult(
             success=False,
-            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
         )

evaluation/integration_tests/tests/t06_github_pr_browsing.py (+29, −17)

@@ -14,31 +14,43 @@ def initialize_runtime(cls, runtime: Runtime) -> None:

     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
-        # check if the "The answer is OpenHands is all you need!" is in any message
+        from openhands.core.logger import openhands_logger as logger
+
+        # check if the license information is in any message
         message_actions = [
             event
             for event in histories
             if isinstance(
                 event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
             )
         ]
+        logger.info(f'Total message-like events: {len(message_actions)}')
+
         for event in message_actions:
-            if isinstance(event, AgentDelegateObservation):
-                content = event.content
-            elif isinstance(event, AgentFinishAction):
-                content = event.outputs.get('content', '')
-            elif isinstance(event, MessageAction):
-                content = event.content
-            else:
-                raise ValueError(f'Unknown event type: {type(event)}')
-
-            if (
-                'non-commercial' in content
-                or 'MIT' in content
-                or 'Apache 2.0' in content
-            ):
-                return TestResult(success=True)
+            try:
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
+
+                if (
+                    'non-commercial' in content
+                    or 'MIT' in content
+                    or 'Apache 2.0' in content
+                ):
+                    return TestResult(success=True)
+            except Exception as e:
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
         return TestResult(
             success=False,
-            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
         )
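
t05 and t06 now duplicate the same tolerant content-extraction loop. A hypothetical helper that factors it out; this is not part of the commit, and the import paths are assumed from the OpenHands codebase:

    # Hypothetical refactor (illustration only, not in this commit).
    from openhands.core.logger import openhands_logger as logger
    from openhands.events.action import AgentFinishAction, MessageAction
    from openhands.events.observation import AgentDelegateObservation

    def extract_content(event) -> str | None:
        """Return the text carried by a message-like event, else None."""
        if isinstance(event, (AgentDelegateObservation, MessageAction)):
            return event.content
        if isinstance(event, AgentFinishAction):
            return event.outputs.get('content', '')
        logger.warning(f'Unexpected event type: {type(event)}')
        return None

    # Each test's loop could then reduce to:
    #   if any(needle in (extract_content(e) or '') for e in message_actions):
    #       return TestResult(success=True)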
