# Workflow that runs the integration-test evaluation.
# It can start in three ways: a label being added to a pull request,
# a manual dispatch (which requires a reason), or the nightly cron schedule.
name: Run Integration Tests

on:
  pull_request:
    types:
      - labeled
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""
  schedule:
    # Every day at 22:30 UTC.
    - cron: "30 22 * * *"

env:
  # Number of parallel processes used for evaluation; shared by every step.
  N_PROCESSES: 10
jobs:
  # Runs the integration-test evaluation twice (Haiku, then DeepSeek), archives
  # the outputs as an artifact, and posts both reports as a comment.
  run-integration-tests:
    # Run when the 'integration-test' label is added to a PR, on manual
    # dispatch, or on the nightly schedule.
    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
      pull-requests: write
      issues: write
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Poetry is installed before setup-python so that `cache: poetry` can use it.
      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: poetry

      # Early acknowledgement on the PR; only for the label-triggered path.
      - name: Comment on PR if 'integration-test' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install --without evaluation,llama-index

      # Writes the [llm.eval] section that the Haiku run below selects by name.
      - name: Configure config.toml for testing with Haiku
        env:
          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
        run: |
          {
            echo "[llm.eval]"
            echo "model = \"$LLM_MODEL\""
            echo "api_key = \"$LLM_API_KEY\""
            echo "base_url = \"$LLM_BASE_URL\""
            echo "temperature = 0.0"
          } > config.toml

      - name: Build environment
        run: make build

      - name: Run integration test evaluation for Haiku
        env:
          # NOTE(review): unquoted `True` is read by YAML as a boolean, not a
          # string; quote it if the consumer depends on the exact spelling.
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'

          # Locate the first report.md produced by the Haiku run.
          OUTPUT_DIR=evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent
          REPORT_FILE_HAIKU=$(find "$OUTPUT_DIR"/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_HAIKU"

          # Export the report as a multi-line env value. A random delimiter is
          # used so that no line inside the report can terminate the value early.
          DELIMITER="EOF_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"
          {
            echo "INTEGRATION_TEST_REPORT_HAIKU<<$DELIMITER"
            if [ -n "$REPORT_FILE_HAIKU" ]; then
              cat "$REPORT_FILE_HAIKU"
            else
              # No report was found: say so instead of running a bare `cat`.
              echo "Integration test report not found."
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Wait a little bit
        run: sleep 10

      # Overwrites config.toml so the same [llm.eval] section now points at DeepSeek.
      - name: Configure config.toml for testing with DeepSeek
        env:
          LLM_MODEL: "litellm_proxy/deepseek-chat"
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
        run: |
          {
            echo "[llm.eval]"
            echo "model = \"$LLM_MODEL\""
            echo "api_key = \"$LLM_API_KEY\""
            echo "base_url = \"$LLM_BASE_URL\""
            echo "temperature = 0.0"
          } > config.toml

      - name: Run integration test evaluation for DeepSeek
        env:
          # NOTE(review): unquoted `True` is read by YAML as a boolean, not a
          # string; quote it if the consumer depends on the exact spelling.
          SANDBOX_FORCE_REBUILD_RUNTIME: True
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'

          # Locate the first report.md produced by the DeepSeek run.
          OUTPUT_DIR=evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent
          REPORT_FILE_DEEPSEEK=$(find "$OUTPUT_DIR"/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"

          # Export the report as a multi-line env value. A random delimiter is
          # used so that no line inside the report can terminate the value early.
          DELIMITER="EOF_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"
          {
            echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<$DELIMITER"
            if [ -n "$REPORT_FILE_DEEPSEEK" ]; then
              cat "$REPORT_FILE_DEEPSEEK"
            else
              # No report was found: say so instead of running a bare `cat`.
              echo "Integration test report not found."
            fi
            echo
            echo "$DELIMITER"
          } >> "$GITHUB_ENV"

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/*  # Only include the actual result directories

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
          path: integration_tests_*.tar.gz

      - name: Get artifact URLs
        env:
          # Passed through the environment rather than inlined into the script.
          UPLOADED_ARTIFACT_URL: ${{ steps.upload_results_artifact.outputs.artifact-url }}
        run: |
          echo "ARTIFACT_URL=$UPLOADED_ARTIFACT_URL" >> "$GITHUB_ENV"

      - name: Set timestamp and trigger reason
        env:
          # Event data is handed to the script as environment variables so that
          # user-supplied text (the dispatch reason) is never parsed as shell code.
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          DISPATCH_REASON: ${{ github.event.inputs.reason }}
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> "$GITHUB_ENV"
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-$PR_NUMBER" >> "$GITHUB_ENV"
          elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            # Collapse line breaks so the reason cannot add extra lines to $GITHUB_ENV.
            REASON_ONE_LINE=$(printf '%s' "$DISPATCH_REASON" | tr '\r\n' '  ')
            printf 'TRIGGER_REASON=manual-%s\n' "$REASON_ONE_LINE" >> "$GITHUB_ENV"
          else
            echo "TRIGGER_REASON=nightly-scheduled" >> "$GITHUB_ENV"
          fi

      - name: Comment with results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # For pull_request events comment on that PR; for every other trigger
          # fall back to issue number 5077.
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
          unique: false
          comment: |
            Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
            Commit: ${{ github.sha }}
            **Integration Tests Report (Haiku)**
            Haiku LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
            ---
            **Integration Tests Report (DeepSeek)**
            DeepSeek LLM Test Results:
            ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
            ---
            Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})