adityasoni9998
diff --git a/‎.devcontainer/README.MD
Lines changed: 0 additions & 1 deletion b/‎.devcontainer/README.MD
Lines changed: 0 additions & 1 deletion
diff --git a/‎.devcontainer/devcontainer.json
Lines changed: 0 additions & 15 deletions b/‎.devcontainer/devcontainer.json
Lines changed: 0 additions & 15 deletions
diff --git a/‎.devcontainer/on_create.sh
Lines changed: 0 additions & 6 deletions b/‎.devcontainer/on_create.sh
Lines changed: 0 additions & 6 deletions
diff --git a/‎.github/dependabot.yml
Lines changed: 6 additions & 1 deletion b/‎.github/dependabot.yml
Lines changed: 6 additions & 1 deletion
diff --git a/‎.github/scripts/check_version_consistency.py
Lines changed: 66 additions & 0 deletions b/‎.github/scripts/check_version_consistency.py
Lines changed: 66 additions & 0 deletions
diff --git a/‎.github/workflows/dummy-agent-test.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/dummy-agent-test.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/eval-runner.yml
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/eval-runner.yml
Lines changed: 3 additions & 1 deletion
diff --git a/‎.github/workflows/fe-unit-tests.yml
Lines changed: 3 additions & 2 deletions b/‎.github/workflows/fe-unit-tests.yml
Lines changed: 3 additions & 2 deletions
diff --git a/‎.github/workflows/ghcr-build.yml
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/ghcr-build.yml
Lines changed: 4 additions & 4 deletions
diff --git a/‎.github/workflows/integration-runner.yml
Lines changed: 75 additions & 3 deletions b/‎.github/workflows/integration-runner.yml
Lines changed: 75 additions & 3 deletions
diff --git a/‎.github/workflows/lint.yml
Lines changed: 13 additions & 0 deletions b/‎.github/workflows/lint.yml
Lines changed: 13 additions & 0 deletions
@@ -18,7 +18,7 @@ updates:
           - "chromadb"
       browsergym:
         patterns:
-          - "browsergym"
+          - "browsergym*"
       security-all:
         applies-to: "security-updates"
         patterns:
@@ -70,3 +70,8 @@ updates:
         applies-to: "version-updates"
         patterns:
           - "*"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+import os
+import re
+import sys
+from typing import Set, Tuple
+
+
+def find_version_references(
+    directory: str,
+) -> Tuple[Set[Tuple[str, str]], Set[Tuple[str, str]]]:
+    """Collect the openhands and runtime version references found under a tree.
+
+    Walks ``directory`` and scans text-like files (Markdown, YAML, plain text,
+    HTML, Python, JavaScript and TypeScript) for ``openhands:<major>.<minor>``
+    and ``runtime:<major>.<minor>`` references, where ``<major>`` is a single
+    digit and ``<minor>`` is two digits.
+
+    Files that cannot be opened or decoded as UTF-8 are reported on stderr and
+    skipped; they never abort the scan.
+
+    Args:
+        directory: Root of the tree to scan.
+
+    Returns:
+        A pair ``(openhands_versions, runtime_versions)``. Each element is a
+        set of ``(major, minor)`` string tuples, one per distinct version found.
+    """
+    openhands_versions: Set[Tuple[str, str]] = set()
+    runtime_versions: Set[Tuple[str, str]] = set()
+
+    version_pattern_openhands = re.compile(r'openhands:(\d{1})\.(\d{2})')
+    version_pattern_runtime = re.compile(r'runtime:(\d{1})\.(\d{2})')
+    scanned_suffixes = ('.md', '.yml', '.yaml', '.txt', '.html', '.py', '.js', '.ts')
+
+    for root, dirs, files in os.walk(directory):
+        # Prune the .git directory in place so os.walk never descends into it.
+        # The name is compared exactly: a substring test on the path would also
+        # skip .github and any checkout whose own path happens to contain ".git".
+        dirs[:] = [d for d in dirs if d != '.git']
+
+        for file in files:
+            if not file.endswith(scanned_suffixes):
+                continue
+
+            file_path = os.path.join(root, file)
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+            except (OSError, UnicodeDecodeError) as e:
+                print(f'Error reading {file_path}: {e}', file=sys.stderr)
+                continue
+
+            # findall with two groups yields (major, minor) tuples.
+            openhands_versions.update(version_pattern_openhands.findall(content))
+            runtime_versions.update(version_pattern_runtime.findall(content))
+
+    return openhands_versions, runtime_versions
+
+
+def main():
+    """Scan the repository and exit non-zero when version references conflict.
+
+    Exits with status 1 if more than one distinct openhands version, or more
+    than one distinct runtime version, is found; otherwise exits with 0. A
+    category with no references at all only produces a warning on stderr.
+    """
+    # The tree to scan is two directory levels above this script's directory.
+    script_dir = os.path.dirname(__file__)
+    repo_root = os.path.abspath(os.path.join(script_dir, '..', '..'))
+    openhands_versions, runtime_versions = find_version_references(repo_root)
+
+    # (versions found, message on conflict, message when nothing was found)
+    checks = (
+        (
+            openhands_versions,
+            'Error: Multiple openhands versions found:',
+            'Warning: No openhands version references found',
+        ),
+        (
+            runtime_versions,
+            'Error: Multiple runtime versions found:',
+            'Warning: No runtime version references found',
+        ),
+    )
+
+    failed = False
+    for versions, conflict_message, missing_message in checks:
+        if len(versions) > 1:
+            print(conflict_message, file=sys.stderr)
+            print('Found versions:', sorted(versions), file=sys.stderr)
+            failed = True
+        elif not versions:
+            print(missing_message, file=sys.stderr)
+
+    sys.exit(1 if failed else 0)
+
+
+# Run the check only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
@@ -36,6 +36,8 @@ jobs:
       - name: Set up Docker Buildx
         id: buildx
         uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
       - name: Set up Python
 
@@ -29,6 +29,8 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
 
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
       - name: Install poetry via pipx
         run: pipx install poetry
 
@@ -129,7 +131,7 @@ jobs:
 
       - name: Post to a Slack channel
         id: slack
-        uses: slackapi/slack-github-action@v1.27.0
+        uses: slackapi/slack-github-action@v2.0.0
         with:
           channel-id: 'C07SVQSCR6F'
           slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
 
@@ -24,7 +24,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        node-version: [20]
+        node-version: [20, 22]
+      fail-fast: true
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -42,6 +43,6 @@ jobs:
         working-directory: ./frontend
         run: npm run test:coverage
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
+        uses: codecov/codecov-action@v5
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -56,7 +56,7 @@ jobs:
           docker-images: false
           swap-storage: true
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3.0.0
+        uses: docker/setup-qemu-action@v3.3.0
         with:
           image: tonistiigi/binfmt:latest
       - name: Login to GHCR
@@ -119,7 +119,7 @@ jobs:
           docker-images: false
           swap-storage: true
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3.0.0
+        uses: docker/setup-qemu-action@v3.3.0
         with:
           image: tonistiigi/binfmt:latest
       - name: Login to GHCR
@@ -293,7 +293,7 @@ jobs:
           RUN_AS_OPENHANDS=false \
           poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
+        uses: codecov/codecov-action@v5
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
@@ -370,7 +370,7 @@ jobs:
           RUN_AS_OPENHANDS=true \
           poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime --ignore=tests/runtime/test_browsergym_envs.py
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v4
+        uses: codecov/codecov-action@v5
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
 
@@ -56,6 +56,7 @@ jobs:
           LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 10
         run: |
           echo "[llm.eval]" > config.toml
           echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -70,7 +71,7 @@ jobs:
         env:
           SANDBOX_FORCE_REBUILD_RUNTIME: True
         run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'haiku_run'
 
           # get integration tests report
           REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
@@ -88,6 +89,7 @@ jobs:
           LLM_MODEL: "litellm_proxy/deepseek-chat"
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 10
         run: |
           echo "[llm.eval]" > config.toml
           echo "model = \"$LLM_MODEL\"" >> config.toml
@@ -99,7 +101,7 @@ jobs:
         env:
           SANDBOX_FORCE_REBUILD_RUNTIME: True
         run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' 10 $N_PROCESSES '' 'deepseek_run'
 
           # get integration tests report
           REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
@@ -109,11 +111,75 @@ jobs:
           echo >> $GITHUB_ENV
           echo "EOF" >> $GITHUB_ENV
 
+      # -------------------------------------------------------------
+      # Run DelegatorAgent tests for Haiku, limited to t01 and t02
+      - name: Wait a little bit (again)
+        run: sleep 5
+
+      - name: Configure config.toml for testing DelegatorAgent (Haiku)
+        env:
+          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 30
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DelegatorAgent (Haiku)
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_haiku_run'
+
+          # Find and export the delegator test results
+          REPORT_FILE_DELEGATOR_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/*haiku*_maxiter_30_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE_DELEGATOR_HAIKU: $REPORT_FILE_DELEGATOR_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_DELEGATOR_HAIKU >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      # -------------------------------------------------------------
+      # Run DelegatorAgent tests for DeepSeek, limited to t01 and t02
+      - name: Wait a little bit (again)
+        run: sleep 5
+
+      - name: Configure config.toml for testing DelegatorAgent (DeepSeek)
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          MAX_ITERATIONS: 30
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DelegatorAgent (DeepSeek)
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD DelegatorAgent '' 30 $N_PROCESSES "t01_fix_simple_typo,t02_add_bash_hello" 'delegator_deepseek_run'
+
+          # Find and export the delegator test results
+          REPORT_FILE_DELEGATOR_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/DelegatorAgent/deepseek*_maxiter_30_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE_DELEGATOR_DEEPSEEK: $REPORT_FILE_DELEGATOR_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
       - name: Create archive of evaluation outputs
         run: |
           TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
           cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
-          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/*  # Only include the actual result directories
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/*  # Only include the actual result directories
 
       - name: Upload evaluation results as artifact
         uses: actions/upload-artifact@v4
@@ -154,5 +220,11 @@ jobs:
               **Integration Tests Report (DeepSeek)**
               DeepSeek LLM Test Results:
               ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+              ---
+              **Integration Tests Report Delegator (Haiku)**
+              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_HAIKU }}
+              ---
+              **Integration Tests Report Delegator (DeepSeek)**
+              ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }}
               ---
               Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
@@ -53,3 +53,16 @@ jobs:
         run: pip install pre-commit==3.7.0
       - name: Run pre-commit hooks
         run: pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
+
+  # Check version consistency across documentation
+  check-version-consistency:
+    name: Check version consistency
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.12
+      - name: Run version consistency check
+        run: .github/scripts/check_version_consistency.py