Skip to content

[Feature] Add verifier for VQA/MCQ evaluation #424

[Feature] Add verifier for VQA/MCQ evaluation

[Feature] Add verifier for VQA/MCQ evaluation #424

Workflow file for this run

# Triggered by pull requests that target the main branch.
name: pr_run_test
on:
  pull_request:
    branches:
      - main
    # Pull requests that only touch docs/ or Markdown files do not start a run.
    paths-ignore:
      - 'docs/**'
      - '**.md'
# Runs are grouped per workflow and pull request number (falling back to the
# ref when there is no pull request number); a newer run in the same group
# cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
env:
  # Reference scores as a JSON object keyed by dataset, then by model name.
  # Kept on a single line without whitespace; the evaluation step hands it to
  # assert_score.py through --base_score.
  BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2-VL-7B-Instruct":0.8727272727272727,"InternVL2_5-8B":0.8727272727272727,"llava_onevision_qwen2_7b_si":0.8363636363636363},"MMStar_MINI":{"Qwen2-VL-7B-Instruct":0.6266666666666667,"InternVL2_5-8B":0.6333333333333333,"llava_onevision_qwen2_7b_si":0.49333333333333335},"AI2D_MINI":{"Qwen2-VL-7B-Instruct":0.7975708502024291,"InternVL2_5-8B":0.854251012145749,"llava_onevision_qwen2_7b_si":0.8178137651821862},"OCRBench_MINI":{"Qwen2-VL-7B-Instruct":16.6,"InternVL2_5-8B":16.4,"llava_onevision_qwen2_7b_si":12.9}}'
jobs:
  # Runs run.py for every model / dataset-group combination and then calls
  # .github/scripts/assert_score.py with the workflow-level BASE_SCORE value.
  vlm_test:
    # Start the job unless the workflow run has been cancelled.
    if: ${{ !cancelled() }}
    runs-on: [linux-a100]
    strategy:
      # A failing combination does not cancel the remaining ones.
      fail-fast: false
      matrix:
        model:
          - Qwen/Qwen2-VL-7B-Instruct
          - OpenGVLab/InternVL2_5-8B
          - lmms-lab/llava-onevision-qwen2-7b-si
        # Each entry is a space-separated list of datasets handled in one job.
        dataset:
          - "MMBench_V11_MINI MMStar_MINI AI2D_MINI"
          - "OCRBench_MINI"
    container:
      image: kkscilife/vlmevalkit_2:a100
      options: "--gpus=all --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy --pull never"
      volumes:
        - /mnt/187:/mnt/187
    steps:
      - name: clone_repo
        uses: actions/checkout@v3
      - name: evaluation_model
        run: |
          pip install -e .
          # The part of the model id before "/" names the directory under
          # /mnt/187 that is symlinked into the working directory.
          pre_model=$(echo "${{ matrix.model }}" | awk -F'/' '{print $1}')
          ln -s "/mnt/187/$pre_model" .
          # run.py gets the part after "/" as the model name, except for the
          # llava model, whose name is spelled with underscores.
          if [ "${{ matrix.model }}" = "lmms-lab/llava-onevision-qwen2-7b-si" ]; then
            model_name="llava_onevision_qwen2_7b_si"
          else
            model_name=$(echo "${{ matrix.model }}" | awk -F'/' '{print $2}')
          fi
          nvidia-smi
          # matrix.dataset is deliberately left unquoted here so that every
          # dataset in the list is passed as a separate argument.
          python run.py --data ${{ matrix.dataset }} --model "$model_name"
          # Quote the expansions so the JSON in BASE_SCORE always reaches the
          # script as one argument, even if it later gains whitespace.
          python .github/scripts/assert_score.py \
            --dataset "${{ matrix.dataset }}" \
            --base_score "$BASE_SCORE" \
            --model-name "$model_name"