
Add Janus-1.3B #541


Merged
merged 3 commits on Oct 23, 2024
7 changes: 6 additions & 1 deletion vlmeval/config.py
@@ -200,6 +200,11 @@
    'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}


janus_series = {
    'janus_1.3b': partial(Janus, model_path='deepseek-ai/Janus-1.3B')
}

cogvlm_series = {
    'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
    'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
@@ -318,7 +323,7 @@
    ungrouped, api_models,
    xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
    xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
    deepseekvl_series, minicpm_series, cogvlm_series, wemm_series,
    deepseekvl_series, janus_series, minicpm_series, cogvlm_series, wemm_series,
    cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
    mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
    slime_series, eagle_series, moondream_series, llama_series, molmo_series,
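
With janus_series registered in vlmeval/config.py and added to the model group list above, the new entry should be reachable through VLMEvalKit's usual entry points. A minimal sketch, not part of this PR, assuming the Janus package is installed and a CUDA device is available; supported_VLM is the existing registry built from these series dicts, and the image path is a placeholder:

    from vlmeval.config import supported_VLM

    model = supported_VLM['janus_1.3b']()  # builds the Janus wrapper added in vlmeval/vlm/janus.py
    message = [dict(type='image', value='demo.jpg'), dict(type='text', value='Describe the image.')]
    print(model.generate(message))

The same name should also work with the evaluation CLI, e.g. python run.py --model janus_1.3b --data MMVet.
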
2 changes: 1 addition & 1 deletion vlmeval/tools.py
@@ -71,7 +71,7 @@
        'TransCore_M', 'emu2_chat', 'MiniCPM-V', 'MiniCPM-V-2', 'OmniLMM_12B',
        'cogvlm-grounding-generalist', 'cogvlm-chat', 'cogvlm2-llama3-chat-19B',
        'mPLUG-Owl3'
    ] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(cambrian_series),
    ] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(janus_series) + list(cambrian_series),
    '4.36.2': ['Moondream1'],
    '4.40.0': [
        'idefics2_8b', 'Bunny-llama3-8B', 'MiniCPM-Llama3-V-2_5', '360VL-70B', 'Phi-3-Vision',
1 change: 1 addition & 0 deletions vlmeval/vlm/__init__.py
@@ -28,6 +28,7 @@
from .yi_vl import Yi_VL
from .internvl_chat import InternVLChat
from .deepseek_vl import DeepSeekVL
from .janus import Janus
from .mgm import Mini_Gemini
from .bunnyllama3 import BunnyLLama3
from .vxverse import VXVERSE
130 changes: 130 additions & 0 deletions vlmeval/vlm/janus.py
@@ -0,0 +1,130 @@
import sys
import torch
from transformers import AutoModelForCausalLM, AutoConfig
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class Janus(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

    def check_install(self):
        try:
            import janus
        except Exception as e:
            logging.critical(
                'Please first install janus from source code at: https://github.com/deepseek-ai/Janus')
            raise e

    def __init__(self, model_path='deepseek-ai/Janus-1.3B', **kwargs):
        self.check_install()
        assert model_path is not None
        self.model_path = model_path
        from janus.models import VLChatProcessor

        self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer

        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
        self.model = model.to(torch.bfloat16).cuda().eval()

        torch.cuda.empty_cache()
        default_kwargs = dict(
            max_new_tokens=512, do_sample=False, use_cache=True,
            output_logits=False, output_scores=False, return_dict_in_generate=False)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def prepare_inputs(self, message):
        # Convert a VLMEvalKit message into the conversation format expected by VLChatProcessor:
        # each image becomes an '<image_placeholder>' token in the text plus an entry in `images`.
        def prepare_itlist(msgs):
            content, images = '', []
            for s in msgs:
                if s['type'] == 'image':
                    images.append(s['value'])
                    content += '<image_placeholder>'
                elif s['type'] == 'text':
                    content += s['value']
            return content, images

        conversation = []
        if 'role' not in message[0]:
            content, images = prepare_itlist(message)
            conversation.append(dict(role='User', content=content, images=images))
        else:
            role_map = {'user': 'User', 'assistant': 'Assistant'}
            for msgs in message:
                role = role_map[msgs['role']]
                content, images = prepare_itlist(msgs['content'])
                conversation.append(dict(role=role, content=content, images=images))
        conversation.append(dict(role='Assistant', content=''))
        return conversation

    def generate_inner(self, message, dataset=None):
        # MMVet is evaluated with a step-by-step system prompt; all other datasets use an empty one.
        if dataset is not None and 'MMVet' in dataset:
            self.vl_chat_processor.system_prompt = "You are a helpful assistant. Please answer truthfully and write out your thinking step by step to be sure you get the right answer."
        else:
            self.vl_chat_processor.system_prompt = ""

        conversation = self.prepare_inputs(message)
        from janus.utils.io import load_pil_images
        pil_images = load_pil_images(conversation)
        prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
        prepare_inputs = prepare_inputs.to(self.model.device, dtype=torch.bfloat16)
        inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

        outputs = self.model.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            **self.kwargs)
        answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer

    def chat_inner(self, message, dataset=None):
        return self.generate_inner(message, dataset=dataset)

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) in ['Y/N', 'MCQ'] or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        if DATASET_TYPE(dataset) == 'Y/N':
            if dataset == 'POPE':
                question = question.replace(" Please answer yes or no.", "")
            prompt = '\n' + question + "\nAnswer the question using a single word or phrase."
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'\nHint: {hint}\n' if hint is not None else '\n'
            prompt += f'{question}\n'
            prompt += (
                f"{options_prompt}\nAnswer with the option's letter from the given choices directly."
                if len(options) else 'Answer the question directly. '
            )
        elif dataset == 'MMVet':
            prompt = '\n' + question
        else:
            raise NotImplementedError

        message = [dict(type='image', value=s) for s in tgt_path]
        message.extend([dict(type='text', value=prompt)])
        return message
