Merged
7 changes: 6 additions & 1 deletion vlmeval/config.py
@@ -200,6 +200,11 @@
'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}


janus_series = {
'janus_1.3b': partial(Janus, model_path='deepseek-ai/Janus-1.3B')
}

cogvlm_series = {
'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
@@ -318,7 +323,7 @@
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
deepseekvl_series, minicpm_series, cogvlm_series, wemm_series,
deepseekvl_series, janus_series, minicpm_series, cogvlm_series, wemm_series,
cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
slime_series, eagle_series, moondream_series, llama_series, molmo_series,
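For reference, each `*_series` dict maps a model name to a `functools.partial`, so construction is deferred until the registry entry is called. A minimal sketch of how the new entry resolves, assuming the `Janus` wrapper added in this PR is importable from `vlmeval.vlm` (the call site below is illustrative, not part of the diff):

# Illustrative sketch only: how a janus_series entry resolves to a model instance.
from functools import partial
from vlmeval.vlm import Janus  # exported via vlmeval/vlm/__init__.py in this PR

janus_series = {
    'janus_1.3b': partial(Janus, model_path='deepseek-ai/Janus-1.3B'),
}

# Calling the entry instantiates the wrapper with the bound model_path
# (requires a GPU and the janus package installed from source).
model = janus_series['janus_1.3b']()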
2 changes: 1 addition & 1 deletion vlmeval/tools.py
@@ -71,7 +71,7 @@
'TransCore_M', 'emu2_chat', 'MiniCPM-V', 'MiniCPM-V-2', 'OmniLMM_12B',
'cogvlm-grounding-generalist', 'cogvlm-chat', 'cogvlm2-llama3-chat-19B',
'mPLUG-Owl3'
] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(cambrian_series),
] + list(xtuner_series) + list(yivl_series) + list(deepseekvl_series) + list(janus_series) + list(cambrian_series),
'4.36.2': ['Moondream1'],
'4.40.0': [
'idefics2_8b', 'Bunny-llama3-8B', 'MiniCPM-Llama3-V-2_5', '360VL-70B', 'Phi-3-Vision',
1 change: 1 addition & 0 deletions vlmeval/vlm/__init__.py
@@ -28,6 +28,7 @@
from .yi_vl import Yi_VL
from .internvl_chat import InternVLChat
from .deepseek_vl import DeepSeekVL
from .janus import Janus
from .mgm import Mini_Gemini
from .bunnyllama3 import BunnyLLama3
from .vxverse import VXVERSE
130 changes: 130 additions & 0 deletions vlmeval/vlm/janus.py
@@ -0,0 +1,130 @@
import sys
import torch
from transformers import AutoModelForCausalLM, AutoConfig
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class Janus(BaseModel):

INSTALL_REQ = True
INTERLEAVE = True

def check_install(self):
try:
import janus
except Exception as e:
            logging.critical(
                'Please install janus from source first: https://github.com/deepseek-ai/Janus')
raise e

def __init__(self, model_path='deepseek-ai/Janus-1.3B', **kwargs):
self.check_install()
assert model_path is not None
self.model_path = model_path
from janus.models import VLChatProcessor

self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
self.tokenizer = self.vl_chat_processor.tokenizer

model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
self.model = model.to(torch.bfloat16).cuda().eval()

torch.cuda.empty_cache()
        default_kwargs = dict(
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
            output_logits=False,
            output_scores=False,
            return_dict_in_generate=False,
        )
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

def prepare_inputs(self, message):
def prepare_itlist(msgs):
content, images = '', []
for s in msgs:
if s['type'] == 'image':
images.append(s['value'])
content += '<image_placeholder>'
elif s['type'] == 'text':
content += s['value']
return content, images
conversation = []
if 'role' not in message[0]:
content, images = prepare_itlist(message)
conversation.append(dict(role='User', content=content, images=images))
else:
role_map = {'user': 'User', 'assistant': 'Assistant'}
for msgs in message:
role = role_map[msgs['role']]
content, images = prepare_itlist(msgs['content'])
conversation.append(dict(role=role, content=content, images=images))
conversation.append(dict(role='Assistant', content=''))
return conversation

def generate_inner(self, message, dataset=None):
        if dataset is not None and 'MMVet' in dataset:
            self.vl_chat_processor.system_prompt = (
                "You are a helpful assistant. Please answer truthfully and write out "
                "your thinking step by step to be sure you get the right answer."
            )
        else:
            self.vl_chat_processor.system_prompt = ""

conversation = self.prepare_inputs(message)
from janus.utils.io import load_pil_images
pil_images = load_pil_images(conversation)
prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
prepare_inputs = prepare_inputs.to(self.model.device, dtype=torch.bfloat16)
inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

outputs = self.model.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
**self.kwargs)
answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
return answer

def chat_inner(self, message, dataset=None):
return self.generate_inner(message, dataset=dataset)

def use_custom_prompt(self, dataset):
assert dataset is not None
        if DATASET_TYPE(dataset) in ('Y/N', 'MCQ') or dataset == 'MMVet':
return True
return False

def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if DATASET_TYPE(dataset) == 'Y/N':
if dataset == 'POPE':
question = question.replace(" Please answer yes or no.", "")
prompt = '\n' + question + "\nAnswer the question using a single word or phrase."
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'

hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'\nHint: {hint}\n' if hint is not None else '\n'
prompt += f'{question}\n'
prompt += (
f"{options_prompt}\nAnswer with the option's letter from the given choices directly."
if len(options) else 'Answer the question directly. '
)
elif dataset == 'MMVet':
prompt = '\n' + question
else:
raise NotImplementedError

message = [dict(type='image', value=s) for s in tgt_path]
message.extend([dict(type='text', value=prompt)])
return message
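For reference, a minimal usage sketch of the interleaved message format that `prepare_inputs` expects, going through the standard VLMEvalKit registry; the image path, question, and dataset name are placeholders, not part of this PR:

# Illustrative sketch only: query the new wrapper with VLMEvalKit's
# interleaved message format (a list of {'type', 'value'} dicts).
from vlmeval.config import supported_VLM

model = supported_VLM['janus_1.3b']()  # resolves the partial registered in config.py
message = [
    dict(type='image', value='demo.jpg'),                    # rendered as an <image_placeholder>
    dict(type='text', value='Describe the image briefly.'),  # appended after the placeholder
]
print(model.generate(message, dataset='MMVet'))  # dataset name chosen for illustration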
