18 changes: 18 additions & 0 deletions tests/python_tests/samples/test_tools_llm_benchmark.py
@@ -234,3 +234,21 @@ def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_a
"-m", convert_model,
] + sample_args
run_sample(benchmark_py_command)


@pytest.mark.samples
@pytest.mark.parametrize("convert_model", ["ms-marco-TinyBERT-L2-v2"], indirect=True)
@pytest.mark.parametrize("sample_args", [
["-d", "cpu", "-n", "2", "--rerank"],
["-d", "cpu", "-n", "2", "--reranking_max_length", "10", "--reranking_top_n", "1", "--rerank"],
["-d", "cpu", "-n", "2", "--optimum", "--rerank"],
["-d", "cpu", "-n", "1", "--reranking_max_length", "10", "--reranking_top_n", "1", "--optimum", "--rerank"]
])
def test_python_tool_llm_benchmark_text_reranking(self, convert_model, sample_args):
benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py')
benchmark_py_command = [
sys.executable,
benchmark_script,
"-m", convert_model,
] + sample_args
run_sample(benchmark_py_command)
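For local verification, the new parametrized test can be selected on its own. The invocation below is an illustrative sketch; any pytest options beyond the file path and the markers shown in the diff are assumptions:

import subprocess
import sys

# Run only the new reranking sample test; "-m samples" matches the pytest.mark.samples marker above.
subprocess.run(
    [
        sys.executable, "-m", "pytest",
        "tests/python_tests/samples/test_tools_llm_benchmark.py",
        "-k", "text_reranking",
        "-m", "samples",
    ],
    check=True,
)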
14 changes: 13 additions & 1 deletion tools/llm_bench/benchmark.py
@@ -19,6 +19,7 @@
import task.speech_to_text_generation as bench_speech
import task.text_embeddings as bench_text_embed
import task.text_to_speech_generation as bench_text_to_speech
import task.text_reranker as bench_text_rerank

DEFAULT_TORCH_THREAD_NUMS = 16

@@ -196,6 +197,16 @@ def get_argprser():
parser.add_argument("--embedding_normalize", action="store_true", help="Normalize embeddings. Applicable only for text embeddings")
parser.add_argument("--embedding_max_length", type=int, default=None,
help="Max length for text embeddings. Input text will be padded or truncated to specified value")
parser.add_argument("--reranking_max_length", type=int, default=None,
help="Max length for text reranking. Input text will be padded or truncated to specified value")
parser.add_argument("--reranking_top_n", type=int, default=None,
help="Number of top results to return for text reranking")
parser.add_argument("--texts", nargs='+', default=None,
help="List of candidates for reranking based on their relevance to a prompt(query). Applicable for Text Rerank pipeline.")
parser.add_argument('--texts_file', nargs='+', default=None,
help='Texts file(s) in jsonl format with candidates for reranking based on relevance to a prompt (query). '
'Multiple files should be separated with space(s). Applicable for Text Rerank pipeline.')
parser.add_argument('--rerank', action='store_true', help='Benchmark reranking pipeline.')
parser.add_argument("--apply_chat_template", action="store_true",
help="Apply chat template for LLM. By default chat template is not applied. It's better to use with --disable_prompt_permutation,"
" otherwise the prompt will be modified after applying the chat template, so the structure of chat template will not be kept.")
@@ -214,7 +225,8 @@ def get_argprser():
'speech2text': bench_speech.run_speech_2_txt_benchmark,
"vlm": bench_vlm.run_visual_language_generation_benchmark,
"text_embed": bench_text_embed.run_text_embddings_benchmark,
"text2speech": bench_text_to_speech.run_text_2_speech_benchmark
"text2speech": bench_text_to_speech.run_text_2_speech_benchmark,
"text_rerank": bench_text_rerank.run_text_reranker_benchmark
}


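The flags above combine into a typical end-to-end reranking run. The sketch below is illustrative: the model path is a placeholder for a locally exported ms-marco-TinyBERT-L2-v2, and the max-length value is arbitrary.

import subprocess
import sys

cmd = [
    sys.executable, "tools/llm_bench/benchmark.py",
    "-m", "models/ms-marco-TinyBERT-L2-v2",  # assumed path to an exported OpenVINO model
    "-d", "cpu",
    "-n", "2",
    "--rerank",                               # selects the text_rerank use case
    "--reranking_top_n", "1",
    "--reranking_max_length", "128",
    "--texts_file", "tools/llm_bench/prompts/texts_for_rerank.jsonl",
]
subprocess.run(cmd, check=True)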
17 changes: 13 additions & 4 deletions tools/llm_bench/llm_bench_utils/config_class.py
@@ -9,7 +9,8 @@
AutoModel,
SpeechT5ForTextToSpeech,
SpeechT5Processor,
SpeechT5HifiGan
SpeechT5HifiGan,
AutoModelForSequenceClassification
)
from diffusers.pipelines import DiffusionPipeline, LDMSuperResolutionPipeline
from optimum.intel.openvino import (
@@ -21,7 +22,8 @@
OVPipelineForInpainting,
OVPipelineForImage2Image,
OVModelForFeatureExtraction,
OVModelForTextToSpeechSeq2Seq
OVModelForTextToSpeechSeq2Seq,
OVModelForSequenceClassification
)
from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel

@@ -31,7 +33,8 @@
't5': AutoTokenizer,
'blenderbot': AutoTokenizer,
'falcon': AutoTokenizer,
'speecht5': SpeechT5Processor
'speecht5': SpeechT5Processor,
'bert': AutoTokenizer
}

TEXT_TO_SPEECH_VOCODER_CLS = SpeechT5HifiGan
@@ -42,6 +45,10 @@

IMAGE_TO_IMAGE_GEN_CLS = OVPipelineForImage2Image

TEXT_RERANK_GEN_CLS = OVModelForSequenceClassification

TEXT_RERANK_PT_GEN_CLS = AutoModelForSequenceClassification

OV_MODEL_CLASSES_MAPPING = {
'decoder': OVModelForCausalLM,
't5': OVModelForSeq2SeqLM,
@@ -136,7 +143,7 @@
"gptj"
],
'ldm_super_resolution': ['ldm-super-resolution'],
'text_embed': ["bge", "bert", "albert", "roberta", "xlm-roberta"],
'rag': ["bge", "bert", "albert", "roberta", "xlm-roberta"],
'text2speech': ['speecht5'],
}

@@ -148,7 +155,9 @@
'code_gen': 'decoder',
'ldm_super_resolution': 'ldm_super_resolution',
"vlm": "vlm",
'rag': 'bert',
'text_embed': 'bert',
'text_rerank': 'bert',
'text2speech': 'speecht5',
}

6 changes: 3 additions & 3 deletions tools/llm_bench/llm_bench_utils/hook_common.py
@@ -9,13 +9,13 @@
TRANS_MIN_VERSION = '4.40.0'


def get_bench_hook(num_beams, ov_model, embed=False):
def get_bench_hook(num_beams, ov_model, rag=False):
min_version = version.parse(TRANS_MIN_VERSION)
trans_version = version.parse(transformers.__version__)
search_type = 'beam search' if num_beams > 1 else 'greedy search'
if embed:
if rag:
import llm_bench_utils.hook_forward
bench_hook = llm_bench_utils.hook_forward.EmbedForwardHook()
bench_hook = llm_bench_utils.hook_forward.RAGForwardHook()
bench_hook.new_forward(ov_model)
return bench_hook

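The renamed RAGForwardHook (see hook_forward.py below) keeps the same timing role for both embedding and reranking models. As a rough illustration of the pattern, and not the project's actual implementation, such a hook wraps the model's forward call and records per-call latency:

import time
import types

class MinimalForwardTimingHook:
    # Illustrative only: records wall-clock latency of each forward() call.
    def __init__(self):
        self.tm_list = []

    def new_forward(self, model):
        original_forward = model.forward

        def timed_forward(wrapped_model, *args, **kwargs):
            start = time.perf_counter()
            result = original_forward(*args, **kwargs)
            self.tm_list.append(time.perf_counter() - start)
            return result

        model.forward = types.MethodType(timed_forward, model)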
2 changes: 1 addition & 1 deletion tools/llm_bench/llm_bench_utils/hook_forward.py
@@ -108,7 +108,7 @@ def my_vae_decoder(inputs, share_inputs=True, **kwargs):
pipe.vae_decoder.request = my_vae_decoder


class EmbedForwardHook:
class RAGForwardHook:
def __init__(self):
self.tm_list = []
self.tm_infer_list = []
24 changes: 15 additions & 9 deletions tools/llm_bench/llm_bench_utils/metrics_print.py
@@ -4,10 +4,10 @@
import logging as log


def print_metrics(
iter_num, iter_data, tms=None, tms_infer=None, warm_up=False,
stable_diffusion=None, tokenization_time=None, batch_size=1, prompt_idx=-1, whisper=None, text_emb=None, latency_unit=None, tts=None, cb_metric=None
):
def print_metrics(iter_num, iter_data, tms=None, tms_infer=None, warm_up=False,
stable_diffusion=None, tokenization_time=None, batch_size=1,
prompt_idx=-1, whisper=None, text_emb=None, latency_unit=None,
tts=None, cb_metric=None, text_rerank=None):
iter_str = str(iter_num)
if warm_up:
iter_str = 'warm-up'
@@ -42,15 +42,21 @@ def print_metrics(
iter_data['other_tokens_avg_latency'] = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1
first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms"
other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}"
if text_emb is None:
if text_emb is not None:
log.info(
f'{prefix} First token latency: {first_token_latency}, '
f'other tokens latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
f"{prefix} First iteration latency: {first_token_latency}, "
f'other iterations latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
)
elif text_rerank is not None:
log.info(
f"{prefix} First iteration latency: {first_token_latency}, "
f'other iterations latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]}, '
f'texts number: {text_rerank.get("texts_num", -1)}',
)
else:
log.info(
f'{prefix} First iteration latency: {first_token_latency}, '
f'other iterations latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
f"{prefix} First token latency: {first_token_latency}, "
f'other tokens latency: {other_token_latency}, len of input tokens: {iter_data["input_size"]} * {batch_size}',
)
if len(tms) == 0:
log.warning(f'{prefix} No hook data output for first token latency and other tokens latency')
9 changes: 9 additions & 0 deletions tools/llm_bench/llm_bench_utils/model_utils.py
@@ -42,6 +42,8 @@ def get_param_from_file(args, input_key):
if args[input_key] is None:
if args['use_case'] in ['text_gen', 'text_embed', 'text2speech']:
data_list.append('What is OpenVINO?')
elif args['use_case'] in ['text_rerank']:
data_list.append("What are the main features of Intel Core Ultra processors?")
elif args['use_case'] == 'code_gen':
data_list.append('def print_hello_world():')
elif args['use_case'] == 'image_gen':
@@ -137,6 +139,11 @@ def analyze_args(args):
model_args['emb_pooling_type'] = args.embedding_pooling
model_args['emb_normalize'] = args.embedding_normalize
model_args["emb_max_length"] = args.embedding_max_length
model_args['rerank_max_length'] = args.reranking_max_length
model_args["rerank_top_n"] = args.reranking_top_n
model_args["rerank_texts"] = args.texts
model_args["rerank_texts_file"] = args.texts_file
model_args["rerank"] = args.rerank
model_args["apply_chat_template"] = args.apply_chat_template

optimum = args.optimum
@@ -177,6 +184,8 @@
raise RuntimeError(f'==Failure FOUND==: Incorrect model path:{model_path}')
if model_framework in ('ov', 'pt'):
use_case, model_name = get_use_case(args.model)
if use_case == 'rag':
use_case = 'text_rerank' if args.rerank else 'text_embed'
model_args['use_case'] = use_case
if use_case == 'code_gen' and not model_args['prompt'] and not model_args['prompt_file']:
model_args['prompt'] = 'def print_hello_world():'
78 changes: 77 additions & 1 deletion tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -11,6 +11,7 @@
import copy
import types
from llm_bench_utils.hook_common import get_bench_hook
from llm_bench_utils.memory_monitor import MemMonitorWrapper
from llm_bench_utils.hook_forward import MeanStdPair, RawImGenPerfMetrics
from llm_bench_utils.model_utils import get_version_in_format_to_pars
from llm_bench_utils.config_class import (
@@ -21,6 +22,7 @@
INPAINTING_IMAGE_GEN_CLS,
IMAGE_TO_IMAGE_GEN_CLS,
TEXT_TO_SPEECH_VOCODER_CLS,
TEXT_RERANK_GEN_CLS,
PA_ATTENTION_BACKEND
)
from transformers import pipeline
@@ -764,7 +766,7 @@ def forward_with_pooling(self, input_ids, attention_mask, token_type_ids=None, *
if kwargs.get("mem_consumption"):
memory_data_collector.stop_and_collect_data('compilation_phase')
memory_data_collector.log_data(compilation_phase=True)
bench_hook = get_bench_hook(1, ov_model, embed=True)
bench_hook = get_bench_hook(1, ov_model, rag=True)
from_pretrained_time = end - start
log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
return ov_model, tokenizer, from_pretrained_time, bench_hook, False
@@ -1144,3 +1146,77 @@ def _is_chinese_char(self, cp):
): #
return True
return False


def create_genai_text_reranker_model(model_path: Path, device: str, memory_monitor: MemMonitorWrapper, tokenizer: AutoTokenizer, **kwargs):
import openvino_genai

config = openvino_genai.TextRerankPipeline.Config()
if kwargs.get("rerank_top_n") is not None:
config.top_n = kwargs.get("rerank_top_n")
if kwargs.get("rerank_max_length") is not None:
config.max_length = kwargs.get("rerank_max_length")

ov_config = kwargs['config']

if kwargs.get("mem_consumption"):
memory_monitor.start()
start = time.perf_counter()
pipe = openvino_genai.TextRerankPipeline(model_path, device.upper(), config, **ov_config)
end = time.perf_counter()

log.info("Selected OpenVINO GenAI for benchmarking")
if kwargs.get("mem_consumption"):
memory_monitor.stop_and_collect_data('compilation_phase')
memory_monitor.log_data('for compilation phase')
log.info(f'Pipeline initialization time: {end - start:.2f}s')
return pipe, tokenizer, end - start, None, True
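Outside the benchmark flow, the GenAI pipeline configured above can be exercised directly. The sketch assumes the pipeline exposes a rerank(query, texts) call returning (index, score) pairs, as in recent openvino_genai releases; the model path and inputs are illustrative.

import openvino_genai

config = openvino_genai.TextRerankPipeline.Config()
config.top_n = 2
pipe = openvino_genai.TextRerankPipeline("models/ms-marco-TinyBERT-L2-v2", "CPU", config)

query = "What are the main features of Intel Core Ultra processors?"
texts = [
    "Intel Core Ultra processors incorporate an AI-optimized architecture.",
    "The weather in Munich is usually mild in spring.",
]
for index, score in pipe.rerank(query, texts):  # assumed return shape: (index, score) pairs
    print(index, score)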


def create_text_reranker_model(model_path: Path, device: str, memory_monitor: MemMonitorWrapper, **kwargs):
if model_path.name.endswith('xml'):
model_path = model_path.parents[2]

ov_config = kwargs['config']

model_path_existed = Path(model_path).exists()
# load model
if not model_path_existed:
raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')

trust_remote_code = False
try:
tokenizer = AutoTokenizer.from_pretrained(model_path)
except Exception:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
trust_remote_code = True
if kwargs.get("genai", True) and is_genai_available(log_msg=True):
try:
return create_genai_text_reranker_model(model_path, device, memory_monitor, tokenizer, **kwargs)
except Exception as exp:
log.warning(
f"Model is not supported by OpenVINO GenAI. "
f"GenAI pipeline loading failed with following error: {exp}"
"Benchmark will be switched to Optimum Intel pipeline realization"
)

log.info("Selected Optimum Intel for benchmarking")
if kwargs.get("mem_consumption"):
memory_monitor.start()
start = time.perf_counter()
ov_model = TEXT_RERANK_GEN_CLS.from_pretrained(
model_path,
device=device,
ov_config=ov_config,
trust_remote_code=trust_remote_code,
use_cache=False
)
end = time.perf_counter()

if kwargs.get("mem_consumption"):
memory_monitor.stop_and_collect_data('compilation_phase')
memory_monitor.log_data('for compilation phase')
bench_hook = get_bench_hook(1, ov_model, rag=True)
from_pretrained_time = end - start
log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
return ov_model, tokenizer, from_pretrained_time, bench_hook, False
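For reference, the Optimum Intel fallback loads the reranker as a sequence-classification (cross-encoder) model, which scores each (query, candidate) pair with a single relevance logit. A minimal scoring sketch, with placeholder model path and inputs:

from optimum.intel.openvino import OVModelForSequenceClassification
from transformers import AutoTokenizer

model_id = "models/ms-marco-TinyBERT-L2-v2"  # assumed local OpenVINO export
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForSequenceClassification.from_pretrained(model_id)

query = "What are the main features of Intel Core Ultra processors?"
texts = [
    "Intel Core Ultra processors incorporate an AI-optimized architecture.",
    "The commercial PC market is propelled by premium computing solutions.",
]
inputs = tokenizer([query] * len(texts), texts, padding=True, truncation=True, return_tensors="pt")
scores = model(**inputs).logits.squeeze(-1)  # one relevance score per candidate
order = scores.argsort(descending=True)
print([(int(i), float(scores[i])) for i in order])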
58 changes: 54 additions & 4 deletions tools/llm_bench/llm_bench_utils/pt_utils.py
@@ -1,14 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2023-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import torch
from llm_bench_utils.config_class import PT_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES, TEXT_TO_SPEECH_VOCODER_CLS
import os
import time
import torch
import json
import logging as log
from pathlib import Path

from llm_bench_utils.memory_monitor import MemMonitorWrapper
from llm_bench_utils.config_class import (
PT_MODEL_CLASSES_MAPPING,
TOKENIZE_CLASSES_MAPPING,
DEFAULT_MODEL_CLASSES,
TEXT_TO_SPEECH_VOCODER_CLS,
TEXT_RERANK_PT_GEN_CLS
)
import llm_bench_utils.hook_common as hook_common
import json


def set_bf16(model, device, **kwargs):
@@ -268,3 +276,45 @@ def create_ldm_super_resolution_model(model_path, device, memory_data_collector,
compiled_model = run_torch_compile(pipe, backend, memory_data_collector if kwargs.get("mem_consumption") else None)
pipe = compiled_model
return pipe, from_pretrain_time


def create_text_reranker_model(model_path: Path, device: str, memory_monitor: MemMonitorWrapper, **kwargs):
if not model_path.exists():
raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist')
if not device:
raise RuntimeError('==Failure ==: no device to load')
if not model_path.is_dir() or len(os.listdir(model_path)) == 0:
raise RuntimeError(f'==Failure ==: model path:{model_path} is not a directory or the directory is empty')

log.info(f'Load text reranker model from model path:{model_path}')
default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type])
if kwargs.get("mem_consumption"):
memory_monitor.start()
start = time.perf_counter()
pipe = TEXT_RERANK_PT_GEN_CLS.from_pretrained(model_path)
pipe = set_bf16(pipe, device, **kwargs)
end = time.perf_counter()
if kwargs.get("mem_consumption"):
memory_monitor.stop_and_collect_data('from_pretrained_phase')
memory_monitor.log_data('for from pretrained phase')
from_pretrain_time = end - start
processor = token_class.from_pretrained(model_path)
log.info(f'Model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s')

# If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
if device.upper() == 'GPU':
device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable')
else:
device = torch.device(device.lower())
log.info(f'Torch device was set to: {device}')

pipe.to(device)

if kwargs['torch_compile_backend']:
backend = kwargs['torch_compile_backend']
compiled_model = run_torch_compile(pipe, backend, memory_monitor if kwargs.get("mem_consumption") else None)
pipe = compiled_model

return pipe, processor, from_pretrain_time, None, False
2 changes: 2 additions & 0 deletions tools/llm_bench/prompts/texts_for_rerank.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"text": "The commercial PC market is propelled by premium computing solutions that drive user productivity and help service organizations protect and maintain devices. Corporations must empower mobile and hybrid workers while extracting value from artificial intelligence (AI) to improve business outcomes. Moreover, both public and private sectors must address sustainability initiatives pertaining to the full life cycle of computing fleets. An inflection point in computing architecture is needed to stay ahead of evolving requirements. Introducing Intel® Core™ Ultra Processors Intel® Core™ Ultra processors shape the future of commercial computing in four major ways: Power Efficiency The new product line features a holistic approach to powerefficiency that benefits mobile work. Substantial changes to the microarchitecture, manufacturing process, packaging technology, and power management software result in up to 40% lower processor power consumption for modern tasks such as video conferencing with a virtual camera. Artificial Intelligence Intel Core Ultra processors incorporate an AI-optimized architecture that supports new user experiences and the next wave of commercial applications. The CPU, GPU, and the new neural processing unit (NPU) are all capable of executing AI tasks as directed by application developers. For example, elevated mobile collaboration is possible with support for AI assisted background blur, noise suppression, eye tracking, and picture framing. Intel Core Ultra processors are capable of up to 2.5x the AI inference performance per watt as compared to Intel’s previous mobile processor offering."}
{"text": "Intel Core Ultra processors incorporate an AI-optimized architecture that supports new user experiences and the next wave of commercial applications."}
1 change: 1 addition & 0 deletions tools/llm_bench/requirements.txt
@@ -17,3 +17,4 @@ tiktoken
librosa # For Whisper
matplotlib
jinja2>=3.1.0
scipy