Commit 9724a9c

[llm_bench] Add possibility to run the tool with models in GGUF format
1 parent 1564b2e commit 9724a9c

6 files changed: 102 additions, 75 deletions

tests/python_tests/samples/test_tools_llm_benchmark.py

Lines changed: 25 additions & 8 deletions
@@ -7,6 +7,8 @@
 
 from conftest import SAMPLES_PY_DIR, convert_model, download_test_content
 from test_utils import run_sample
+from data.models import get_gguf_model_list
+from utils.hugging_face import download_gguf_model
 
 convert_draft_model = convert_model
 download_mask_image = download_test_content
@@ -221,9 +223,9 @@ def test_python_tool_llm_benchmark_optimum(self, convert_model, download_test_content):
     @pytest.mark.samples
     @pytest.mark.parametrize("convert_model", ["bge-small-en-v1.5"], indirect=True)
     @pytest.mark.parametrize("sample_args", [
-        ["-d", "cpu", "-n", "2"],
-        ["-d", "cpu", "-n", "2", "--embedding_max_length", "128", "--embedding_normalize", "--embedding_pooling", "mean"],
-        ["-d", "cpu", "-n", "2", "--optimum"],
+        ["-d", "cpu", "-n", "2"],
+        ["-d", "cpu", "-n", "2", "--embedding_max_length", "128", "--embedding_normalize", "--embedding_pooling", "mean"],
+        ["-d", "cpu", "-n", "2", "--optimum"],
         ["-d", "cpu", "-n", "1", "--embedding_max_length", "128", "--embedding_normalize", "--embedding_pooling", "mean", "--optimum"]
     ])
     def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_args):
@@ -234,21 +236,36 @@ def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_args):
             "-m", convert_model,
         ] + sample_args
         run_sample(benchmark_py_command)
-
-
+
     @pytest.mark.samples
     @pytest.mark.parametrize("convert_model", ["ms-marco-TinyBERT-L2-v2"], indirect=True)
     @pytest.mark.parametrize("sample_args", [
-        ["-d", "cpu", "-n", "2", "--rerank"],
+        ["-d", "cpu", "-n", "2", "--rerank"],
         ["-d", "cpu", "-n", "2", "--reranking_max_length", "10", "--reranking_top_n", "1", "--rerank"],
-        ["-d", "cpu", "-n", "2", "--optimum", "--rerank"],
+        ["-d", "cpu", "-n", "2", "--optimum", "--rerank"],
         ["-d", "cpu", "-n", "1", "--reranking_max_length", "10", "--reranking_top_n", "1", "--optimum", "--rerank"]
     ])
     def test_python_tool_llm_benchmark_text_reranking(self, convert_model, sample_args):
         benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py')
         benchmark_py_command = [
             sys.executable,
             benchmark_script,
-            "-m", convert_model,
+            "-m", convert_model,
+        ] + sample_args
+        run_sample(benchmark_py_command)
+
+    @pytest.mark.samples
+    @pytest.mark.parametrize("sample_args", [
+        ["-d", "cpu", "-n", "1"],
+        ["-d", "cpu", "-n", "1", "-f", "pt"],
+    ])
+    def test_python_tool_llm_benchmark_gguf_format(self, sample_args):
+        benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py')
+        gguf_model = get_gguf_model_list()[0]
+        gguf_full_path = download_gguf_model(gguf_model["gguf_model_id"], gguf_model["gguf_filename"])
+        benchmark_py_command = [
+            sys.executable,
+            benchmark_script,
+            "-m", os.path.join(gguf_full_path, gguf_model["gguf_filename"]),
         ] + sample_args
         run_sample(benchmark_py_command)
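The new test_python_tool_llm_benchmark_gguf_format case needs no model conversion: it takes the first entry from get_gguf_model_list(), downloads the .gguf file from Hugging Face via download_gguf_model, and passes the file path directly to -m. The second parametrization adds -f pt so the same file is also run through the PyTorch backend. With a hypothetical downloaded file, the command the test builds is equivalent to:

    python llm_bench/benchmark.py -m /cache/gguf/qwen2-0.5b-instruct-q4_0.gguf -d cpu -n 1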

tools/llm_bench/benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ def num_infer_count_type(x):
 
 def get_argprser():
     parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError)
+    parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files or path to GGUF model', required=TabError)
     parser.add_argument('-d', '--device', default='cpu', help='inference device')
     parser.add_argument('-r', '--report', help='report csv')
     parser.add_argument('-rj', '--report_json', help='report json')
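With this one-line change, -m accepts a path to a single .gguf file in addition to a model directory. A minimal invocation (the model path here is a hypothetical example):

    python tools/llm_bench/benchmark.py -m ~/models/qwen2-0.5b-instruct-q4_0.gguf -d cpu -n 1

Adding -f pt, as the new test does, routes the same file through the PyTorch backend instead of OpenVINO GenAI.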

tools/llm_bench/llm_bench_utils/model_utils.py

Lines changed: 20 additions & 11 deletions
@@ -234,26 +234,35 @@ def analyze_args(args):
     return model_path, model_framework, model_args, model_name
 
 
-def get_use_case(model_name_or_path):
-    config_file = Path(model_name_or_path) / "config.json"
-    config = None
-    if config_file.exists():
-        config = json.loads(config_file.read_text())
+def get_use_case(model_name_or_path: str | Path):
     if (Path(model_name_or_path) / "model_index.json").exists():
         diffusers_config = json.loads((Path(model_name_or_path) / "model_index.json").read_text())
         pipe_type = diffusers_config.get("_class_name")
         if pipe_type in ["StableDiffusionPipeline", "StableDiffusionXLPipeline", "StableDiffusion3Pipeline", "StableDiffusionInpaintPipeline",
                          "StableDiffusionXLInpaintPipeline", "FluxPipeline", "LatentConsistencyModelPipeline"]:
             return "image_gen", pipe_type.replace("Pipeline", "")
 
-    if config is not None:
-        case, model_name = resolve_complex_model_types(config)
-        if case is not None:
-            log.info(f'==SUCCESS FOUND==: use_case: {case}, model_type: {model_name}')
-            return case, model_name
+    model_type = None
+    config_file = Path(model_name_or_path) / "config.json"
+    if config_file.exists():
+        config = json.loads(config_file.read_text())
+        if config is not None:
+            case, model_name = resolve_complex_model_types(config)
+            if case is not None:
+                log.info(f'==SUCCESS FOUND==: use_case: {case}, model_type: {model_name}')
+                return case, model_name
+            model_type = config.get("model_type").lower().replace('_', '-')
+    elif Path(model_name_or_path).suffix in '.gguf':
+        import gguf_parser
+        parser = gguf_parser.GGUFParser(model_name_or_path)
+        parser.parse()
+        if parser.metadata and parser.metadata.get('general.architecture'):
+            model_type = parser.metadata.get('general.architecture').lower()
+
+    if model_type is not None:
         for case, model_ids in USE_CASES.items():
             for idx, model_id in enumerate(normalize_model_ids(model_ids)):
-                if config.get("model_type").lower().replace('_', '-').startswith(model_id):
+                if model_type.startswith(model_id):
                     log.info(f'==SUCCESS FOUND==: use_case: {case}, model_type: {model_id}')
                     return case, model_ids[idx]
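For a .gguf input, get_use_case now reads the use case from the file's own metadata instead of config.json: the general.architecture field plays the role that model_type plays for a Hugging Face config, and is matched against USE_CASES the same way. A standalone sketch of that lookup, assuming a hypothetical local file:

    import gguf_parser

    # 'general.architecture' names the model family, e.g. 'llama' or 'qwen2'
    parser = gguf_parser.GGUFParser('qwen2-0.5b-instruct-q4_0.gguf')  # hypothetical file
    parser.parse()
    print(parser.metadata.get('general.architecture'))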

tools/llm_bench/llm_bench_utils/ov_utils.py

Lines changed: 3 additions & 5 deletions
@@ -216,15 +216,13 @@ def cb_pipeline_required(args):
 
 def create_genai_text_gen_model(model_path, device, ov_config, memory_data_collector, **kwargs):
     import openvino_genai
-    from transformers import AutoTokenizer
     from packaging.version import parse
 
-    if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists():
+    if Path(model_path).suffix not in '.gguf'\
+            and (not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists()):
         raise ValueError("OpenVINO Tokenizer model is not found in model directory. Please convert tokenizer using following command:\n"
                          "convert_tokenizer --with-detokenizer MODEL_DIR --output MODEL_DIR ")
 
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-
     config = {}
     draft_model_path = kwargs.get("draft_model", '')
     cb_config = kwargs.get("cb_config")
@@ -296,7 +294,7 @@ def get_time_list(self):
         return self.token_generation_time
     streamer = TokenStreamer(llm_pipe.get_tokenizer()) if use_streamer_metrics else None
 
-    return llm_pipe, tokenizer, end - start, streamer, True
+    return llm_pipe, None, end - start, streamer, True
 
 
 def convert_ov_tokenizer(tokenizer_path):
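Two things change on the GenAI path: the openvino_tokenizer.xml/openvino_detokenizer.xml check is skipped for .gguf inputs, and the Hugging Face AutoTokenizer is no longer loaded, with None returned in its slot, since openvino_genai.LLMPipeline exposes its own tokenizer (used above for the streamer). A minimal sketch of the underlying call, assuming a hypothetical local file and an OpenVINO GenAI build with GGUF support:

    import openvino_genai

    # the pipeline builds its tokenizer from the GGUF metadata itself
    pipe = openvino_genai.LLMPipeline('qwen2-0.5b-instruct-q4_0.gguf', 'CPU')
    print(pipe.generate('What is OpenVINO?', max_new_tokens=16))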

tools/llm_bench/llm_bench_utils/pt_utils.py

Lines changed: 51 additions & 50 deletions
@@ -66,63 +66,64 @@ def run_torch_compile(model, backend='openvino', dynamic=None, options=None, chi
 
 def create_text_gen_model(model_path, device, memory_data_collector, **kwargs):
     model_path = Path(model_path)
-    from_pretrain_time = 0
-    if model_path.exists():
-        if model_path.is_dir() and len(os.listdir(model_path)) != 0:
-            log.info(f'Load text model from model path:{model_path}')
-            default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
-            model_type = kwargs.get('model_type', default_model_type)
-            model_class = PT_MODEL_CLASSES_MAPPING.get(model_type, PT_MODEL_CLASSES_MAPPING[default_model_type])
-            token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type])
-            if kwargs.get("mem_consumption"):
-                memory_data_collector.start()
-            start = time.perf_counter()
-            trust_remote_code = False
-            try:
-                model = model_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
-            except Exception:
-                start = time.perf_counter()
-                trust_remote_code = True
-                model = model_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
-            tokenizer = token_class.from_pretrained(model_path, trust_remote_code=trust_remote_code)
-            end = time.perf_counter()
-            from_pretrain_time = end - start
-            if kwargs.get("mem_consumption"):
-                memory_data_collector.stop_and_collect_data('from_pretrained_phase')
-                memory_data_collector.log_data(compilation_phase=True)
-        else:
-            raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty')
-    else:
+    is_gguf_model = model_path.suffix in '.gguf'
+    if not model_path.exists():
         raise RuntimeError(f'==Failure ==: model path:{model_path} is not exist')
+    if not is_gguf_model and not (model_path.is_dir() and len(os.listdir(model_path)) != 0):
+        raise RuntimeError(f'==Failure ==: model path:{model_path} is not directory or directory is empty')
+    if not device:
+        raise RuntimeError('==Failure ==: no device to load')
+
+    log.info(f'Load text model from model path:{model_path}')
+    default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']]
+    model_type = kwargs.get('model_type', default_model_type)
+    model_class = PT_MODEL_CLASSES_MAPPING.get(model_type, PT_MODEL_CLASSES_MAPPING[default_model_type])
+    token_class = TOKENIZE_CLASSES_MAPPING.get(model_type, TOKENIZE_CLASSES_MAPPING[default_model_type])
+    if kwargs.get("mem_consumption"):
+        memory_data_collector.start()
+    start = time.perf_counter()
+    load_model_kwargs = {'trust_remote_code': False}
+    if is_gguf_model:
+        load_model_kwargs |= {'gguf_file': str(model_path)}
+        model_path = model_path.parent
+    try:
+        model = model_class.from_pretrained(model_path, **load_model_kwargs)
+    except Exception:
+        start = time.perf_counter()
+        load_model_kwargs['trust_remote_code'] = True
+        model = model_class.from_pretrained(model_path, **load_model_kwargs)
+    tokenizer = token_class.from_pretrained(model_path, **load_model_kwargs)
+    end = time.perf_counter()
+    from_pretrain_time = end - start
+    if kwargs.get("mem_consumption"):
+        memory_data_collector.stop_and_collect_data('from_pretrained_phase')
+        memory_data_collector.log_data(compilation_phase=True)
 
     log.info(f'model path:{model_path}, from pretrained time: {from_pretrain_time:.2f}s')
 
-    if device is not None:
-        gptjfclm = 'transformers.models.gptj.modeling_gptj.GPTJForCausalLM'
-        lfclm = 'transformers.models.llama.modeling_llama.LlamaForCausalLM'
-        bfclm = 'transformers.models.bloom.modeling_bloom.BloomForCausalLM'
-        gpt2lmhm = 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'
-        gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM'
-        chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration'
-        real_base_model_name = str(type(model)).lower()
-        log.info(f'Real base model={real_base_model_name}')
-        # bfclm will trigger generate crash.
+    gptjfclm = 'transformers.models.gptj.modeling_gptj.GPTJForCausalLM'
+    lfclm = 'transformers.models.llama.modeling_llama.LlamaForCausalLM'
+    bfclm = 'transformers.models.bloom.modeling_bloom.BloomForCausalLM'
+    gpt2lmhm = 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'
+    gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM'
+    chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration'
+    real_base_model_name = str(type(model)).lower()
+    log.info(f'Real base model={real_base_model_name}')
+    # bfclm will trigger generate crash.
 
-        # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
-        if device.upper() == 'GPU':
-            device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable')
-        else:
-            device = torch.device(device.lower())
-        log.info(f'Torch device was set to: {device}')
+    # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch
+    if device.upper() == 'GPU':
+        device = torch.device('cuda') if torch.cuda.is_available() else log.info('CUDA device is unavailable')
+    else:
+        device = torch.device(device.lower())
+    log.info(f'Torch device was set to: {device}')
 
-        if any(x in real_base_model_name for x in [gptjfclm, lfclm, bfclm, gpt2lmhm, gptneoxclm, chatglmfcg]):
-            model = set_bf16(model, device, **kwargs)
-        else:
-            if len(kwargs['config']) > 0 and kwargs['config'].get('PREC_BF16') and kwargs['config']['PREC_BF16'] is True:
-                log.info('Param [bf16/prec_bf16] will not work.')
-            model.to(device)
+    if any(x in real_base_model_name for x in [gptjfclm, lfclm, bfclm, gpt2lmhm, gptneoxclm, chatglmfcg]):
+        model = set_bf16(model, device, **kwargs)
     else:
-        raise RuntimeError('==Failure ==: no device to load')
+        if len(kwargs['config']) > 0 and kwargs['config'].get('PREC_BF16') and kwargs['config']['PREC_BF16'] is True:
+            log.info('Param [bf16/prec_bf16] will not work.')
+        model.to(device)
 
     bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model)
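On the PyTorch path a GGUF file is loaded through transformers' GGUF support: from_pretrained gets the file's parent directory plus a gguf_file argument and dequantizes the weights to torch tensors (the reason for the new gguf>=0.10 requirement). A minimal sketch of the same pattern, with a hypothetical local file and AutoModelForCausalLM standing in for the mapped model class:

    from pathlib import Path
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_path = Path('/models/qwen2-0.5b-instruct-q4_0.gguf')  # hypothetical file
    # transformers reads the GGUF metadata and dequantizes the weights on load
    model = AutoModelForCausalLM.from_pretrained(model_path.parent, gguf_file=str(model_path))
    tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=str(model_path))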

tools/llm_bench/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -18,3 +18,5 @@ librosa # For Whisper
 matplotlib
 jinja2>=3.1.0
 scipy
+gguf_parser
+gguf>=0.10
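The two new dependencies map to the two backends: gguf_parser is what model_utils.py uses to read GGUF metadata for use-case detection, and gguf>=0.10 is needed by transformers to dequantize GGUF weights on the PyTorch path. Both are picked up by the usual install:

    pip install -r tools/llm_bench/requirements.txt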
