106 changes: 92 additions & 14 deletions src/cpp/src/gguf_utils/building_blocks.cpp
@@ -255,12 +255,94 @@ std::pair<Output<ov::Node>, Output<ov::Node>> rope_emb(
};
}

// Builds an RMSNorm subgraph: y = x / sqrt(mean(x^2, axis=-1) + eps) * weight.
// The trailing multiply by `weight` is skipped when the weights are all ones.
ov::Output<ov::Node> make_rms_norm_qwen3(
    const std::string& key,
    const ov::Output<ov::Node>& input,
    const std::unordered_map<std::string, ov::Tensor>& weights,
    float rms_norm_eps) {
    auto eps_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, 1}, rms_norm_eps);
    auto square = std::make_shared<ov::op::v1::Power>(
        input,
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, 2.0f));

    // Mean of squares over the last (head_dim) axis, keeping the reduced dim for broadcasting.
    auto variance = std::make_shared<ov::op::v1::ReduceMean>(
        square,
        std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, -1),
        true);

    auto add_eps = std::make_shared<ov::op::v1::Add>(variance, eps_node);
    auto sqrt_node = std::make_shared<ov::op::v0::Sqrt>(add_eps);
    auto reciprocal = std::make_shared<ov::op::v1::Divide>(
        std::make_shared<ov::op::v0::Constant>(
            ov::element::f32, ov::Shape{}, 1.0f),
        sqrt_node);

    std::shared_ptr<ov::Node> mul = std::make_shared<ov::op::v1::Multiply>(
        reciprocal, input, AutoBroadcastType::NUMPY);

    auto weight_tensor = weights.at(key + ".weight");
    // Check whether every weight element is 1.0; if so, the multiply below is a no-op and is skipped.
bool all_ones = true;
if (weight_tensor.get_element_type() == ov::element::f32) {
const float* data = weight_tensor.data<float>();
for (size_t i = 0; i < weight_tensor.get_size(); ++i) {
if (data[i] != 1.0f) {
all_ones = false;
break;
}
}
} else if (weight_tensor.get_element_type() == ov::element::f16) {
const uint16_t* data = weight_tensor.data<uint16_t>();
        const uint16_t one_in_fp16 = 0x3C00;  // IEEE-754 binary16 encoding of 1.0f
for (size_t i = 0; i < weight_tensor.get_size(); ++i) {
if (data[i] != one_in_fp16) {
all_ones = false;
break;
}
}
} else {
OPENVINO_THROW("Unsupported weight type ", weight_tensor.get_element_type());
}

    if (!all_ones) {
        // Reshape the 1-D weight so it broadcasts over [batch, seq, num_h, head_dim].
        weight_tensor.set_shape(ov::Shape{1, 1, 1, weight_tensor.get_shape()[0]});
auto weights_const = std::make_shared<ov::op::v0::Constant>(weight_tensor);
auto weights_f32 = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
mul = std::make_shared<ov::op::v1::Multiply>(mul, weights_f32, AutoBroadcastType::NUMPY);
}

return mul;
}
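For reference, the subgraph above computes standard RMSNorm: y = x * rsqrt(mean(x^2, axis=-1) + eps) * w. A minimal scalar sketch of the same arithmetic, assuming plain f32 rows (the helper below is hypothetical, for illustration only, and not part of this change):

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row of length head_dim:
// y[i] = x[i] * (1 / sqrt(mean(x^2) + eps)) * w[i]
std::vector<float> rms_norm_ref(const std::vector<float>& x,
                                const std::vector<float>& w,
                                float eps) {
    float mean_sq = 0.0f;
    for (float v : x)
        mean_sq += v * v;
    mean_sq /= static_cast<float>(x.size());
    const float scale = 1.0f / std::sqrt(mean_sq + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] * scale * w[i];  // the w multiply is the part skipped when w is all ones
    return y;
}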

// Helper function to split heads.
// Qwen3 adds q_norm/k_norm: if `key + ".weight"` is present in `weights`, an RMS
// norm is inserted between the reshape and the transpose; otherwise the plain
// reshape/transpose path is taken.
std::shared_ptr<v1::Transpose> split_heads(const Output<Node>& x,
int num_h,
int head_dim,
float rms_norm_eps,
const std::string& key,
const std::unordered_map<std::string, ov::Tensor>& weights) {
auto shape = std::make_shared<v0::Constant>(element::i64, Shape{4}, std::vector<int64_t>{0, 0, num_h, head_dim});
auto reshaped = std::make_shared<v1::Reshape>(x, shape, true);
    auto transpose_order = std::make_shared<v0::Constant>(element::i32, Shape{4}, std::vector<int32_t>{0, 2, 1, 3});
    if (weights.count(key + ".weight")) {  // Qwen3 rms_norm
        auto mul = make_rms_norm_qwen3(key, reshaped, weights, rms_norm_eps);
        return std::make_shared<v1::Transpose>(mul, transpose_order);
    }
    // non-Qwen3 architectures
    return std::make_shared<v1::Transpose>(reshaped, transpose_order);
}
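The Reshape/Transpose pair maps [batch, seq, num_h * head_dim] to [batch, num_h, seq, head_dim]; the special-zero Reshape (third argument true) copies batch and seq through from the input. An index-level sketch of the same permutation on a flat buffer (hypothetical helper, illustration only):

#include <cstddef>
#include <vector>

// Copies x viewed as [batch, seq, num_h, head_dim] into y laid out as
// [batch, num_h, seq, head_dim] -- the {0, 2, 1, 3} permutation above.
std::vector<float> split_heads_ref(const std::vector<float>& x,
                                   size_t batch, size_t seq,
                                   size_t num_h, size_t head_dim) {
    std::vector<float> y(x.size());
    for (size_t b = 0; b < batch; ++b)
        for (size_t s = 0; s < seq; ++s)
            for (size_t h = 0; h < num_h; ++h)
                for (size_t d = 0; d < head_dim; ++d)
                    y[((b * num_h + h) * seq + s) * head_dim + d] =
                        x[((b * seq + s) * num_h + h) * head_dim + d];
    return y;
}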

std::tuple<Output<Node>, ov::SinkVector, std::pair<Output<Node>, Output<Node>>, Output<Node>>
multi_head_attention(
const Output<Node>& query,
const Output<Node>& key,
const Output<Node>& value,
const std::string& key_name,
const std::unordered_map<std::string, ov::Tensor>& consts,
const std::map<std::string, GGUFMetaData>& configs,
const Output<Node>& batch_dim,
int layer_idx,
@@ -276,19 +358,13 @@
int num_heads = std::get<int>(configs.at("head_num"));
int head_dim = std::get<int>(configs.at("head_size"));
int num_heads_kv = std::get<int>(configs.at("head_num_kv"));

// Helper function to split heads
auto split_heads = [&](const Output<Node>& x, int num_h) {
auto shape = std::make_shared<v0::Constant>(element::i64, Shape{4}, std::vector<int64_t>{0, 0, num_h, head_dim});
auto reshaped = std::make_shared<v1::Reshape>(x, shape, true);
auto transpose_order = std::make_shared<v0::Constant>(element::i32, Shape{4}, std::vector<int32_t>{0, 2, 1, 3});
return std::make_shared<v1::Transpose>(reshaped, transpose_order);
};

float rms_norm_eps = std::get<float>(configs.at("rms_norm_eps"));

// 1. Split heads
auto q_split = split_heads(query, num_heads);
auto k_split = split_heads(key, num_heads_kv);
auto v_split = split_heads(value, num_heads_kv);
    // Qwen3 carries q_norm/k_norm weights: when key_name + ".self_attn.q_norm.weight"
    // (or k_norm) exists in consts, split_heads builds an RMS norm; no v_norm weight
    // exists, so the value path stays a plain reshape/transpose.
auto q_split = split_heads(query, num_heads, head_dim, rms_norm_eps, key_name + ".self_attn.q_norm", consts);
auto k_split = split_heads(key, num_heads_kv, head_dim, rms_norm_eps, key_name + ".self_attn.k_norm", consts);
auto v_split = split_heads(value, num_heads_kv, head_dim, rms_norm_eps, key_name + ".self_attn.v_norm", consts);

// 2. Apply rotary embeddings
Output<Node> cos, sin;
@@ -830,7 +906,7 @@ std::tuple<ov::Output<ov::Node>,
qtypes.at(layer_prefix + ".self_attn.q_proj.qtype"),
reorder,
std::get<int>(configs.at("head_size")));

auto k = make_fc(
layer_prefix + ".self_attn.k_proj",
input_layernorm,
@@ -863,6 +939,8 @@ std::tuple<ov::Output<ov::Node>,
// Multi-head attention
auto [attn_output, sinks, new_cos_sin, new_causal_mask] = multi_head_attention(
q, k, v,
layer_prefix,
consts,
configs,
batch_dim,
layer_idx,
14 changes: 12 additions & 2 deletions src/cpp/src/gguf_utils/gguf.cpp
@@ -378,8 +378,10 @@ std::map<std::string, GGUFMetaData> config_from_meta(const std::unordered_map<st
config["architecture"] = arch;
config["layer_num"] = metadata_to_int(metadata, arch + ".block_count");
config["head_num"] = metadata_to_int(metadata, arch + ".attention.head_count");
config["head_size"] = metadata_to_int(metadata, arch + ".embedding_length") /
metadata_to_int(metadata, arch + ".attention.head_count");
config["head_size"] = metadata.count(arch + ".attention.key_length") ?
metadata_to_int(metadata, arch + ".attention.key_length") :
(metadata_to_int(metadata, arch + ".embedding_length") /
metadata_to_int(metadata, arch + ".attention.head_count"));
config["head_num_kv"] = metadata.count(arch + ".attention.head_count_kv") ?
metadata_to_int(metadata, arch + ".attention.head_count_kv") :
metadata_to_int(metadata, arch + ".attention.head_count");
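This fallback matters for Qwen3, where the head size is published explicitly as attention.key_length and no longer equals embedding_length / head_count. A self-contained sketch of the selection logic (the metadata values below are assumptions chosen for illustration, not read from a real GGUF file):

#include <cstdio>
#include <map>
#include <string>

int main() {
    // Assumed Qwen3-style metadata: the explicit head size differs from
    // embedding_length / head_count.
    const std::map<std::string, int> metadata = {
        {"qwen3.embedding_length", 1024},
        {"qwen3.attention.head_count", 16},
        {"qwen3.attention.key_length", 128},
    };
    const std::string arch = "qwen3";
    const std::string key_length = arch + ".attention.key_length";
    const int head_size = metadata.count(key_length)
        ? metadata.at(key_length)
        : metadata.at(arch + ".embedding_length") / metadata.at(arch + ".attention.head_count");
    std::printf("head_size = %d\n", head_size);  // 128 here; the plain quotient would give 64
    return 0;
}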
@@ -440,6 +442,14 @@ std::unordered_map<std::string, ov::Tensor> consts_from_weights(
consts[format("model.layers[%d].self_attn.o_proj.bias", i)] = weights.at(format("blk.%d.attn_output.bias", i));
}

        // Qwen3 q_norm/k_norm weights (absent in other architectures)
if (weights.count(format("blk.%d.attn_k_norm.weight", i))) {
consts[format("model.layers[%d].self_attn.k_norm.weight", i)] = weights.at(format("blk.%d.attn_k_norm.weight", i));
}
if (weights.count(format("blk.%d.attn_q_norm.weight", i))) {
consts[format("model.layers[%d].self_attn.q_norm.weight", i)] = weights.at(format("blk.%d.attn_q_norm.weight", i));
}

// MLP weights
consts[format("model.layers[%d].mlp.gate_proj.weight", i)] = weights.at(format("blk.%d.ffn_gate.weight", i));
if (weights.count(format("blk.%d.ffn_gate.bias", i))) {
2 changes: 1 addition & 1 deletion src/cpp/src/gguf_utils/gguf_modeling.cpp
@@ -163,7 +163,7 @@ std::shared_ptr<ov::Model> create_from_gguf(const std::string& model_path) {
std::shared_ptr<ov::Model> model;

const std::string model_arch = std::get<std::string>(config.at("architecture"));
if (!model_arch.compare("llama") || !model_arch.compare("qwen2")) {
if (!model_arch.compare("llama") || !model_arch.compare("qwen2") || !model_arch.compare("qwen3")) {
model = create_language_model(config, consts, qtypes);
} else {
OPENVINO_THROW("Unsupported model architecture '", model_arch, "'");
24 changes: 21 additions & 3 deletions src/cpp/src/gguf_utils/gguf_tokenizer.cpp
@@ -576,11 +576,29 @@ std::string patch_gguf_chat_template(const std::string& chat_template) {
const std::string qwen2_5_replacement_substring =
R"({\"name\": <function-name>, \"arguments\": <args-json-object>})";
// Find the position of the substring to be replaced
size_t pos = patched_chat_template.find(qwen2_5_substring_to_find);
if (pos != std::string::npos) {
size_t pos_qwen2_5 = patched_chat_template.find(qwen2_5_substring_to_find);
if (pos_qwen2_5 != std::string::npos) {
// Substring found, perform the replacement
patched_chat_template.replace(pos, qwen2_5_substring_to_find.length(), qwen2_5_replacement_substring);
patched_chat_template.replace(pos_qwen2_5, qwen2_5_substring_to_find.length(), qwen2_5_replacement_substring);
}

const std::string qwen3_substring_to_find_0 = R"({%- for index in range(ns.last_query_index, -1, -1) %})";
const std::string qwen3_substring_to_find_1 = R"({%- set message = messages[index] %})";
const std::string qwen3_substring_to_find_2 = R"({%- if ns.multi_step_tool and message.role == "user" and not('<tool_response>' in message.content and '</tool_response>' in message.content) %})";

const std::string qwen3_replacement_substring_0 = R"({%- for message in messages[::-1] %})";
const std::string qwen3_replacement_substring_1 = R"({%- set index = (messages|length - 1) - loop.index0 %})";
const std::string qwen3_replacement_substring_2 = R"({%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %})";

const std::string qwen3_substring_to_find = qwen3_substring_to_find_0 + "\n" + " " + qwen3_substring_to_find_1 + "\n" + " " + qwen3_substring_to_find_2;
const std::string qwen3_replacement_substring = qwen3_replacement_substring_0 + "\n" + " " + qwen3_replacement_substring_1 + "\n" + " " + qwen3_replacement_substring_2;
size_t pos_qwen3 = patched_chat_template.find(qwen3_substring_to_find);

if (pos_qwen3 != std::string::npos) {
// Substring found, perform the replacement
patched_chat_template.replace(pos_qwen3, qwen3_substring_to_find.length(), qwen3_replacement_substring);
}

return patched_chat_template;
}
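Both loop forms visit the messages in the same reverse order with the same index: the patched variant expresses the traversal as a messages[::-1] slice with an index derived from loop.index0, and replaces the substring `in` tests with startswith/endswith. The assumption here is that these are the constructs the chat-template engine bundled with GenAI supports.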

24 changes: 24 additions & 0 deletions tests/python_tests/test_llm_pipeline.py
@@ -879,3 +879,27 @@ def test_full_gguf_pipeline(pipeline_type, model_ids):
res_string_input_2 = ov_pipe_gguf.generate(prompt, generation_config=ov_generation_config)

assert res_string_input_1 == res_string_input_2


@pytest.mark.parametrize("pipeline_type", get_gguf_pipeline_types())
@pytest.mark.parametrize("model_ids", [{"gguf_model_id": "Qwen/Qwen3-0.6B-GGUF", "gguf_filename": "Qwen3-0.6B-Q8_0.gguf"}])
@pytest.mark.precommit
def test_full_gguf_qwen3_pipeline(pipeline_type, model_ids):
gguf_model_id = model_ids["gguf_model_id"]
gguf_filename = model_ids["gguf_filename"]
prompt = 'Why is the Sun yellow?'

ov_generation_config = ov_genai.GenerationConfig()
ov_generation_config.max_new_tokens = 30
ov_generation_config.apply_chat_template = False
    ov_generation_config.set_eos_token_id(151645)

res_string_input_1 = "<|im_end|>\nOkay, the user is asking why the Sun is yellow. Let me think about this. First, I need to recall"

gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename)
ov_pipe_gguf = create_ov_pipeline(gguf_full_path, pipeline_type=pipeline_type)
res_string_input_2 = ov_pipe_gguf.generate(prompt, generation_config=ov_generation_config)

assert res_string_input_1 == res_string_input_2