106 changes: 92 additions & 14 deletions src/cpp/src/gguf_utils/building_blocks.cpp
@@ -255,12 +255,94 @@ std::pair<Output<ov::Node>, Output<ov::Node>> rope_emb(
};
}

// Builds an RMSNorm subgraph: y = x / sqrt(mean(x^2, axis=-1) + eps) * weight.
// The trailing multiply by `weight` is skipped when the weights are all ones.
ov::Output<ov::Node> make_rms_norm_qwen3(
    const std::string& key,
    const ov::Output<ov::Node>& input,
    const std::unordered_map<std::string, ov::Tensor>& weights,
    float rms_norm_eps) {
    auto eps_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, 1}, rms_norm_eps);
    auto square = std::make_shared<ov::op::v1::Power>(
        input,
        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, 2.0f));

    // Mean of squares over the last (head_dim) axis, keeping the reduced dim for broadcasting.
    auto variance = std::make_shared<ov::op::v1::ReduceMean>(
        square,
        std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{1}, -1),
        true);

    auto add_eps = std::make_shared<ov::op::v1::Add>(variance, eps_node);
    auto sqrt_node = std::make_shared<ov::op::v0::Sqrt>(add_eps);
    auto reciprocal = std::make_shared<ov::op::v1::Divide>(
        std::make_shared<ov::op::v0::Constant>(
            ov::element::f32, ov::Shape{}, 1.0f),
        sqrt_node);

    std::shared_ptr<ov::Node> mul = std::make_shared<ov::op::v1::Multiply>(
        reciprocal, input, AutoBroadcastType::NUMPY);

    auto weight_tensor = weights.at(key + ".weight");
    // Check whether every weight element is 1.0; if so, the multiply below is a no-op and is skipped.
bool all_ones = true;
if (weight_tensor.get_element_type() == ov::element::f32) {
const float* data = weight_tensor.data<float>();
for (size_t i = 0; i < weight_tensor.get_size(); ++i) {
if (data[i] != 1.0f) {
all_ones = false;
break;
}
}
} else if (weight_tensor.get_element_type() == ov::element::f16) {
const uint16_t* data = weight_tensor.data<uint16_t>();
        const uint16_t one_in_fp16 = 0x3C00;  // IEEE-754 binary16 encoding of 1.0f
for (size_t i = 0; i < weight_tensor.get_size(); ++i) {
if (data[i] != one_in_fp16) {
all_ones = false;
break;
}
}
} else {
OPENVINO_THROW("Unsupported weight type ", weight_tensor.get_element_type());
}

    if (!all_ones) {
        // Reshape the 1-D weight so it broadcasts over [batch, seq, num_h, head_dim].
        weight_tensor.set_shape(ov::Shape{1, 1, 1, weight_tensor.get_shape()[0]});
auto weights_const = std::make_shared<ov::op::v0::Constant>(weight_tensor);
auto weights_f32 = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
mul = std::make_shared<ov::op::v1::Multiply>(mul, weights_f32, AutoBroadcastType::NUMPY);
}

return mul;
}
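For reference, the subgraph above computes standard RMSNorm: y = x * rsqrt(mean(x^2, axis=-1) + eps) * w. A minimal scalar sketch of the same arithmetic, assuming plain f32 rows (the helper below is hypothetical, for illustration only, and not part of this change):

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row of length head_dim:
// y[i] = x[i] * (1 / sqrt(mean(x^2) + eps)) * w[i]
std::vector<float> rms_norm_ref(const std::vector<float>& x,
                                const std::vector<float>& w,
                                float eps) {
    float mean_sq = 0.0f;
    for (float v : x)
        mean_sq += v * v;
    mean_sq /= static_cast<float>(x.size());
    const float scale = 1.0f / std::sqrt(mean_sq + eps);
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] * scale * w[i];  // the w multiply is the part skipped when w is all ones
    return y;
}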

// Helper function to split heads.
// Qwen3 adds q_norm/k_norm: if `key + ".weight"` is present in `weights`, an RMS
// norm is inserted between the reshape and the transpose; otherwise the plain
// reshape/transpose path is taken.
std::shared_ptr<v1::Transpose> split_heads(const Output<Node>& x,
int num_h,
int head_dim,
float rms_norm_eps,
const std::string& key,
const std::unordered_map<std::string, ov::Tensor>& weights) {
auto shape = std::make_shared<v0::Constant>(element::i64, Shape{4}, std::vector<int64_t>{0, 0, num_h, head_dim});
auto reshaped = std::make_shared<v1::Reshape>(x, shape, true);
    auto transpose_order = std::make_shared<v0::Constant>(element::i32, Shape{4}, std::vector<int32_t>{0, 2, 1, 3});
    if (weights.count(key + ".weight")) {  // Qwen3 rms_norm
        auto mul = make_rms_norm_qwen3(key, reshaped, weights, rms_norm_eps);
        return std::make_shared<v1::Transpose>(mul, transpose_order);
    }
    // non-Qwen3 architectures
    return std::make_shared<v1::Transpose>(reshaped, transpose_order);
}
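The Reshape/Transpose pair maps [batch, seq, num_h * head_dim] to [batch, num_h, seq, head_dim]; the special-zero Reshape (third argument true) copies batch and seq through from the input. An index-level sketch of the same permutation on a flat buffer (hypothetical helper, illustration only):

#include <cstddef>
#include <vector>

// Copies x viewed as [batch, seq, num_h, head_dim] into y laid out as
// [batch, num_h, seq, head_dim] -- the {0, 2, 1, 3} permutation above.
std::vector<float> split_heads_ref(const std::vector<float>& x,
                                   size_t batch, size_t seq,
                                   size_t num_h, size_t head_dim) {
    std::vector<float> y(x.size());
    for (size_t b = 0; b < batch; ++b)
        for (size_t s = 0; s < seq; ++s)
            for (size_t h = 0; h < num_h; ++h)
                for (size_t d = 0; d < head_dim; ++d)
                    y[((b * num_h + h) * seq + s) * head_dim + d] =
                        x[((b * seq + s) * num_h + h) * head_dim + d];
    return y;
}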

std::tuple<Output<Node>, ov::SinkVector, std::pair<Output<Node>, Output<Node>>, Output<Node>>
multi_head_attention(
const Output<Node>& query,
const Output<Node>& key,
const Output<Node>& value,
const std::string& key_name,
const std::unordered_map<std::string, ov::Tensor>& consts,
const std::map<std::string, GGUFMetaData>& configs,
const Output<Node>& batch_dim,
int layer_idx,
@@ -276,19 +358,13 @@
int num_heads = std::get<int>(configs.at("head_num"));
int head_dim = std::get<int>(configs.at("head_size"));
int num_heads_kv = std::get<int>(configs.at("head_num_kv"));

// Helper function to split heads
auto split_heads = [&](const Output<Node>& x, int num_h) {
auto shape = std::make_shared<v0::Constant>(element::i64, Shape{4}, std::vector<int64_t>{0, 0, num_h, head_dim});
auto reshaped = std::make_shared<v1::Reshape>(x, shape, true);
auto transpose_order = std::make_shared<v0::Constant>(element::i32, Shape{4}, std::vector<int32_t>{0, 2, 1, 3});
return std::make_shared<v1::Transpose>(reshaped, transpose_order);
};

float rms_norm_eps = std::get<float>(configs.at("rms_norm_eps"));

// 1. Split heads
auto q_split = split_heads(query, num_heads);
auto k_split = split_heads(key, num_heads_kv);
auto v_split = split_heads(value, num_heads_kv);
    // Qwen3 carries q_norm/k_norm weights: when key_name + ".self_attn.q_norm.weight"
    // (or k_norm) exists in consts, split_heads builds an RMS norm; no v_norm weight
    // exists, so the value path stays a plain reshape/transpose.
auto q_split = split_heads(query, num_heads, head_dim, rms_norm_eps, key_name + ".self_attn.q_norm", consts);
auto k_split = split_heads(key, num_heads_kv, head_dim, rms_norm_eps, key_name + ".self_attn.k_norm", consts);
auto v_split = split_heads(value, num_heads_kv, head_dim, rms_norm_eps, key_name + ".self_attn.v_norm", consts);

// 2. Apply rotary embeddings
Output<Node> cos, sin;
@@ -830,7 +906,7 @@ std::tuple<ov::Output<ov::Node>,
qtypes.at(layer_prefix + ".self_attn.q_proj.qtype"),
reorder,
std::get<int>(configs.at("head_size")));

auto k = make_fc(
layer_prefix + ".self_attn.k_proj",
input_layernorm,
@@ -863,6 +939,8 @@ std::tuple<ov::Output<ov::Node>,
// Multi-head attention
auto [attn_output, sinks, new_cos_sin, new_causal_mask] = multi_head_attention(
q, k, v,
layer_prefix,
consts,
configs,
batch_dim,
layer_idx,
14 changes: 12 additions & 2 deletions src/cpp/src/gguf_utils/gguf.cpp
@@ -378,8 +378,10 @@ std::map<std::string, GGUFMetaData> config_from_meta(const std::unordered_map<st
config["architecture"] = arch;
config["layer_num"] = metadata_to_int(metadata, arch + ".block_count");
config["head_num"] = metadata_to_int(metadata, arch + ".attention.head_count");
config["head_size"] = metadata_to_int(metadata, arch + ".embedding_length") /
metadata_to_int(metadata, arch + ".attention.head_count");
config["head_size"] = metadata.count(arch + ".attention.key_length") ?
metadata_to_int(metadata, arch + ".attention.key_length") :
(metadata_to_int(metadata, arch + ".embedding_length") /
metadata_to_int(metadata, arch + ".attention.head_count"));
config["head_num_kv"] = metadata.count(arch + ".attention.head_count_kv") ?
metadata_to_int(metadata, arch + ".attention.head_count_kv") :
metadata_to_int(metadata, arch + ".attention.head_count");
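This fallback matters for Qwen3, where the head size is published explicitly as attention.key_length and no longer equals embedding_length / head_count. A self-contained sketch of the selection logic (the metadata values below are assumptions chosen for illustration, not read from a real GGUF file):

#include <cstdio>
#include <map>
#include <string>

int main() {
    // Assumed Qwen3-style metadata: the explicit head size differs from
    // embedding_length / head_count.
    const std::map<std::string, int> metadata = {
        {"qwen3.embedding_length", 1024},
        {"qwen3.attention.head_count", 16},
        {"qwen3.attention.key_length", 128},
    };
    const std::string arch = "qwen3";
    const std::string key_length = arch + ".attention.key_length";
    const int head_size = metadata.count(key_length)
        ? metadata.at(key_length)
        : metadata.at(arch + ".embedding_length") / metadata.at(arch + ".attention.head_count");
    std::printf("head_size = %d\n", head_size);  // 128 here; the plain quotient would give 64
    return 0;
}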
@@ -440,6 +442,14 @@ std::unordered_map<std::string, ov::Tensor> consts_from_weights(
consts[format("model.layers[%d].self_attn.o_proj.bias", i)] = weights.at(format("blk.%d.attn_output.bias", i));
}

        // Qwen3 q_norm/k_norm weights (absent in other architectures)
if (weights.count(format("blk.%d.attn_k_norm.weight", i))) {
consts[format("model.layers[%d].self_attn.k_norm.weight", i)] = weights.at(format("blk.%d.attn_k_norm.weight", i));
}
if (weights.count(format("blk.%d.attn_q_norm.weight", i))) {
consts[format("model.layers[%d].self_attn.q_norm.weight", i)] = weights.at(format("blk.%d.attn_q_norm.weight", i));
}

// MLP weights
consts[format("model.layers[%d].mlp.gate_proj.weight", i)] = weights.at(format("blk.%d.ffn_gate.weight", i));
if (weights.count(format("blk.%d.ffn_gate.bias", i))) {
2 changes: 1 addition & 1 deletion src/cpp/src/gguf_utils/gguf_modeling.cpp
@@ -163,7 +163,7 @@ std::shared_ptr<ov::Model> create_from_gguf(const std::string& model_path) {
std::shared_ptr<ov::Model> model;

const std::string model_arch = std::get<std::string>(config.at("architecture"));
if (!model_arch.compare("llama") || !model_arch.compare("qwen2")) {
if (!model_arch.compare("llama") || !model_arch.compare("qwen2") || !model_arch.compare("qwen3")) {
model = create_language_model(config, consts, qtypes);
} else {
OPENVINO_THROW("Unsupported model architecture '", model_arch, "'");
24 changes: 21 additions & 3 deletions src/cpp/src/gguf_utils/gguf_tokenizer.cpp
@@ -576,11 +576,29 @@ std::string patch_gguf_chat_template(const std::string& chat_template) {
const std::string qwen2_5_replacement_substring =
R"({\"name\": <function-name>, \"arguments\": <args-json-object>})";
// Find the position of the substring to be replaced
size_t pos = patched_chat_template.find(qwen2_5_substring_to_find);
if (pos != std::string::npos) {
size_t pos_qwen2_5 = patched_chat_template.find(qwen2_5_substring_to_find);
if (pos_qwen2_5 != std::string::npos) {
// Substring found, perform the replacement
patched_chat_template.replace(pos, qwen2_5_substring_to_find.length(), qwen2_5_replacement_substring);
patched_chat_template.replace(pos_qwen2_5, qwen2_5_substring_to_find.length(), qwen2_5_replacement_substring);
}

const std::string qwen3_substring_to_find_0 = R"({%- for index in range(ns.last_query_index, -1, -1) %})";
const std::string qwen3_substring_to_find_1 = R"({%- set message = messages[index] %})";
const std::string qwen3_substring_to_find_2 = R"({%- if ns.multi_step_tool and message.role == "user" and not('<tool_response>' in message.content and '</tool_response>' in message.content) %})";

const std::string qwen3_replacement_substring_0 = R"({%- for message in messages[::-1] %})";
const std::string qwen3_replacement_substring_1 = R"({%- set index = (messages|length - 1) - loop.index0 %})";
const std::string qwen3_replacement_substring_2 = R"({%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %})";

const std::string qwen3_substring_to_find = qwen3_substring_to_find_0 + "\n" + " " + qwen3_substring_to_find_1 + "\n" + " " + qwen3_substring_to_find_2;
const std::string qwen3_replacement_substring = qwen3_replacement_substring_0 + "\n" + " " + qwen3_replacement_substring_1 + "\n" + " " + qwen3_replacement_substring_2;
size_t pos_qwen3 = patched_chat_template.find(qwen3_substring_to_find);

if (pos_qwen3 != std::string::npos) {
// Substring found, perform the replacement
patched_chat_template.replace(pos_qwen3, qwen3_substring_to_find.length(), qwen3_replacement_substring);
}

return patched_chat_template;
}
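Both loop forms visit the messages in the same reverse order with the same index: the patched variant expresses the traversal as a messages[::-1] slice with an index derived from loop.index0, and replaces the substring `in` tests with startswith/endswith. The assumption here is that these are the constructs the chat-template engine bundled with GenAI supports.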

24 changes: 24 additions & 0 deletions tests/python_tests/test_llm_pipeline.py
@@ -879,3 +879,27 @@ def test_full_gguf_pipeline(pipeline_type, model_ids):
res_string_input_2 = ov_pipe_gguf.generate(prompt, generation_config=ov_generation_config)

assert res_string_input_1 == res_string_input_2


@pytest.mark.parametrize("pipeline_type", get_gguf_pipeline_types())
@pytest.mark.parametrize("model_ids", [{"gguf_model_id": "Qwen/Qwen3-0.6B-GGUF", "gguf_filename": "Qwen3-0.6B-Q8_0.gguf"}])
@pytest.mark.precommit
def test_full_gguf_qwen3_pipeline(pipeline_type, model_ids):
gguf_model_id = model_ids["gguf_model_id"]
gguf_filename = model_ids["gguf_filename"]
prompt = 'Why is the Sun yellow?'

ov_generation_config = ov_genai.GenerationConfig()
ov_generation_config.max_new_tokens = 30
ov_generation_config.apply_chat_template = False
    ov_generation_config.set_eos_token_id(151645)

res_string_input_1 = "<|im_end|>\nOkay, the user is asking why the Sun is yellow. Let me think about this. First, I need to recall"

gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename)
ov_pipe_gguf = create_ov_pipeline(gguf_full_path, pipeline_type=pipeline_type)
res_string_input_2 = ov_pipe_gguf.generate(prompt, generation_config=ov_generation_config)

assert res_string_input_1 == res_string_input_2