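Fix keyword extraction bugs: stray quote in URL regex, case-sensitive stop-word check, narrow CJK range, PageRank division by zero, and fragile LLM response parsing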

Gfreely · Gfreely · commit b7f4136042fc · 2025-08-29T21:22:38.000+08:00
diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
@@ -391,7 +391,7 @@ def __init__(self, llm_config_object):
 输出格式：
 - 仅输出一行内容，以 KEYWORDS: 为前缀，后跟列表项，关键词提取列表项为 关键词：重要性评分，评分建议保留两位小数，同义词提取列表项为对应的同义词，列表项之间用逗号分隔。抽取的关键词中不允许出现空格或空字符
 - 格式示例：
-KEYWORDS:关键词 1：分数 1，关键词 2：分数 2，...,关键词 n：分数 n
+KEYWORDS:关键词_1：分数_1，关键词_2：分数_2，...,关键词_n：分数_n
 
 MAX_KEYWORDS: {max_keywords}
 文本：
@@ -442,4 +442,3 @@ def __init__(self, llm_config_object):
 ## Language Requirement:
 Please generate the prompt in {language} language.
 """
-
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/document_op/textrank_word_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/document_op/textrank_word_extract.py
@@ -37,7 +37,7 @@ def __init__(self, keyword_num: int = 5, window_size: int = 3):
             'chinese': ('n', 'nr', 'ns', 'nt', 'nrt', 'nz', 'v', 'vd', 'vn', "eng", "j", "l"),
             'english': ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBG', 'VBN', 'VBZ')
         }
-        self.rules = [r"'https?://\S+|www\.\S+",
+        self.rules = [r"https?://\S+|www\.\S+",
                       r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
                       r"\b\w+(?:[-’\']\w+)+\b",
                       r"\b\d+[,.]\d+\b"]
@@ -93,10 +93,10 @@ def _multi_preprocess(self, text):
             if word in placeholder_map:
                 words.append(placeholder_map[word])
             else:
-                if len(word) >= 1 and flag in self.pos_filter['english'] and word not in en_stop_words:
+                if len(word) >= 1 and flag in self.pos_filter['english'] and word.lower() not in en_stop_words:
                     # 存在中文字符会重新分词，否则加入分词
                     words.append(word)
-                    if re.compile('[\u4e00-\u9fa5]').search(word):
+                    if re.compile('[\u4e00-\u9fff]').search(word):
                         ch_tokens.append(word)
 
         # 5. 需要进一步的话，中文分词
@@ -136,7 +136,8 @@ def _rank_nodes(self):
             return {}
 
         pagerank_scores = self.graph.pagerank(directed=False, damping=0.85, weights='weight')
-        pagerank_scores = [scores/max(pagerank_scores) for scores in pagerank_scores]
+        if max(pagerank_scores) > 0:
+            pagerank_scores = [scores/max(pagerank_scores) for scores in pagerank_scores]
         node_names = self.graph.vs['name']
         return dict(zip(node_names, pagerank_scores))
 
@@ -153,7 +154,7 @@ def extract_keywords(self, text) -> dict:
         # 3. 构建图，运行 PageRank 算法
         unique_words = list(set(words))
         ranks = dict(zip(unique_words, [0] * len(unique_words)))
-        if len(unique_words) > self.window:
+        if len(unique_words) > 1:
             self._build_graph(words)
             if not self.graph or self.graph.vcount() == 0:
                 return {}
diff --git a/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py b/hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py
@@ -168,9 +168,18 @@ def _extract_keywords_from_response(
         for match in matches:
             match = match.strip()
             for k in re.split(r"[,，]+", match):
-                lis = re.split(r"[:：]", k.strip())
-                word, score = lis[0].strip(), float(lis[1].strip())
-                if len(word) > 1:
-                    results[word.lower() if lowercase else word] = score
-
+                item = k.strip()
+                if not item:
+                    continue
+                parts = re.split(r"[:：]", item, maxsplit=1)
+                if len(parts) != 2:
+                    continue
+                word_raw, score_raw = parts[0].strip(), parts[1].strip()
+                if len(word_raw) > 0:
+                    try:
+                        score_val = float(score_raw)
+                    except ValueError:
+                        continue
+                    word_out = word_raw.lower() if lowercase else word_raw
+                    results[word_out] = score_val
         return results