Skip to content

Commit b7f4136

Browse files
committed
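fix bug: remove stray quote in URL regex, match stop words case-insensitively, guard PageRank normalization against a zero maximum, and skip malformed keyword:score items when parsing the LLM response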
1 parent 3c3f7bb commit b7f4136

File tree

3 files changed

+21
-12
lines changed

3 files changed

+21
-12
lines changed

hugegraph-llm/src/hugegraph_llm/config/prompt_config.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ def __init__(self, llm_config_object):
391391
输出格式:
392392
- 仅输出一行内容,以 KEYWORDS: 为前缀,后跟列表项,关键词提取列表项为 关键词:重要性评分,评分建议保留两位小数,同义词提取列表项为对应的同义词,列表项之间用逗号分隔。抽取的关键词中不允许出现空格或空字符
393393
- 格式示例:
394-
KEYWORDS:关键词 1:分数 1,关键词 2:分数 2,...,关键词 n:分数 n
394+
KEYWORDS:关键词_1:分数_1,关键词_2:分数_2,...,关键词_n:分数_n
395395
396396
MAX_KEYWORDS: {max_keywords}
397397
文本:
@@ -442,4 +442,3 @@ def __init__(self, llm_config_object):
442442
## Language Requirement:
443443
Please generate the prompt in {language} language.
444444
"""
445-

hugegraph-llm/src/hugegraph_llm/operators/document_op/textrank_word_extract.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(self, keyword_num: int = 5, window_size: int = 3):
3737
'chinese': ('n', 'nr', 'ns', 'nt', 'nrt', 'nz', 'v', 'vd', 'vn', "eng", "j", "l"),
3838
'english': ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBG', 'VBN', 'VBZ')
3939
}
40-
self.rules = [r"'https?://\S+|www\.\S+",
40+
self.rules = [r"https?://\S+|www\.\S+",
4141
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
4242
r"\b\w+(?:[-’\']\w+)+\b",
4343
r"\b\d+[,.]\d+\b"]
@@ -93,10 +93,10 @@ def _multi_preprocess(self, text):
9393
if word in placeholder_map:
9494
words.append(placeholder_map[word])
9595
else:
96-
if len(word) >= 1 and flag in self.pos_filter['english'] and word not in en_stop_words:
96+
if len(word) >= 1 and flag in self.pos_filter['english'] and word.lower() not in en_stop_words:
9797
# 存在中文字符会重新分词,否则加入分词
9898
words.append(word)
99-
if re.compile('[\u4e00-\u9fa5]').search(word):
99+
if re.compile('[\u4e00-\u9fff]').search(word):
100100
ch_tokens.append(word)
101101

102102
# 5. 需要进一步的话,中文分词
@@ -136,7 +136,8 @@ def _rank_nodes(self):
136136
return {}
137137

138138
pagerank_scores = self.graph.pagerank(directed=False, damping=0.85, weights='weight')
139-
pagerank_scores = [scores/max(pagerank_scores) for scores in pagerank_scores]
139+
if max(pagerank_scores) > 0:
140+
pagerank_scores = [scores/max(pagerank_scores) for scores in pagerank_scores]
140141
node_names = self.graph.vs['name']
141142
return dict(zip(node_names, pagerank_scores))
142143

@@ -153,7 +154,7 @@ def extract_keywords(self, text) -> dict:
153154
# 3. 构建图,运行 PageRank 算法
154155
unique_words = list(set(words))
155156
ranks = dict(zip(unique_words, [0] * len(unique_words)))
156-
if len(unique_words) > self.window:
157+
if len(unique_words) > 1:
157158
self._build_graph(words)
158159
if not self.graph or self.graph.vcount() == 0:
159160
return {}

hugegraph-llm/src/hugegraph_llm/operators/llm_op/keyword_extract.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,18 @@ def _extract_keywords_from_response(
168168
for match in matches:
169169
match = match.strip()
170170
for k in re.split(r"[,,]+", match):
171-
lis = re.split(r"[::]", k.strip())
172-
word, score = lis[0].strip(), float(lis[1].strip())
173-
if len(word) > 1:
174-
results[word.lower() if lowercase else word] = score
175-
171+
item = k.strip()
172+
if not item:
173+
continue
174+
parts = re.split(r"[::]", item, maxsplit=1)
175+
if len(parts) != 2:
176+
continue
177+
word_raw, score_raw = parts[0].strip(), parts[1].strip()
178+
if len(word_raw) > 0:
179+
try:
180+
score_val = float(score_raw)
181+
except ValueError:
182+
continue
183+
word_out = word_raw.lower() if lowercase else word_raw
184+
results[word_out] = score_val
176185
return results

0 commit comments

Comments
 (0)