Skip to content
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
11c211d
TextRank-fix
Gfreely Jun 27, 2025
4e3fa9b
feat(llm):TextRank fix
Gfreely Jun 30, 2025
a8313df
fix
Gfreely Jun 30, 2025
a4180ea
pylint bug fix
Gfreely Jun 30, 2025
98471a4
fix Potential issue
Gfreely Jul 10, 2025
750d338
fix default num
Gfreely Jul 10, 2025
29ddeb1
fix spilt
Gfreely Jul 10, 2025
d2e846c
fix bug
Gfreely Jul 16, 2025
9530dfb
Update keyword_extract.py
Gfreely Jul 16, 2025
f994411
support regular expression
Gfreely Jul 16, 2025
5c66bff
Update keyword_extract.py
Gfreely Jul 16, 2025
f305f6c
Update keyword_extract.py
Gfreely Jul 16, 2025
78f9356
fix language bug
Gfreely Jul 16, 2025
777589e
pylint fix
Gfreely Jul 16, 2025
2da9054
python-igraph version
Gfreely Jul 22, 2025
79383bf
Merge remote-tracking branch 'origin/main' into test
Gfreely Jul 22, 2025
9aae252
fix pyproject
Gfreely Jul 22, 2025
8b4884c
Update pyproject.toml
Gfreely Jul 22, 2025
0131563
mark todo
Gfreely Jul 22, 2025
6b6bfe5
merge main branch
Gfreely Aug 6, 2025
960481a
Update keyword_extract.py
Gfreely Aug 12, 2025
108caa5
Update textrank
Gfreely Aug 18, 2025
9790469
fix bug and gitignore
Gfreely Aug 18, 2025
5975c57
fix bug
Gfreely Aug 18, 2025
d6c54ae
fix bug
Gfreely Aug 18, 2025
02520ba
fix bug
Gfreely Aug 19, 2025
59ad7ed
update hybrid method
Gfreely Aug 21, 2025
1053060
fix bug
Gfreely Aug 21, 2025
a7c3543
fix bug
Gfreely Aug 21, 2025
236094e
Merge branch 'main' into TextRank-fix
imbajin Aug 21, 2025
ece0fc1
update new version
Gfreely Aug 29, 2025
3c3f7bb
Merge remote-tracking branch 'origin/TextRank-fix' into test
Gfreely Aug 29, 2025
b7f4136
fix bug
Gfreely Aug 29, 2025
38064c3
fix bug
Gfreely Aug 29, 2025
4379456
Update keyword_extract.py
Gfreely Sep 1, 2025
61f91de
update language
Gfreely Sep 8, 2025
27b048e
update language
Gfreely Sep 8, 2025
66c7ea8
Update graph_rag_task.py
Gfreely Sep 8, 2025
00edd28
Update word_extract.py
Gfreely Sep 8, 2025
7f1ce87
fix bug
Gfreely Sep 12, 2025
f31a500
fix bug
Gfreely Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions hugegraph-llm/.gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
src/hugegraph_llm/resources/*
!/src/hugegraph_llm/resources/demo/*
!/src/hugegraph_llm/resources/nltk_data/*
!/src/hugegraph_llm/resources/prompt_examples/*
!/src/hugegraph_llm/resources/demo/
!/src/hugegraph_llm/resources/nltk_data/corpora/stopwords/
!/src/hugegraph_llm/resources/prompt_examples/


uv.lock
3 changes: 3 additions & 0 deletions hugegraph-llm/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ dependencies = [
"numpy",
"pandas",
"pydantic",
"scipy",
"python-igraph",

Comment on lines +42 to +44
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new dependencies scipy and python-igraph are added without version constraints in the main dependencies section, but they have constraints in the constraint-dependencies. This inconsistency could lead to dependency resolution issues.

Suggested change
"scipy",
"python-igraph",
# Add version constraints to main dependencies as well:
"scipy~=1.15.3",
"python-igraph~=0.11.9",

Did we get this right? 👍 / 👎 to inform future reviews.


# LLM specific dependencies
"openai",
Expand Down
5 changes: 5 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/api/models/rag_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class RAGRequest(BaseModel):
topk_return_results: int = Query(20, description="Number of sorted results to return finally.")
vector_dis_threshold: float = Query(0.9, description="Threshold for vector similarity\
(results greater than this will be ignored).")
extract_method: str = Query("Hybrid", description="Method to extract keywords from the text.")
topk_per_keyword: int = Query(1, description="TopK results returned for each keyword \
extracted from the query, by default only the most similar one is returned.")
client_config: Optional[GraphConfigRequest] = Query(None, description="hugegraph server config.")
Expand All @@ -56,6 +57,10 @@ class RAGRequest(BaseModel):
prompt.keywords_extract_prompt,
description="Prompt for extracting keywords from query.",
)
mask_words: Optional[str] = Query(
prompt.maskword_input_text,
description="Mask words to be protected during word segmentation.",
)
gremlin_tmpl_num: int = Query(1, description="Number of Gremlin templates to use.")
gremlin_prompt: Optional[str] = Query(
prompt.gremlin_generate_prompt,
Expand Down
5 changes: 4 additions & 1 deletion hugegraph-llm/src/hugegraph_llm/api/rag_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@
RerankerConfigRequest,
GraphRAGRequest,
)
from hugegraph_llm.config import huge_settings
from hugegraph_llm.api.models.rag_response import RAGResponse
from hugegraph_llm.config import huge_settings
from hugegraph_llm.config import llm_settings, prompt
from hugegraph_llm.utils.log import log


# pylint: disable=too-many-statements
def rag_http_api(
router: APIRouter,
Expand Down Expand Up @@ -60,9 +61,11 @@ def rag_answer_api(req: RAGRequest):
topk_return_results=req.topk_return_results,
vector_dis_threshold=req.vector_dis_threshold,
topk_per_keyword=req.topk_per_keyword,
keywords_extract_method=req.extract_method,
# Keep prompt params in the end
custom_related_information=req.custom_priority_info,
answer_prompt=req.answer_prompt or prompt.answer_prompt,
mask_words=req.mask_words or prompt.maskword_input_text,
keywords_extract_prompt=req.keywords_extract_prompt or prompt.keywords_extract_prompt,
gremlin_prompt=req.gremlin_prompt or prompt.gremlin_generate_prompt,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# specific language governing permissions and limitations
# under the License.

import sys
import os
import sys
from pathlib import Path

import yaml
Expand Down Expand Up @@ -49,6 +49,7 @@ class BasePromptConfig:
doc_input_text: str = ""
_language_generated: str = ""
generate_extract_prompt_template: str = ""
maskword_input_text: str = ""

def ensure_yaml_file_exists(self):
current_dir = Path.cwd().resolve()
Expand Down Expand Up @@ -117,6 +118,7 @@ def to_literal(val):
"doc_input_text": to_literal(self.doc_input_text),
"_language_generated": str(self.llm_settings.language).lower().strip(),
"generate_extract_prompt_template": to_literal(self.generate_extract_prompt_template),
"maskword_input_text": to_literal(self.maskword_input_text),
}
with open(yaml_file_path, "w", encoding="utf-8") as file:
yaml.dump(data, file, allow_unicode=True, sort_keys=False, default_flow_style=False)
Expand Down
6 changes: 6 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,4 +427,10 @@ def __init__(self, llm_config_object):
## Your Generated "Graph Extract Prompt Header":
## Language Requirement:
Please generate the prompt in {language} language.
"""

# Default mask patterns protected during word segmentation. Entries are
# comma-separated and each regular expression is enclosed in "/".
# In order: URLs, e-mail addresses, hyphen/apostrophe-joined words, and
# numbers containing a "," or "." separator.
maskword_input_text: str = r"""/https?://\S+|www\.\S+/,
/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/,
/\b\w+(?:[-’\']\w+)+\b/,
/\b\d+[,.]\d+\b/
"""
4 changes: 4 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

import argparse

import gradio as gr
import uvicorn
from fastapi import FastAPI, Depends, APIRouter
Expand Down Expand Up @@ -101,6 +102,7 @@ def init_rag_ui() -> gr.Interface:
textbox_answer_prompt_input,
textbox_keywords_extract_prompt_input,
textbox_custom_related_information,
textbox_textrank_mask_words,
) = create_rag_block()
with gr.Tab(label="3. Text2gremlin ⚙️"):
textbox_gremlin_inp, textbox_gremlin_schema, textbox_gremlin_prompt = (
Expand Down Expand Up @@ -129,6 +131,7 @@ def refresh_ui_config_prompt() -> tuple:
prompt.answer_prompt,
prompt.keywords_extract_prompt,
prompt.custom_rerank_info,
prompt.maskword_input_text,
prompt.default_question,
huge_settings.graph_name,
prompt.gremlin_generate_prompt,
Expand All @@ -149,6 +152,7 @@ def refresh_ui_config_prompt() -> tuple:
textbox_answer_prompt_input,
textbox_keywords_extract_prompt_input,
textbox_custom_related_information,
textbox_textrank_mask_words,
textbox_gremlin_inp,
textbox_gremlin_schema,
textbox_gremlin_prompt,
Expand Down
61 changes: 54 additions & 7 deletions hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,19 @@
# pylint: disable=E1101

import os
from typing import AsyncGenerator, Tuple, Literal, Optional
from typing import AsyncGenerator, Literal, Optional, Tuple

import gradio as gr
import pandas as pd
from gradio.utils import NamedString

from hugegraph_llm.config import resource_path, prompt, huge_settings, llm_settings
from hugegraph_llm.config import huge_settings, llm_settings, prompt, resource_path
from hugegraph_llm.operators.graph_rag_task import RAGPipeline
from hugegraph_llm.utils.decorators import with_task_id
from hugegraph_llm.operators.llm_op.answer_synthesize import AnswerSynthesize
from hugegraph_llm.utils.decorators import with_task_id
from hugegraph_llm.utils.log import log


def rag_answer(
text: str,
raw_answer: bool,
Expand All @@ -42,6 +43,9 @@ def rag_answer(
custom_related_information: str,
answer_prompt: str,
keywords_extract_prompt: str,
keywords_extract_method: str,
mask_words: str,
max_keywords_num: int = 5,
gremlin_tmpl_num: Optional[int] = -1,
gremlin_prompt: Optional[str] = None,
max_graph_items=30,
Expand All @@ -66,6 +70,7 @@ def rag_answer(
keywords_extract_prompt,
text,
vector_only_answer,
mask_words
)
if raw_answer is False and not vector_search and not graph_search:
gr.Warning("Please select at least one generate mode.")
Expand All @@ -75,7 +80,12 @@ def rag_answer(
if vector_search:
rag.query_vector_index()
if graph_search:
rag.extract_keywords(extract_template=keywords_extract_prompt).keywords_to_vid(
rag.extract_keywords(
extract_template=keywords_extract_prompt,
max_keywords=max_keywords_num,
extract_method=keywords_extract_method,
mask_words=mask_words
).keywords_to_vid(
vector_dis_threshold=vector_dis_threshold,
topk_per_keyword=topk_per_keyword,
).import_schema(huge_settings.graph_name).query_graphdb(
Expand Down Expand Up @@ -125,6 +135,7 @@ def update_ui_configs(
keywords_extract_prompt,
text,
vector_only_answer,
mask_words,
):
gremlin_prompt = gremlin_prompt or prompt.gremlin_generate_prompt
should_update_prompt = (
Expand All @@ -133,13 +144,15 @@ def update_ui_configs(
or prompt.keywords_extract_prompt != keywords_extract_prompt
or prompt.gremlin_generate_prompt != gremlin_prompt
or prompt.custom_rerank_info != custom_related_information
or prompt.maskword_input_text != mask_words
)
if should_update_prompt:
prompt.custom_rerank_info = custom_related_information
prompt.default_question = text
prompt.answer_prompt = answer_prompt
prompt.keywords_extract_prompt = keywords_extract_prompt
prompt.gremlin_generate_prompt = gremlin_prompt
prompt.maskword_input_text = mask_words
prompt.update_yaml_file()
vector_search = vector_only_answer or graph_vector_answer
graph_search = graph_only_answer or graph_vector_answer
Expand All @@ -157,6 +170,9 @@ async def rag_answer_streaming(
custom_related_information: str,
answer_prompt: str,
keywords_extract_prompt: str,
keywords_extract_method: str,
mask_words: str,
max_keywords_num: int = 5,
gremlin_tmpl_num: Optional[int] = -1,
gremlin_prompt: Optional[str] = None,
) -> AsyncGenerator[Tuple[str, str, str, str], None]:
Expand All @@ -177,6 +193,7 @@ async def rag_answer_streaming(
keywords_extract_prompt,
text,
vector_only_answer,
mask_words
)
if raw_answer is False and not vector_search and not graph_search:
gr.Warning("Please select at least one generate mode.")
Expand All @@ -187,7 +204,12 @@ async def rag_answer_streaming(
if vector_search:
rag.query_vector_index()
if graph_search:
rag.extract_keywords(extract_template=keywords_extract_prompt).keywords_to_vid().import_schema(
rag.extract_keywords(
extract_template=keywords_extract_prompt,
extract_method=keywords_extract_method,
mask_words=mask_words,
max_keywords=max_keywords_num
).keywords_to_vid().import_schema(
huge_settings.graph_name
).query_graphdb(
num_gremlin_generate_example=gremlin_tmpl_num,
Expand Down Expand Up @@ -261,7 +283,6 @@ def create_rag_block():
show_copy_button=True,
latex_delimiters=[{"left": "$", "right": "$", "display": False}],
)

answer_prompt_input = gr.Textbox(
value=prompt.answer_prompt, label="Query Prompt", show_copy_button=True, lines=7
)
Expand All @@ -271,6 +292,15 @@ def create_rag_block():
show_copy_button=True,
lines=7,
)
mask_words_input = gr.Textbox(
value=prompt.maskword_input_text,
label="TextRank mask words",
info=r"""Enter words or regular expressions to protect during word segmentation.
Separate items with a comma and enclose regular expressions in "/".
(Example: C++,/https?://\S+|www\.\S+/,//)""",
show_copy_button=True,
lines=7,
)

with gr.Column(scale=1):
with gr.Row():
Expand All @@ -279,6 +309,11 @@ def create_rag_block():
with gr.Row():
graph_only_radio = gr.Radio(choices=[True, False], value=True, label="Graph-only Answer")
graph_vector_radio = gr.Radio(choices=[True, False], value=False, label="Graph-Vector Answer")
with gr.Column():
with gr.Row():
extraction_method_input = gr.Radio(choices=["LLM", "Hybrid", "TextRank"],
value="Hybrid", label="Keywords Extraction Method")
max_keyword_num = gr.Number(value=5, label="Max Keywords Num", precision=5)

def toggle_slider(enable):
return gr.update(interactive=enable)
Expand Down Expand Up @@ -322,6 +357,9 @@ def toggle_slider(enable):
custom_related_information,
answer_prompt_input,
keywords_extract_prompt_input,
extraction_method_input,
mask_words_input,
max_keyword_num,
example_num,
],
outputs=[raw_out, vector_only_out, graph_only_out, graph_vector_out],
Expand Down Expand Up @@ -387,6 +425,9 @@ def several_rag_answer(
custom_related_information_ui: str,
answer_prompt: str,
keywords_extract_prompt: str,
keywords_extraction_method: str,
mask_words: str,
keyword_num: int,
answer_max_line_count_ui: int = 1,
progress=gr.Progress(track_tqdm=True),
):
Expand All @@ -406,6 +447,9 @@ def several_rag_answer(
custom_related_information_ui,
answer_prompt,
keywords_extract_prompt,
keywords_extraction_method,
mask_words,
keyword_num,
)
df.at[index, "Basic LLM Answer"] = basic_llm_answer
df.at[index, "Vector-only Answer"] = vector_only_answer
Expand Down Expand Up @@ -439,10 +483,13 @@ def several_rag_answer(
custom_related_information,
answer_prompt_input,
keywords_extract_prompt_input,
extraction_method_input,
mask_words_input,
max_keyword_num,
answer_max_line_count,
],
outputs=[qa_dataframe, gr.File(label="Download Answered File", min_width=40)],
)
questions_file.change(read_file_to_excel, questions_file, [qa_dataframe, answer_max_line_count])
answer_max_line_count.change(change_showing_excel, answer_max_line_count, qa_dataframe)
return inp, answer_prompt_input, keywords_extract_prompt_input, custom_related_information
return inp, answer_prompt_input, keywords_extract_prompt_input, custom_related_information, mask_words_input
17 changes: 17 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/operators/common_op/nltk_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,23 @@ def stopwords(self, lang: str = "chinese") -> List[str]:

return self._stopwords[lang]

def check_nltk_data(self):
    """Ensure the NLTK tokenizer and tagger resources are available locally.

    The data directory is read from the ``NLTK_DATA`` environment variable,
    falling back to ``self.get_cache_dir()``. It is appended to
    ``nltk.data.path`` when absent, and every resource that
    ``nltk.data.find`` cannot locate is fetched into it via ``nltk.download``.
    """
    fallback_dir = self.get_cache_dir()
    data_dir = os.environ.get("NLTK_DATA", fallback_dir)
    if data_dir not in nltk.data.path:
        nltk.data.path.append(data_dir)

    # Resource name -> NLTK category the resource is looked up under.
    resource_categories = {
        "punkt": "tokenizers",
        "punkt_tab": "tokenizers",
        "averaged_perceptron_tagger": "taggers",
        "averaged_perceptron_tagger_eng": "taggers",
    }
    for resource, category in resource_categories.items():
        try:
            nltk.data.find(f"{category}/{resource}")
        except LookupError:
            # Resource missing from every search path: download it.
            nltk.download(resource, download_dir=data_dir)

@staticmethod
def get_cache_dir() -> str:
"""Locate a platform-appropriate cache directory for hugegraph-llm,
Expand Down
Loading
Loading