
Commit 0da115c

Merge pull request #513 from funstory-ai/dev
prompt: Optimize prompt structure to improve prefix cache hit rate
2 parents 1f3e2f4 + e577e78 commit 0da115c
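
Providers with prefix caching, such as DeepSeek, which reports cache reuse through usage.prompt_cache_hit_tokens, can only reuse the longest run of tokens that is identical from the very first token of the prompt. Until now the per-batch material (context hints, glossary tables) was emitted before the fixed translation rules, so consecutive requests diverged almost immediately. This commit moves the static rules ahead of the dynamic hints and adds a cache_hit_prompt_token_count counter so the effect shows up in the logs. A minimal sketch of the idea (names are hypothetical, not BabelDOC's API):

```python
# A prefix cache matches requests token-by-token from the first token,
# so invariant instructions must precede per-request material.
STATIC_RULES = (
    "You are a translation engine.\n"
    "1. Do not translate style tags.\n"
    "2. Do not translate formula placeholders.\n"
)


def build_prompt(dynamic_hints: str, text: str) -> str:
    # Static rules first: every request shares one cacheable prefix.
    # Glossaries and the source text vary per batch, so they go last,
    # where a cache miss is unavoidable anyway.
    return STATIC_RULES + dynamic_hints + text
```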

File tree

9 files changed: +138 −108 lines


babeldoc/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.5.14"
+__version__ = "0.5.15"

babeldoc/const.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 import threading
 from pathlib import Path

-__version__ = "0.5.14"
+__version__ = "0.5.15"

 CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"

babeldoc/format/pdf/document_il/midend/automatic_term_extractor.py

Lines changed: 16 additions & 5 deletions
@@ -138,18 +138,24 @@ def calc_token_count(self, text: str) -> int:
         except Exception:
             return 0

-    def _snapshot_token_usage(self) -> tuple[int, int, int]:
+    def _snapshot_token_usage(self) -> tuple[int, int, int, int]:
         if not self.translate_engine:
-            return 0, 0, 0
+            return 0, 0, 0, 0
         token_counter = getattr(self.translate_engine, "token_count", None)
         prompt_counter = getattr(self.translate_engine, "prompt_token_count", None)
         completion_counter = getattr(
             self.translate_engine, "completion_token_count", None
         )
+        cache_hit_prompt_counter = getattr(
+            self.translate_engine, "cache_hit_prompt_token_count", None
+        )
         total_tokens = token_counter.value if token_counter else 0
         prompt_tokens = prompt_counter.value if prompt_counter else 0
         completion_tokens = completion_counter.value if completion_counter else 0
-        return total_tokens, prompt_tokens, completion_tokens
+        cache_hit_prompt_tokens = (
+            cache_hit_prompt_counter.value if cache_hit_prompt_counter else 0
+        )
+        return total_tokens, prompt_tokens, completion_tokens, cache_hit_prompt_tokens

     def _clean_json_output(self, llm_output: str) -> str:
         llm_output = llm_output.strip()

@@ -327,7 +333,9 @@ def extract_terms_from_paragraphs(

     def procress(self, doc_il: ILDocument):
         logger.info(f"{self.stage_name}: Starting term extraction for document.")
-        start_total, start_prompt, start_completion = self._snapshot_token_usage()
+        start_total, start_prompt, start_completion, start_cache_hit_prompt = (
+            self._snapshot_token_usage()
+        )
         tracker = DocumentTermExtractTracker()
         total = sum(len(page.pdf_paragraph) for page in doc_il.page)
         with self.translation_config.progress_monitor.stage_start(

@@ -341,11 +349,14 @@ def procress(self, doc_il: ILDocument):
                 self.process_page(page, executor, pbar, tracker.new_page())

         self.shared_context.finalize_auto_extracted_glossary()
-        end_total, end_prompt, end_completion = self._snapshot_token_usage()
+        end_total, end_prompt, end_completion, end_cache_hit_prompt = (
+            self._snapshot_token_usage()
+        )
         self.translation_config.record_term_extraction_usage(
             end_total - start_total,
             end_prompt - start_prompt,
             end_completion - start_completion,
+            end_cache_hit_prompt - start_cache_hit_prompt,
         )

         if self.translation_config.debug:
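
The hunks above snapshot every counter before the extraction run and again after it, then record only the deltas. A self-contained sketch of that accounting pattern (Counter stands in for BabelDOC's AtomicInteger; the numbers are invented):

```python
class Counter:
    """Stand-in for BabelDOC's AtomicInteger."""

    def __init__(self) -> None:
        self.value = 0

    def inc(self, n: int) -> None:
        self.value += n


total, prompt, completion, cache_hit = (Counter() for _ in range(4))


def snapshot() -> tuple[int, int, int, int]:
    # Read all counters at one point in time so the deltas stay consistent.
    return total.value, prompt.value, completion.value, cache_hit.value


start = snapshot()
prompt.inc(1200)     # simulated term-extraction traffic
completion.inc(80)
cache_hit.inc(1024)  # the portion of the prompt served from the cache
end = snapshot()
print([e - s for e, s in zip(end, start)])  # [0, 1200, 80, 1024]
```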

babeldoc/format/pdf/document_il/midend/il_translator.py

Lines changed: 32 additions & 33 deletions
@@ -890,6 +890,38 @@ def generate_prompt_for_llm(
             f"You are a professional and reliable machine translation engine responsible for translating the input text into {self.translation_config.lang_out}."
         ]

+        llm_input.append("When translating, please follow the following rules:")
+
+        rich_text_left_placeholder = (
+            self.translate_engine.get_rich_text_left_placeholder(1)
+        )
+        if isinstance(rich_text_left_placeholder, tuple):
+            rich_text_left_placeholder = rich_text_left_placeholder[0]
+        rich_text_right_placeholder = (
+            self.translate_engine.get_rich_text_right_placeholder(2)
+        )
+        if isinstance(rich_text_right_placeholder, tuple):
+            rich_text_right_placeholder = rich_text_right_placeholder[0]
+
+        # Create a structured prompt template for LLM translation
+        llm_input.append(
+            f'1. Do not translate style tags, such as "{rich_text_left_placeholder}xxx{rich_text_right_placeholder}"!'
+        )
+
+        formula_placeholder = self.translate_engine.get_formular_placeholder(3)
+        if isinstance(formula_placeholder, tuple):
+            formula_placeholder = formula_placeholder[0]
+
+        llm_input.append(
+            f'2. Do not translate formula placeholders, such as "{formula_placeholder}". The system will automatically replace the placeholders with the corresponding formulas.'
+        )
+        llm_input.append(
+            "3. If there is no need to translate (such as proper nouns, codes, etc.), then return the original text."
+        )
+        llm_input.append(
+            f"4. Only output the translation result in {self.translation_config.lang_out} without explanations and annotations."
+        )
+
         llm_context_hints = []

         if title_paragraph:

@@ -952,41 +984,8 @@ def generate_prompt_for_llm(
             for md_block in active_glossary_markdown_blocks:
                 llm_input.append(f"\n{md_block}\n")

-        llm_input.append("When translating, please follow the following rules:")
-
-        rich_text_left_placeholder = (
-            self.translate_engine.get_rich_text_left_placeholder(1)
-        )
-        if isinstance(rich_text_left_placeholder, tuple):
-            rich_text_left_placeholder = rich_text_left_placeholder[0]
-        rich_text_right_placeholder = (
-            self.translate_engine.get_rich_text_right_placeholder(2)
-        )
-        if isinstance(rich_text_right_placeholder, tuple):
-            rich_text_right_placeholder = rich_text_right_placeholder[0]
-
-        # Create a structured prompt template for LLM translation
-        llm_input.append(
-            f'1. Do not translate style tags, such as "{rich_text_left_placeholder}xxx{rich_text_right_placeholder}"!'
-        )
-
-        formula_placeholder = self.translate_engine.get_formular_placeholder(3)
-        if isinstance(formula_placeholder, tuple):
-            formula_placeholder = formula_placeholder[0]
-
-        llm_input.append(
-            f'2. Do not translate formula placeholders, such as "{formula_placeholder}". The system will automatically replace the placeholders with the corresponding formulas.'
-        )
-        llm_input.append(
-            "3. If there is no need to translate (such as proper nouns, codes, etc.), then return the original text."
-        )
-        llm_input.append(
-            f"4. Only output the translation result in {self.translation_config.lang_out} without explanations and annotations."
-        )
-        llm_input.append(f"5. Translate text into {self.translation_config.lang_out}.")
         prompt_template = f"""
 Now, please carefully read the following text to be translated and directly output your translation.\n\n{text}
-
 """
         llm_input.append(prompt_template)
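
The net effect of the two hunks: the numbered rules, previously emitted after the per-batch glossary tables, now sit directly behind the fixed system line, so consecutive requests share a much longer identical prefix (the redundant rule 5 is dropped outright, presumably because rule 4 already fixes the output language). A self-contained demonstration, using character counts as a rough proxy for tokens; none of these strings come from the repo:

```python
import os


def shared_prefix_len(a: str, b: str) -> int:
    # Length of the identical leading run -- a rough stand-in for the
    # token prefix a provider-side cache can reuse.
    return len(os.path.commonprefix([a, b]))


RULES = (
    "1. Do not translate style tags.\n"
    "2. Do not translate formula placeholders.\n"
    "3. Return the original text when no translation is needed.\n"
)
glossary_a = "### Glossary\n| GPU | ... |\n"  # varies per batch
glossary_b = "### Glossary\n| CPU | ... |\n"

# Old order: per-batch glossary first, so prefixes diverge at "GPU"/"CPU".
old = shared_prefix_len(glossary_a + RULES, glossary_b + RULES)
# New order: static rules first, so the whole rules block is shared too.
new = shared_prefix_len(RULES + glossary_a, RULES + glossary_b)
print(old, new)  # the new order shares len(RULES) more characters
```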

babeldoc/format/pdf/document_il/midend/il_translator_llm_only.py

Lines changed: 63 additions & 63 deletions
@@ -643,69 +643,6 @@ def translate_paragraph(
             "When translating, strictly follow the instructions below to ensure translation quality and preserve all formatting, tags, and placeholders:\n"
         )

-        # 2. ##Contextual Hints for Better Translation
-        contextual_hints_section: list[str] = []
-        hint_idx = 1
-        if title_paragraph:
-            contextual_hints_section.append(
-                f"{hint_idx}. First title in full text: {title_paragraph.unicode}"
-            )
-            hint_idx += 1
-
-        if local_title_paragraph:
-            is_different_from_global = True
-            if title_paragraph:
-                if local_title_paragraph.debug_id == title_paragraph.debug_id:
-                    is_different_from_global = False
-
-            if is_different_from_global:
-                contextual_hints_section.append(
-                    f"{hint_idx}. Most similar section title: {local_title_paragraph.unicode}"
-                )
-                hint_idx += 1
-
-        # --- ADD GLOSSARY HINTS ---
-        batch_text_for_glossary_matching = "\n".join(
-            item.get("input", "") for item in json_format_input
-        )
-
-        active_glossary_markdown_blocks: list[str] = []
-        # Use cached glossaries
-        if self._cached_glossaries:
-            for glossary in self._cached_glossaries:
-                # Get active entries for the current batch_text_for_glossary_matching
-                active_entries = glossary.get_active_entries_for_text(
-                    batch_text_for_glossary_matching
-                )
-
-                if active_entries:
-                    current_glossary_md_entries: list[str] = []
-                    for original_source, target_text in sorted(active_entries):
-                        current_glossary_md_entries.append(
-                            f"| {original_source} | {target_text} |"
-                        )
-
-                    if current_glossary_md_entries:
-                        glossary_table_md = (
-                            f"### Glossary: {glossary.name}\n\n"
-                            "| Source Term | Target Term |\n"
-                            "|-------------|-------------|\n"
-                            + "\n".join(current_glossary_md_entries)
-                        )
-                        active_glossary_markdown_blocks.append(glossary_table_md)
-
-        if contextual_hints_section or active_glossary_markdown_blocks:
-            llm_prompt_parts.append("\n## Contextual Hints for Better Translation")
-            llm_prompt_parts.extend(contextual_hints_section)
-
-            if active_glossary_markdown_blocks:
-                llm_prompt_parts.append(
-                    f"{hint_idx}. You MUST strictly adhere to the following glossaries. auto_extracted_glossary has a lower priority; please give preference to other glossaries. If a source term from a table appears in the text, use the corresponding target term in your translation:"
-                )
-                # hint_idx += 1 # No need to increment if tables are part of this point
-                for md_block in active_glossary_markdown_blocks:
-                    llm_prompt_parts.append(f"\n{md_block}\n")
-
         # 3. ## Strict Rules:
         llm_prompt_parts.append("\n## Strict Rules:")
         llm_prompt_parts.append(

@@ -771,6 +708,69 @@ def translate_paragraph(
         llm_prompt_parts.append("```")
         llm_prompt_parts.append("</example>")

+        # 2. ##Contextual Hints for Better Translation
+        contextual_hints_section: list[str] = []
+        hint_idx = 1
+        if title_paragraph:
+            contextual_hints_section.append(
+                f"{hint_idx}. First title in full text: {title_paragraph.unicode}"
+            )
+            hint_idx += 1
+
+        if local_title_paragraph:
+            is_different_from_global = True
+            if title_paragraph:
+                if local_title_paragraph.debug_id == title_paragraph.debug_id:
+                    is_different_from_global = False
+
+            if is_different_from_global:
+                contextual_hints_section.append(
+                    f"{hint_idx}. The most recent title is: {local_title_paragraph.unicode}"
+                )
+                hint_idx += 1
+
+        # --- ADD GLOSSARY HINTS ---
+        batch_text_for_glossary_matching = "\n".join(
+            item.get("input", "") for item in json_format_input
+        )
+
+        active_glossary_markdown_blocks: list[str] = []
+        # Use cached glossaries
+        if self._cached_glossaries:
+            for glossary in self._cached_glossaries:
+                # Get active entries for the current batch_text_for_glossary_matching
+                active_entries = glossary.get_active_entries_for_text(
+                    batch_text_for_glossary_matching
+                )
+
+                if active_entries:
+                    current_glossary_md_entries: list[str] = []
+                    for original_source, target_text in sorted(active_entries):
+                        current_glossary_md_entries.append(
+                            f"| {original_source} | {target_text} |"
+                        )
+
+                    if current_glossary_md_entries:
+                        glossary_table_md = (
+                            f"### Glossary: {glossary.name}\n\n"
+                            "| Source Term | Target Term |\n"
+                            "|-------------|-------------|\n"
+                            + "\n".join(current_glossary_md_entries)
+                        )
+                        active_glossary_markdown_blocks.append(glossary_table_md)
+
+        if contextual_hints_section or active_glossary_markdown_blocks:
+            llm_prompt_parts.append("\n## Contextual Hints for Better Translation")
+            llm_prompt_parts.extend(contextual_hints_section)
+
+            if active_glossary_markdown_blocks:
+                llm_prompt_parts.append(
+                    f"{hint_idx}. You MUST strictly adhere to the following glossaries. please give preference to other glossaries. If a source term from a table appears in the text, use the corresponding target term in your translation:"
+                )
+                # hint_idx += 1 # No need to increment if tables are part of this point
+                for md_block in active_glossary_markdown_blocks:
+                    llm_prompt_parts.append(f"\n{md_block}\n")
+
         # 6. ## Here is the input:
         llm_prompt_parts.append("\n## Here is the input:")

babeldoc/format/pdf/translation_config.py

Lines changed: 6 additions & 0 deletions
@@ -321,6 +321,7 @@ def __init__(
             "total_tokens": 0,
             "prompt_tokens": 0,
             "completion_tokens": 0,
+            "cache_hit_prompt_tokens": 0,
         }

         if self.ocr_workaround:

@@ -437,6 +438,7 @@ def record_term_extraction_usage(
         total_tokens: int,
         prompt_tokens: int,
        completion_tokens: int,
+        cache_hit_prompt_tokens: int,
     ) -> None:
         """Accumulate token usage for automatic term extraction."""
         if total_tokens > 0:

@@ -445,6 +447,10 @@ def record_term_extraction_usage(
             self.term_extraction_token_usage["prompt_tokens"] += prompt_tokens
         if completion_tokens > 0:
             self.term_extraction_token_usage["completion_tokens"] += completion_tokens
+        if cache_hit_prompt_tokens > 0:
+            self.term_extraction_token_usage["cache_hit_prompt_tokens"] += (
+                cache_hit_prompt_tokens
+            )


 class TranslateResult:

babeldoc/main.py

Lines changed: 12 additions & 3 deletions
@@ -26,7 +26,7 @@
 from babeldoc.translator.translator import set_translate_rate_limiter

 logger = logging.getLogger(__name__)
-__version__ = "0.5.14"
+__version__ = "0.5.15"


 def create_parser():

@@ -636,6 +636,7 @@ async def main():
     total_term_extraction_total_tokens = 0
     total_term_extraction_prompt_tokens = 0
     total_term_extraction_completion_tokens = 0
+    total_term_extraction_cache_hit_prompt_tokens = 0

     for file in pending_files:
         # Clean the file path, stripping the quotes at both ends

@@ -721,21 +722,29 @@ def nop(_x):
             total_term_extraction_total_tokens += usage["total_tokens"]
             total_term_extraction_prompt_tokens += usage["prompt_tokens"]
             total_term_extraction_completion_tokens += usage["completion_tokens"]
+            total_term_extraction_cache_hit_prompt_tokens += usage[
+                "cache_hit_prompt_tokens"
+            ]
         logger.info(f"Total tokens: {translator.token_count.value}")
         logger.info(f"Prompt tokens: {translator.prompt_token_count.value}")
         logger.info(f"Completion tokens: {translator.completion_token_count.value}")
         logger.info(
-            "Term extraction tokens: total=%s prompt=%s completion=%s",
+            f"Cache hit prompt tokens: {translator.cache_hit_prompt_token_count.value}"
+        )
+        logger.info(
+            "Term extraction tokens: total=%s prompt=%s completion=%s cache_hit_prompt=%s",
             total_term_extraction_total_tokens,
             total_term_extraction_prompt_tokens,
             total_term_extraction_completion_tokens,
+            total_term_extraction_cache_hit_prompt_tokens,
         )
         if term_extraction_translator is not translator:
             logger.info(
-                "Term extraction translator raw tokens: total=%s prompt=%s completion=%s",
+                "Term extraction translator raw tokens: total=%s prompt=%s completion=%s cache_hit_prompt=%s",
                 term_extraction_translator.token_count.value,
                 term_extraction_translator.prompt_token_count.value,
                 term_extraction_translator.completion_token_count.value,
+                term_extraction_translator.cache_hit_prompt_token_count.value,
             )
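
With the two new log lines, the cache's effect is measurable per run; the hit rate is one division away. A sketch with invented values:

```python
# Values as they might appear in the two log lines above (invented):
prompt_tokens = 52_340            # "Prompt tokens: ..."
cache_hit_prompt_tokens = 31_776  # "Cache hit prompt tokens: ..."
print(f"prefix cache hit rate: {cache_hit_prompt_tokens / prompt_tokens:.1%}")
# prefix cache hit rate: 60.7%
```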

babeldoc/translator/translator.py

Lines changed: 5 additions & 0 deletions
@@ -249,6 +249,7 @@ def __init__(
         self.token_count = AtomicInteger()
         self.prompt_token_count = AtomicInteger()
         self.completion_token_count = AtomicInteger()
+        self.cache_hit_prompt_token_count = AtomicInteger()

     @retry(
         retry=retry_if_exception_type(openai.RateLimitError),

@@ -338,6 +339,10 @@ def update_token_count(self, response):
                 self.prompt_token_count.inc(response.usage.prompt_tokens)
             if response.usage and response.usage.completion_tokens:
                 self.completion_token_count.inc(response.usage.completion_tokens)
+            if response.usage and (
+                hit_count := getattr(response.usage, "prompt_cache_hit_tokens", 0)
+            ):
+                self.cache_hit_prompt_token_count.inc(hit_count)
         except Exception as e:
             logger.exception("Error updating token count")
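
prompt_cache_hit_tokens is the usage field DeepSeek's API uses to report cache reuse; other OpenAI-compatible backends may omit it entirely (OpenAI itself nests cached counts under prompt_tokens_details.cached_tokens instead), which is why the diff reads it with a defensive getattr. A stand-in demonstration of the guarded read, with no real client involved:

```python
from types import SimpleNamespace

# Fake responses: one DeepSeek-style with the field, one without it.
with_cache = SimpleNamespace(
    usage=SimpleNamespace(
        prompt_tokens=1200,
        completion_tokens=80,
        prompt_cache_hit_tokens=1024,  # tokens reused from the prefix cache
    )
)
without_cache = SimpleNamespace(
    usage=SimpleNamespace(prompt_tokens=1200, completion_tokens=80)
)

for response in (with_cache, without_cache):
    if response.usage and (
        hit_count := getattr(response.usage, "prompt_cache_hit_tokens", 0)
    ):
        print(f"cache hit: {hit_count} / {response.usage.prompt_tokens}")
# Only the first response prints; the second falls back to the 0 default.
```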

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 [project]
 name = "BabelDOC"
-version = "0.5.14"
+version = "0.5.15"
 description = "Yet Another Document Translator"
 license = "AGPL-3.0"
 readme = "README.md"

@@ -162,7 +162,7 @@ pythonpath = [".", "src"]
 testpaths = ["tests"]

 [bumpver]
-current_version = "0.5.14"
+current_version = "0.5.15"
 version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"

 [bumpver.file_patterns]
