Skip to content

Commit 4fe0807

Browse files
authored
fix: improve citation logic (#578) bump:patch
1 parent 3bd19f3 commit 4fe0807

File tree

6 files changed

+110
-26
lines changed

6 files changed

+110
-26
lines changed

libs/kotaemon/kotaemon/indices/qa/citation_qa.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,11 +334,19 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
334334
highlight_text = ""
335335

336336
ss = sorted(ss, key=lambda x: x["start"])
337+
last_end = 0
337338
text = cur_doc.text[: ss[0]["start"]]
339+
338340
for idx, span in enumerate(ss):
339-
to_highlight = cur_doc.text[span["start"] : span["end"]]
340-
if len(to_highlight) > len(highlight_text):
341-
highlight_text = to_highlight
341+
# prevent overlap between consecutive spans
342+
span_start = max(last_end, span["start"])
343+
span_end = max(last_end, span["end"])
344+
345+
to_highlight = cur_doc.text[span_start:span_end]
346+
last_end = span_end
347+
348+
# append to highlight on PDF viewer
349+
highlight_text += (" " if highlight_text else "") + to_highlight
342350

343351
span_idx = span.get("idx", None)
344352
if span_idx is not None:
@@ -350,6 +358,7 @@ def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document
350358
)
351359
if idx < len(ss) - 1:
352360
text += cur_doc.text[span["end"] : ss[idx + 1]["start"]]
361+
353362
text += cur_doc.text[ss[-1]["end"] :]
354363
# add to display list
355364
with_citation.append(

libs/kotaemon/kotaemon/indices/qa/citation_qa_inline.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def answer_to_citations(self, answer) -> list[InlineEvidence]:
152152
def replace_citation_with_link(self, answer: str):
153153
# Define the regex pattern to match 【number】
154154
pattern = r"【\d+】"
155+
alternate_pattern = r"\[\d+\]"
155156

156157
# Regular expression to match merged citations
157158
multi_pattern = r"【([\d,\s]+)】"
@@ -166,19 +167,22 @@ def split_citations(match):
166167
answer = re.sub(multi_pattern, split_citations, answer)
167168

168169
# Find all citations in the answer
169-
matches = re.finditer(pattern, answer)
170+
matches = list(re.finditer(pattern, answer))
171+
if not matches:
172+
matches = list(re.finditer(alternate_pattern, answer))
170173

171174
matched_citations = set()
172175
for match in matches:
173176
citation = match.group()
174177
matched_citations.add(citation)
175178

176179
for citation in matched_citations:
180+
citation_id = citation[1:-1]
177181
answer = answer.replace(
178182
citation,
179183
(
180184
"<a href='#' class='citation' "
181-
f"id='mark-{citation[1:-1]}'>{citation}</a>"
185+
f"id='mark-{citation_id}'>{citation_id}</a>"
182186
),
183187
)
184188

libs/kotaemon/kotaemon/indices/qa/utils.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,38 @@ def find_text(search_span, context, min_length=5):
55
sentence_list = search_span.split("\n")
66
context = context.replace("\n", " ")
77

8-
matches = []
8+
matches_span = []
99
# don't search for small text
1010
if len(search_span) > min_length:
1111
for sentence in sentence_list:
12-
match = SequenceMatcher(
13-
None, sentence, context, autojunk=False
14-
).find_longest_match()
15-
if match.size > max(len(sentence) * 0.35, min_length):
16-
matches.append((match.b, match.b + match.size))
12+
match_results = SequenceMatcher(
13+
None,
14+
sentence,
15+
context,
16+
autojunk=False,
17+
).get_matching_blocks()
18+
19+
matched_blocks = []
20+
for _, start, length in match_results:
21+
if length > max(len(sentence) * 0.2, min_length):
22+
matched_blocks.append((start, start + length))
23+
24+
if matched_blocks:
25+
start_index = min(start for start, _ in matched_blocks)
26+
end_index = max(end for _, end in matched_blocks)
27+
length = end_index - start_index
28+
29+
if length > max(len(sentence) * 0.35, min_length):
30+
matches_span.append((start_index, end_index))
31+
32+
if matches_span:
33+
# merge all matches into one span
34+
final_span = min(start for start, _ in matches_span), max(
35+
end for _, end in matches_span
36+
)
37+
matches_span = [final_span]
1738

18-
return matches
39+
return matches_span
1940

2041

2142
def find_start_end_phrase(

libs/ktem/ktem/assets/css/main.css

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,6 @@ span.icon {
277277
}
278278

279279
pdfjs-viewer-element {
280-
height: 100vh;
281280
height: 100dvh;
282281
}
283282

@@ -290,9 +289,8 @@ pdfjs-viewer-element {
290289
left: 0;
291290
top: 0;
292291
width: 100%;
293-
height: 100%;
294-
overflow: auto;
295-
background-color: rgb(0, 0, 0);
292+
height: 85dvh;
293+
overflow: hidden;
296294
background-color: rgba(0, 0, 0, 0.4);
297295
}
298296

@@ -302,7 +300,7 @@ pdfjs-viewer-element {
302300

303301
.modal-content {
304302
background-color: #fefefe;
305-
height: 110%;
303+
height: 100%;
306304
display: flex;
307305
flex-direction: column;
308306
}
@@ -323,7 +321,7 @@ pdfjs-viewer-element {
323321

324322
.modal-body {
325323
flex: 1;
326-
overflow: auto;
324+
overflow: hidden;
327325
}
328326

329327
/* Switch checkbox styles */

libs/ktem/ktem/assets/js/main.js

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ function run() {
3232
globalThis.toggleChatColumn = (() => {
3333
/* get flex-grow value of chat_column */
3434
let flex_grow = conv_column.style.flexGrow;
35-
console.log("chat col", flex_grow);
3635
if (flex_grow == '0') {
3736
conv_column.style.flexGrow = '1';
3837
conv_column.style.minWidth = default_conv_column_min_width;
@@ -95,10 +94,24 @@ function run() {
9594
event.preventDefault(); // Prevent the default link behavior
9695
var citationId = event.target.getAttribute('id');
9796

98-
await sleep(100); // Sleep for 500 milliseconds
97+
await sleep(100); // Sleep for 100 milliseconds
98+
99+
// check if modal is open
100+
var modal = document.getElementById("pdf-modal");
99101
var citation = document.querySelector('mark[id="' + citationId + '"]');
100-
if (citation) {
101-
citation.scrollIntoView({ behavior: 'smooth' });
102+
103+
if (modal.style.display == "block") {
104+
// trigger on click event of PDF Preview link
105+
var detail_elem = citation;
106+
// traverse up the DOM tree to find the enclosing <details> element
107+
while (detail_elem.tagName.toLowerCase() != "details") {
108+
detail_elem = detail_elem.parentElement;
109+
}
110+
detail_elem.getElementsByClassName("pdf-link").item(0).click();
111+
} else {
112+
if (citation) {
113+
citation.scrollIntoView({ behavior: 'smooth' });
114+
}
102115
}
103116
}
104117
}

libs/ktem/ktem/assets/js/pdf_viewer.js

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,52 @@ function onBlockLoad () {
4343
modal.style.position = "fixed";
4444
modal.style.width = "70%";
4545
modal.style.left = "15%";
46+
modal.style.height = "100dvh";
4647
} else {
4748
modal.style.position = old_position;
4849
modal.style.width = old_width;
4950
modal.style.left = old_left;
51+
modal.style.height = "85dvh";
5052
}
5153
};
5254
}
5355

56+
globalThis.compareText = (search_phrase, page_label) => {
57+
var iframe = document.querySelector("#pdf-viewer").iframe;
58+
var innerDoc = (iframe.contentDocument) ? iframe.contentDocument : iframe.contentWindow.document;
59+
60+
var query_selector = (
61+
"#viewer > div[data-page-number='" +
62+
page_label +
63+
"'] > div.textLayer > span"
64+
);
65+
var page_spans = innerDoc.querySelectorAll(query_selector);
66+
for (var i = 0; i < page_spans.length; i++) {
67+
var span = page_spans[i];
68+
if (
69+
span.textContent.length > 4 &&
70+
(
71+
search_phrase.includes(span.textContent) ||
72+
span.textContent.includes(search_phrase)
73+
)
74+
) {
75+
span.innerHTML = "<span class='highlight selected'>" + span.textContent + "</span>";
76+
} else {
77+
// if span is already highlighted, remove it
78+
if (span.querySelector(".highlight")) {
79+
span.innerHTML = span.textContent;
80+
}
81+
}
82+
}
83+
}
84+
85+
// Sleep function using Promise and setTimeout
86+
function sleep(ms) {
87+
return new Promise(resolve => setTimeout(resolve, ms));
88+
}
89+
5490
// Function to open modal and display PDF
55-
globalThis.openModal = (event) => {
91+
globalThis.openModal = async (event) => {
5692
event.preventDefault();
5793
var target = event.currentTarget;
5894
var src = target.getAttribute("data-src");
@@ -66,8 +102,8 @@ function onBlockLoad () {
66102
if (current_src != src) {
67103
pdfViewer.setAttribute("src", src);
68104
}
69-
pdfViewer.setAttribute("phrase", phrase);
70-
pdfViewer.setAttribute("search", search);
105+
// pdfViewer.setAttribute("phrase", phrase);
106+
// pdfViewer.setAttribute("search", search);
71107
pdfViewer.setAttribute("page", page);
72108

73109
var scrollableDiv = document.getElementById("chat-info-panel");
@@ -80,6 +116,10 @@ function onBlockLoad () {
80116
info_panel.style.display = "none";
81117
}
82118
scrollableDiv.scrollTop = 0;
119+
120+
/* search for text inside PDF page */
121+
await sleep(500);
122+
compareText(search, page);
83123
}
84124

85125
globalThis.assignPdfOnclickEvent = () => {
@@ -93,7 +133,6 @@ function onBlockLoad () {
93133
var created_modal = document.getElementById("pdf-viewer");
94134
if (!created_modal) {
95135
createModal();
96-
console.log("Created modal")
97136
}
98137

99138
}

0 commit comments

Comments
 (0)