Skip to content

Commit 2f9e211

Browse files
committed
Narrow the search space once a note is found in the text, and use the note identifier to fetch the specific note from the map
1 parent 694f0ed commit 2f9e211

File tree

1 file changed

+12
-9
lines changed

1 file changed

+12
-9
lines changed

grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,7 +1512,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
15121512
int clusterPage = Iterables.getLast(clusterTokens).getPage();
15131513

15141514
List<Note> notesSamePage = null;
1515-
List<Triple<String,String, OffsetPosition>> matchedLabelPosition = new ArrayList<>();
1515+
List<Triple<String, String, OffsetPosition>> matchedLabelPositions = new ArrayList<>();
15161516

15171517
// map the matched note labels to their corresponding note objects
15181518
Map<String, Note> labels2Notes = new TreeMap<>();
@@ -1530,20 +1530,23 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
15301530
// map a note label (string) to a valid matching position in the sequence of Layout Tokens
15311531
// of the paragraph segment
15321532

1533+
int start = 0;
15331534
for (Note note : notesSamePage) {
1534-
Optional<LayoutToken> matching = clusterTokens
1535+
List<LayoutToken> clusterReduced = clusterTokens.subList(start, clusterTokens.size());
1536+
Optional<LayoutToken> matching = clusterReduced
15351537
.stream()
15361538
.filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
15371539
.findFirst();
15381540

15391541
if (matching.isPresent()) {
1540-
int idx = clusterTokens.indexOf(matching.get());
1542+
int idx = clusterReduced.indexOf(matching.get()) + start;
15411543
note.setIgnored(true);
15421544
OffsetPosition matchingPosition = new OffsetPosition();
15431545
matchingPosition.start = idx;
15441546
matchingPosition.end = idx+1; // to be reviewed: the label might span more than one layout token
1545-
matchedLabelPosition.add(Triple.of(note.getLabel(), "note", matchingPosition));
1546-
labels2Notes.put(note.getLabel(), note);
1547+
start = matchingPosition.end;
1548+
matchedLabelPositions.add(Triple.of(note.getIdentifier(), "note", matchingPosition));
1549+
labels2Notes.put(note.getIdentifier(), note);
15471550
}
15481551
}
15491552

@@ -1555,7 +1558,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
15551558
.forEach(opu -> {
15561559
// We correct the latest token here, since later we will do a substring in the shared code,
15571560
// and we cannot add a +1 there.
1558-
matchedLabelPosition.add(
1561+
matchedLabelPositions.add(
15591562
Triple.of(LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens.subList(opu.start, opu.end)),
15601563
"url",
15611564
new OffsetPosition(opu.start, opu.end + 1)
@@ -1567,7 +1570,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
15671570
// We can add more elements to be extracted from the paragraphs here. Each labelPosition is a
15681571
// Triple with three main elements: the text of the item, the type, and the offsetPositions.
15691572

1570-
if (CollectionUtils.isEmpty(matchedLabelPosition)){
1573+
if (CollectionUtils.isEmpty(matchedLabelPositions)){
15711574
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
15721575
if (isNewParagraph(lastClusterLabel, curParagraph)) {
15731576
if (curParagraph != null && config.isWithSentenceSegmentation()) {
@@ -1617,7 +1620,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
16171620
}
16181621

16191622
// sort the matches by position
1620-
Collections.sort(matchedLabelPosition, (m1, m2) -> {
1623+
Collections.sort(matchedLabelPositions, (m1, m2) -> {
16211624
return m1.getRight().start - m2.getRight().start;
16221625
}
16231626
);
@@ -1626,7 +1629,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
16261629
int pos = 0;
16271630

16281631
// build the paragraph segment, match by match
1629-
for (Triple<String, String, OffsetPosition> referenceInformation : matchedLabelPosition) {
1632+
for (Triple<String, String, OffsetPosition> referenceInformation : matchedLabelPositions) {
16301633
String type = referenceInformation.getMiddle();
16311634
OffsetPosition matchingPosition = referenceInformation.getRight();
16321635

0 commit comments

Comments
 (0)