@@ -1512,7 +1512,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
1512
1512
int clusterPage = Iterables .getLast (clusterTokens ).getPage ();
1513
1513
1514
1514
List <Note > notesSamePage = null ;
1515
- List <Triple <String ,String , OffsetPosition >> matchedLabelPosition = new ArrayList <>();
1515
+ List <Triple <String , String , OffsetPosition >> matchedLabelPositions = new ArrayList <>();
1516
1516
1517
1517
// map the matched note labels to their corresponding note objects
1518
1518
Map <String , Note > labels2Notes = new TreeMap <>();
@@ -1530,20 +1530,23 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
1530
1530
// map a note label (string) to a valid matching position in the sequence of Layout Tokens
1531
1531
// of the paragraph segment
1532
1532
1533
+ int start = 0 ;
1533
1534
for (Note note : notesSamePage ) {
1534
- Optional <LayoutToken > matching = clusterTokens
1535
+ List <LayoutToken > clusterReduced = clusterTokens .subList (start , clusterTokens .size ());
1536
+ Optional <LayoutToken > matching = clusterReduced
1535
1537
.stream ()
1536
1538
.filter (t -> t .getText ().equals (note .getLabel ()) && t .isSuperscript ())
1537
1539
.findFirst ();
1538
1540
1539
1541
if (matching .isPresent ()) {
1540
- int idx = clusterTokens .indexOf (matching .get ());
1542
+ int idx = clusterReduced .indexOf (matching .get ()) + start ;
1541
1543
note .setIgnored (true );
1542
1544
OffsetPosition matchingPosition = new OffsetPosition ();
1543
1545
matchingPosition .start = idx ;
1544
1546
matchingPosition .end = idx +1 ; // to be reviewed: the match might span more than one layout token
1545
- matchedLabelPosition .add (Triple .of (note .getLabel (), "note" , matchingPosition ));
1546
- labels2Notes .put (note .getLabel (), note );
1547
+ start = matchingPosition .end ;
1548
+ matchedLabelPositions .add (Triple .of (note .getIdentifier (), "note" , matchingPosition ));
1549
+ labels2Notes .put (note .getIdentifier (), note );
1547
1550
}
1548
1551
}
1549
1552
@@ -1555,7 +1558,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
1555
1558
.forEach (opu -> {
1556
1559
// We correct the latest token here, since later we will do a substring in the shared code,
1557
1560
// and we cannot add a +1 there.
1558
- matchedLabelPosition .add (
1561
+ matchedLabelPositions .add (
1559
1562
Triple .of (LayoutTokensUtil .normalizeDehyphenizeText (clusterTokens .subList (opu .start , opu .end )),
1560
1563
"url" ,
1561
1564
new OffsetPosition (opu .start , opu .end + 1 )
@@ -1567,7 +1570,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
1567
1570
// We can add more elements to be extracted from the paragraphs here. Each labelPosition is a
1568
1571
// Triple with three main elements: the text of the item, the type, and the offsetPositions.
1569
1572
1570
- if (CollectionUtils .isEmpty (matchedLabelPosition )){
1573
+ if (CollectionUtils .isEmpty (matchedLabelPositions )){
1571
1574
String clusterContent = LayoutTokensUtil .normalizeDehyphenizeText (clusterTokens );
1572
1575
if (isNewParagraph (lastClusterLabel , curParagraph )) {
1573
1576
if (curParagraph != null && config .isWithSentenceSegmentation ()) {
@@ -1617,7 +1620,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
1617
1620
}
1618
1621
1619
1622
// sort the matches by position
1620
- Collections .sort (matchedLabelPosition , (m1 , m2 ) -> {
1623
+ Collections .sort (matchedLabelPositions , (m1 , m2 ) -> {
1621
1624
return m1 .getRight ().start - m2 .getRight ().start ;
1622
1625
}
1623
1626
);
@@ -1626,7 +1629,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
1626
1629
int pos = 0 ;
1627
1630
1628
1631
// build the paragraph segment, match by match
1629
- for (Triple <String , String , OffsetPosition > referenceInformation : matchedLabelPosition ) {
1632
+ for (Triple <String , String , OffsetPosition > referenceInformation : matchedLabelPositions ) {
1630
1633
String type = referenceInformation .getMiddle ();
1631
1634
OffsetPosition matchingPosition = referenceInformation .getRight ();
1632
1635
0 commit comments