Skip to content

Commit 47823ea

Browse files
committed
fix analysis with splitting sentences
1 parent 4ba72f2 commit 47823ea

File tree

4 files changed

+237
-78
lines changed

4 files changed

+237
-78
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright (c) 2023 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.sudachi;
18+
19+
import java.io.IOException;
20+
import java.io.Reader;
21+
import java.nio.CharBuffer;
22+
23+
public class IOTools {
    private IOTools() {
        // static utility holder; no instances
    }

    /**
     * Reads from {@code reader} until the buffer is full or the stream is
     * exhausted. Some filtering readers (e.g. normalizing char filters) return
     * fewer characters per call than requested, so a single {@code read()} is
     * not enough to fill the buffer.
     *
     * @param reader
     *            input reader
     * @param result
     *            buffer to read into; characters are appended at its position
     * @return number of characters read, or -1 if the reader was already at EOF
     * @throws IOException
     *             when the underlying read operation fails
     */
    public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
        int total = 0;
        while (result.hasRemaining()) {
            int chunk = reader.read(result);
            if (chunk < 0) {
                // EOF: report -1 only when nothing at all could be read
                return total == 0 ? -1 : total;
            }
            total += chunk;
        }
        return total;
    }
}

src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java

Lines changed: 21 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import java.nio.CharBuffer;
2424
import java.util.ArrayList;
2525
import java.util.Collections;
26-
import java.util.Iterator;
2726
import java.util.List;
2827

2928
import javax.json.Json;
@@ -78,67 +77,41 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
7877
if (text.isEmpty()) {
7978
return Collections.emptyList();
8079
}
81-
UTF8InputText input = buildInputText(text);
82-
String normalized = input.getText();
83-
84-
ArrayList<MorphemeList> sentences = new ArrayList<>();
85-
SentenceDetector detector = new SentenceDetector();
86-
int bos = 0;
87-
int length;
88-
NonBreakChecker checker = new NonBreakChecker(input);
89-
checker.setBos(bos);
90-
while ((length = detector.getEos(normalized, checker)) != 0) {
91-
if (length < 0) {
92-
length = -length;
93-
}
94-
int eos = bos + length;
95-
if (eos < normalized.length()) {
96-
eos = input.getNextInOriginal(eos - 1);
97-
length = eos - bos;
80+
81+
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
82+
int length = analysis.tokenizeBuffer(text);
83+
ArrayList<MorphemeList> result = analysis.result;
84+
int bos = analysis.bos;
85+
if (length < 0) {
86+
// treat remaining thing as a single sentence
87+
int eos = analysis.input.getText().length();
88+
if (bos != eos) {
89+
UTF8InputText slice = analysis.input;
90+
if (bos != 0) {
91+
slice = slice.slice(bos, eos);
92+
}
93+
result.add(tokenizeSentence(mode, slice));
9894
}
99-
UTF8InputText sentence = input.slice(bos, eos);
100-
sentences.add(tokenizeSentence(mode, sentence));
101-
normalized = normalized.substring(length);
102-
bos = eos;
103-
checker.setBos(bos);
10495
}
105-
return sentences;
96+
return result;
10697
}
10798

10899
@Override
109100
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
110-
ArrayList<MorphemeList> sentences = new ArrayList<>();
111101
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
112-
SentenceDetector detector = new SentenceDetector();
102+
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
113103

114-
while (reader.read(buffer) > 0) {
104+
while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
115105
buffer.flip();
116-
117-
UTF8InputText input = buildInputText(buffer);
118-
String normalized = input.getText();
119-
120-
int bos = 0;
121-
int length;
122-
NonBreakChecker checker = new NonBreakChecker(input);
123-
checker.setBos(bos);
124-
while ((length = detector.getEos(normalized, checker)) > 0) {
125-
int eos = bos + length;
126-
if (eos < normalized.length()) {
127-
eos = input.getNextInOriginal(eos - 1);
128-
length = eos - bos;
129-
}
130-
UTF8InputText sentence = input.slice(bos, eos);
131-
sentences.add(tokenizeSentence(mode, sentence));
132-
normalized = normalized.substring(length);
133-
bos = eos;
134-
checker.setBos(bos);
135-
}
106+
int length = analysis.tokenizeBuffer(buffer);
136107
if (length < 0) {
137-
buffer.position(input.textIndexToOriginalTextIndex(bos));
108+
buffer.position(analysis.bosPosition());
138109
buffer.compact();
139110
}
140111
}
141112
buffer.flip();
113+
ArrayList<MorphemeList> sentences = analysis.result;
114+
142115
if (buffer.hasRemaining()) {
143116
sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
144117
}
@@ -313,34 +286,4 @@ JsonArrayBuilder pathToJson(List<LatticeNode> path, LatticeImpl lattice) {
313286
void disableEmptyMorpheme() {
314287
allowEmptyMorpheme = false;
315288
}
316-
317-
class NonBreakChecker implements SentenceDetector.NonBreakCheker {
318-
private final UTF8InputText input;
319-
private int bos;
320-
321-
NonBreakChecker(UTF8InputText input) {
322-
this.input = input;
323-
}
324-
325-
public void setBos(int bos) {
326-
this.bos = bos;
327-
}
328-
329-
@Override
330-
public boolean hasNonBreakWord(int length) {
331-
int byteEOS = input.getCodePointsOffsetLength(0, bos + length);
332-
byte[] bytes = input.getByteText();
333-
for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
334-
Iterator<int[]> iterator = lexicon.lookup(bytes, i);
335-
while (iterator.hasNext()) {
336-
int[] r = iterator.next();
337-
int l = r[1];
338-
if (l > byteEOS || (l == byteEOS && bos + length - input.modifiedOffset(i) > 1)) {
339-
return true;
340-
}
341-
}
342-
}
343-
return false;
344-
}
345-
}
346289
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Copyright (c) 2023 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.sudachi;
18+
19+
import com.worksap.nlp.sudachi.dictionary.LexiconSet;
20+
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
21+
22+
import java.util.ArrayList;
23+
import java.util.Iterator;
24+
25+
/**
 * Shared sentence-splitting state for {@link JapaneseTokenizer}: detects
 * sentence boundaries in a (possibly partial) input buffer and tokenizes each
 * complete sentence, accumulating results in {@link #result}. Implements the
 * non-break check so that a dictionary word spanning a candidate boundary
 * suppresses the split. (Interface name "NonBreakCheker" is sic — it comes
 * from SentenceDetector.)
 */
/*internal*/ class SentenceSplittingAnalysis implements SentenceDetector.NonBreakCheker {
    private final SentenceDetector detector = new SentenceDetector();

    private final Tokenizer.SplitMode mode;
    private final JapaneseTokenizer tokenizer;
    // Tokenized sentences accumulated across calls to tokenizeBuffer()
    final ArrayList<MorphemeList> result = new ArrayList<>();

    SentenceSplittingAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer) {
        this.mode = mode;
        this.tokenizer = tokenizer;
    }

    // State of the most recent tokenizeBuffer() call, read by the caller:
    // input — the built input text; bos — begin-of-sentence offset (in
    // normalized text) of the first not-yet-tokenized character.
    UTF8InputText input;
    int bos;

    /**
     * Splits {@code buffer} into sentences and tokenizes each complete one.
     *
     * Returns the last value produced by SentenceDetector.getEos: a
     * non-positive value when no further certain sentence end was found in the
     * remaining text (callers use a negative return to keep the unconsumed
     * tail for the next read), or the positive length of the final chunk when
     * the whole buffer was consumed as a forced sentence (see below).
     */
    int tokenizeBuffer(CharSequence buffer) {
        UTF8InputText input = tokenizer.buildInputText(buffer);
        String normalized = input.getText();
        this.input = input;

        int bos = 0;
        int length;

        this.bos = bos;
        // getEos returns the length of the next sentence (> 0), or a
        // non-positive value when no certain sentence end remains.
        while ((length = detector.getEos(normalized, this)) > 0) {
            int eos = bos + length;
            if (eos < normalized.length()) {
                // align the end to a boundary of the original (pre-normalization) text
                eos = input.getNextInOriginal(eos - 1);
                length = eos - bos;
            }
            UTF8InputText sentence = input.slice(bos, eos);
            result.add(tokenizer.tokenizeSentence(mode, sentence));
            // drop the consumed sentence and advance bos for the non-break check
            normalized = normalized.substring(length);
            bos = eos;
            this.bos = bos;
        }

        // buffer is full, need to clean it up
        // NOTE(review): -length is the remaining normalized length; it equals
        // buffer.length() only when nothing was consumed (bos == 0) and
        // normalization preserved length — i.e. the whole buffer is one
        // unfinished sentence, which is then force-tokenized as a sentence.
        if (length < 0 && buffer.length() == -length) {
            result.add(tokenizer.tokenizeSentence(mode, input));
            return -length;
        }

        return length;
    }

    /** Offset of {@link #bos} mapped back into the original (caller's) text. */
    int bosPosition() {
        return input.textIndexToOriginalTextIndex(bos);
    }

    @Override
    public boolean hasNonBreakWord(int length) {
        UTF8InputText inp = input;
        // byte offset (in UTF-8 text) of the candidate sentence end
        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
        byte[] bytes = inp.getByteText();
        LexiconSet lexicon = tokenizer.lexicon;
        // scan the last 64 bytes before the candidate boundary for a lexicon
        // entry that extends across it
        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
            while (iterator.hasNext()) {
                int[] r = iterator.next();
                int l = r[1]; // end offset of the matched word
                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
                    return true;
                }
            }
        }
        return false;
    }
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright (c) 2023 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.sudachi
18+
19+
import java.io.Reader
20+
import java.io.StringReader
21+
import kotlin.math.min
22+
import kotlin.test.Test
23+
import kotlin.test.assertEquals
24+
25+
class JapaneseTokenizerStreamingTest {
  private val tokenizer = TestDictionary.user0().create()

  /**
   * Reader that returns data in fixed-size [window] chunks, mimicking readers
   * that deliver fewer characters per call than requested.
   */
  class BadReader(private val data: String, private val window: Int = 512) : Reader() {

    private var position: Int = 0
    override fun read(cbuf: CharArray, off: Int, len: Int): Int {
      // mimic ICUNormalizer2CharFilter, but read in 512 char increments instead of 128 (by default)
      check(off >= 0)
      check(off < cbuf.size)
      check(len > 0)

      val dataLen = data.length
      val remaining = dataLen - position
      if (remaining == 0) {
        return -1
      }

      val toRead = min(min(window, remaining), len)
      data.toCharArray(cbuf, off, position, position + toRead)
      position += toRead
      return toRead
    }

    override fun close() {}
  }

  @Test
  fun streamingTest() {
    // NOTE(review): the repeated literal was "" (empty) in the extracted text,
    // which makes the 5000-char assertion impossible; restored to a single
    // Japanese character — confirm against the upstream commit.
    val reader = StringReader("あ".repeat(5000))
    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
    // sum of morpheme spans must cover every input character exactly once
    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
    assertEquals(5000, totalLength)
  }

  @Test
  fun streamingTestWithBadReader() {
    // same input, but delivered through a short-read reader to exercise
    // IOTools.readAsMuchAsCan in the streaming path
    val reader = BadReader("あ".repeat(5000))
    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
    assertEquals(5000, totalLength)
  }
}

0 commit comments

Comments
 (0)