Skip to content

Commit 47823ea

Browse files
committed
fix analysis with splitting sentences
1 parent 4ba72f2 commit 47823ea

File tree

4 files changed

+237
-78
lines changed

4 files changed

+237
-78
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright (c) 2023 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.sudachi;
18+
19+
import java.io.IOException;
20+
import java.io.Reader;
21+
import java.nio.CharBuffer;
22+
23+
public class IOTools {
    private IOTools() {
        // static utility holder; no instances
    }

    /**
     * Reads from {@code reader} until the buffer is full or the stream is
     * exhausted. Some filtering readers (e.g. normalizing char filters) return
     * fewer characters per call than requested, so a single {@code read()} is
     * not enough to fill the buffer.
     *
     * @param reader
     *            input reader
     * @param result
     *            buffer to read into; characters are appended at its position
     * @return number of characters read, or -1 if the reader was already at EOF
     * @throws IOException
     *             when the underlying read operation fails
     */
    public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
        int total = 0;
        while (result.hasRemaining()) {
            int chunk = reader.read(result);
            if (chunk < 0) {
                // EOF: report -1 only when nothing at all could be read
                return total == 0 ? -1 : total;
            }
            total += chunk;
        }
        return total;
    }
}

src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java

Lines changed: 21 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import java.nio.CharBuffer;
2424
import java.util.ArrayList;
2525
import java.util.Collections;
26-
import java.util.Iterator;
2726
import java.util.List;
2827

2928
import javax.json.Json;
@@ -78,67 +77,41 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
7877
if (text.isEmpty()) {
7978
return Collections.emptyList();
8079
}
81-
UTF8InputText input = buildInputText(text);
82-
String normalized = input.getText();
83-
84-
ArrayList<MorphemeList> sentences = new ArrayList<>();
85-
SentenceDetector detector = new SentenceDetector();
86-
int bos = 0;
87-
int length;
88-
NonBreakChecker checker = new NonBreakChecker(input);
89-
checker.setBos(bos);
90-
while ((length = detector.getEos(normalized, checker)) != 0) {
91-
if (length < 0) {
92-
length = -length;
93-
}
94-
int eos = bos + length;
95-
if (eos < normalized.length()) {
96-
eos = input.getNextInOriginal(eos - 1);
97-
length = eos - bos;
80+
81+
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
82+
int length = analysis.tokenizeBuffer(text);
83+
ArrayList<MorphemeList> result = analysis.result;
84+
int bos = analysis.bos;
85+
if (length < 0) {
86+
// treat remaining thing as a single sentence
87+
int eos = analysis.input.getText().length();
88+
if (bos != eos) {
89+
UTF8InputText slice = analysis.input;
90+
if (bos != 0) {
91+
slice = slice.slice(bos, eos);
92+
}
93+
result.add(tokenizeSentence(mode, slice));
9894
}
99-
UTF8InputText sentence = input.slice(bos, eos);
100-
sentences.add(tokenizeSentence(mode, sentence));
101-
normalized = normalized.substring(length);
102-
bos = eos;
103-
checker.setBos(bos);
10495
}
105-
return sentences;
96+
return result;
10697
}
10798

10899
@Override
109100
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
110-
ArrayList<MorphemeList> sentences = new ArrayList<>();
111101
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
112-
SentenceDetector detector = new SentenceDetector();
102+
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
113103

114-
while (reader.read(buffer) > 0) {
104+
while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
115105
buffer.flip();
116-
117-
UTF8InputText input = buildInputText(buffer);
118-
String normalized = input.getText();
119-
120-
int bos = 0;
121-
int length;
122-
NonBreakChecker checker = new NonBreakChecker(input);
123-
checker.setBos(bos);
124-
while ((length = detector.getEos(normalized, checker)) > 0) {
125-
int eos = bos + length;
126-
if (eos < normalized.length()) {
127-
eos = input.getNextInOriginal(eos - 1);
128-
length = eos - bos;
129-
}
130-
UTF8InputText sentence = input.slice(bos, eos);
131-
sentences.add(tokenizeSentence(mode, sentence));
132-
normalized = normalized.substring(length);
133-
bos = eos;
134-
checker.setBos(bos);
135-
}
106+
int length = analysis.tokenizeBuffer(buffer);
136107
if (length < 0) {
137-
buffer.position(input.textIndexToOriginalTextIndex(bos));
108+
buffer.position(analysis.bosPosition());
138109
buffer.compact();
139110
}
140111
}
141112
buffer.flip();
113+
ArrayList<MorphemeList> sentences = analysis.result;
114+
142115
if (buffer.hasRemaining()) {
143116
sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
144117
}
@@ -313,34 +286,4 @@ JsonArrayBuilder pathToJson(List<LatticeNode> path, LatticeImpl lattice) {
313286
void disableEmptyMorpheme() {
314287
allowEmptyMorpheme = false;
315288
}
316-
317-
class NonBreakChecker implements SentenceDetector.NonBreakCheker {
318-
private final UTF8InputText input;
319-
private int bos;
320-
321-
NonBreakChecker(UTF8InputText input) {
322-
this.input = input;
323-
}
324-
325-
public void setBos(int bos) {
326-
this.bos = bos;
327-
}
328-
329-
@Override
330-
public boolean hasNonBreakWord(int length) {
331-
int byteEOS = input.getCodePointsOffsetLength(0, bos + length);
332-
byte[] bytes = input.getByteText();
333-
for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
334-
Iterator<int[]> iterator = lexicon.lookup(bytes, i);
335-
while (iterator.hasNext()) {
336-
int[] r = iterator.next();
337-
int l = r[1];
338-
if (l > byteEOS || (l == byteEOS && bos + length - input.modifiedOffset(i) > 1)) {
339-
return true;
340-
}
341-
}
342-
}
343-
return false;
344-
}
345-
}
346289
}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* Copyright (c) 2023 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.sudachi;
18+
19+
import com.worksap.nlp.sudachi.dictionary.LexiconSet;
20+
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
21+
22+
import java.util.ArrayList;
23+
import java.util.Iterator;
24+
25+
/**
 * Shared sentence-splitting state for {@link JapaneseTokenizer}: detects
 * sentence boundaries in a (possibly partial) input buffer and tokenizes each
 * complete sentence, accumulating results in {@link #result}. Implements the
 * non-break check so that a dictionary word spanning a candidate boundary
 * suppresses the split. (Interface name "NonBreakCheker" is sic — it comes
 * from SentenceDetector.)
 */
/*internal*/ class SentenceSplittingAnalysis implements SentenceDetector.NonBreakCheker {
    private final SentenceDetector detector = new SentenceDetector();

    private final Tokenizer.SplitMode mode;
    private final JapaneseTokenizer tokenizer;
    // Tokenized sentences accumulated across calls to tokenizeBuffer()
    final ArrayList<MorphemeList> result = new ArrayList<>();

    SentenceSplittingAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer) {
        this.mode = mode;
        this.tokenizer = tokenizer;
    }

    // State of the most recent tokenizeBuffer() call, read by the caller:
    // input — the built input text; bos — begin-of-sentence offset (in
    // normalized text) of the first not-yet-tokenized character.
    UTF8InputText input;
    int bos;

    /**
     * Splits {@code buffer} into sentences and tokenizes each complete one.
     *
     * Returns the last value produced by SentenceDetector.getEos: a
     * non-positive value when no further certain sentence end was found in the
     * remaining text (callers use a negative return to keep the unconsumed
     * tail for the next read), or the positive length of the final chunk when
     * the whole buffer was consumed as a forced sentence (see below).
     */
    int tokenizeBuffer(CharSequence buffer) {
        UTF8InputText input = tokenizer.buildInputText(buffer);
        String normalized = input.getText();
        this.input = input;

        int bos = 0;
        int length;

        this.bos = bos;
        // getEos returns the length of the next sentence (> 0), or a
        // non-positive value when no certain sentence end remains.
        while ((length = detector.getEos(normalized, this)) > 0) {
            int eos = bos + length;
            if (eos < normalized.length()) {
                // align the end to a boundary of the original (pre-normalization) text
                eos = input.getNextInOriginal(eos - 1);
                length = eos - bos;
            }
            UTF8InputText sentence = input.slice(bos, eos);
            result.add(tokenizer.tokenizeSentence(mode, sentence));
            // drop the consumed sentence and advance bos for the non-break check
            normalized = normalized.substring(length);
            bos = eos;
            this.bos = bos;
        }

        // buffer is full, need to clean it up
        // NOTE(review): -length is the remaining normalized length; it equals
        // buffer.length() only when nothing was consumed (bos == 0) and
        // normalization preserved length — i.e. the whole buffer is one
        // unfinished sentence, which is then force-tokenized as a sentence.
        if (length < 0 && buffer.length() == -length) {
            result.add(tokenizer.tokenizeSentence(mode, input));
            return -length;
        }

        return length;
    }

    /** Offset of {@link #bos} mapped back into the original (caller's) text. */
    int bosPosition() {
        return input.textIndexToOriginalTextIndex(bos);
    }

    @Override
    public boolean hasNonBreakWord(int length) {
        UTF8InputText inp = input;
        // byte offset (in UTF-8 text) of the candidate sentence end
        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
        byte[] bytes = inp.getByteText();
        LexiconSet lexicon = tokenizer.lexicon;
        // scan the last 64 bytes before the candidate boundary for a lexicon
        // entry that extends across it
        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
            while (iterator.hasNext()) {
                int[] r = iterator.next();
                int l = r[1]; // end offset of the matched word
                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
                    return true;
                }
            }
        }
        return false;
    }
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Copyright (c) 2023 Works Applications Co., Ltd.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.worksap.nlp.sudachi
18+
19+
import java.io.Reader
20+
import java.io.StringReader
21+
import kotlin.math.min
22+
import kotlin.test.Test
23+
import kotlin.test.assertEquals
24+
25+
class JapaneseTokenizerStreamingTest {
  private val tokenizer = TestDictionary.user0().create()

  /**
   * Reader that returns data in fixed-size [window] chunks, mimicking readers
   * that deliver fewer characters per call than requested.
   */
  class BadReader(private val data: String, private val window: Int = 512) : Reader() {

    private var position: Int = 0
    override fun read(cbuf: CharArray, off: Int, len: Int): Int {
      // mimic ICUNormalizer2CharFilter, but read in 512 char increments instead of 128 (by default)
      check(off >= 0)
      check(off < cbuf.size)
      check(len > 0)

      val dataLen = data.length
      val remaining = dataLen - position
      if (remaining == 0) {
        return -1
      }

      val toRead = min(min(window, remaining), len)
      data.toCharArray(cbuf, off, position, position + toRead)
      position += toRead
      return toRead
    }

    override fun close() {}
  }

  @Test
  fun streamingTest() {
    // NOTE(review): the repeated literal was "" (empty) in the extracted text,
    // which makes the 5000-char assertion impossible; restored to a single
    // Japanese character — confirm against the upstream commit.
    val reader = StringReader("あ".repeat(5000))
    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
    // sum of morpheme spans must cover every input character exactly once
    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
    assertEquals(5000, totalLength)
  }

  @Test
  fun streamingTestWithBadReader() {
    // same input, but delivered through a short-read reader to exercise
    // IOTools.readAsMuchAsCan in the streaming path
    val reader = BadReader("あ".repeat(5000))
    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
    assertEquals(5000, totalLength)
  }
}

0 commit comments

Comments
 (0)